import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time
import spaces
# --- Configuration ---
# Hub repo ID; the org prefix "naver-hyperclovax/" is assumed here so that
# from_pretrained can resolve the model on the Hugging Face Hub.
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4  # adjust as needed
# --- Optional: CPU thread settings ---
# torch.set_num_threads(CPU_THREAD_COUNT)
# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)

print("--- Environment ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
print(f"Torch threads: {torch.get_num_threads()}")

# --- Load model and tokenizer ---
print(f"--- Loading model: {MODEL_ID} ---")
print("The first run may take a few minutes...")
model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []  # initialize stop_token_ids_list
try:
    start_load_time = time.time()
    # Pick device placement and dtype based on the available hardware
    device_map = "auto" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
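    # float16 roughly halves weight memory and bandwidth on GPU; float32 stays
    # the default on CPU, where half-precision kernel support is limited.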
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        device_map=device_map,
        trust_remote_code=True
    )
    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and tokenizer loaded in {load_time:.2f}s ---")
    load_successful = True
    # --- Stop token setup ---
    stop_token_strings = ["</s>", "<|endoftext|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None; it cannot be added as a stop token.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]

    if not stop_token_ids_list:
        print("Warning: no stop token IDs found. Falling back to the default EOS if available; otherwise generation may not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: no stop token found, not even a default EOS. Generation may run indefinitely.")

    print(f"Stop token IDs in use: {stop_token_ids_list}")
except Exception as e:
    print(f"!!! Model loading error: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    raise gr.Error(f"Failed to load model {MODEL_ID}; the application cannot start. Error: {e}")
# --- System prompt ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- Today is {current_date}.\n"
        f"- Answer the user's questions kindly and in detail, in Korean."
    )
# --- Warmup function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: the model did not load successfully.")
        return

    print("--- Starting model warmup ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "Hello"
        # Build the input in the simple Human/Assistant prompt format used
        # throughout this app, with the system prompt prepended
        system_prompt = get_system_prompt()
        prompt = f"{system_prompt}\nHuman: {warmup_message}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Handle the case where the stop token list is empty
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup warning: no stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model warmup finished in {warmup_time:.2f}s ---")
    except Exception as e:
        print(f"!!! Error during model warmup: {e}")
    finally:
        gc.collect()
# --- Inference function ---
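# ZeroGPU note: the `spaces` import and the Space's "Running on Zero" badge
# suggest ZeroGPU hardware, where GPU work must run inside a function
# decorated with @spaces.GPU (an assumption here; the decorator is designed
# to be a no-op outside ZeroGPU).
@spaces.GPU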
def predict(message, history):
    """
    Generates a response using the HyperCLOVAX-SEED-Vision-Instruct-3B model.
    `history` may arrive as Gradio tuple pairs or as 'messages'-style dicts.
    """
    if model is None or tokenizer is None:
        return "Error: the model is not loaded."

    # Flatten the conversation history, accepting both history formats
    history_text = ""
    if isinstance(history, list):
        for turn in history:
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                prefix = "Human" if turn["role"] == "user" else "Assistant"
                history_text += f"{prefix}: {turn['content']}\n"
            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
                history_text += f"Human: {turn[0]}\nAssistant: {turn[1]}\n"

    # Build the prompt in the model's expected Human/Assistant format,
    # with the system prompt prepended
    prompt = f"{get_system_prompt()}\n{history_text}Human: {message}\nAssistant:"
    inputs = None
    output_ids = None

    try:
        # Tokenize the prompt and move it to the model's device
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        input_length = inputs.input_ids.shape[1]
        print(f"\nInput token count: {input_length}")
    except Exception as e:
        print(f"!!! Error while processing the input: {e}")
        return f"Error: failed to process the input. ({e})"
    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation kwargs, handling an empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "repetition_penalty": 1.1
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation warning: no stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation finished in {generation_time:.2f}s.")
    except Exception as e:
        print(f"!!! Error during generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"Error: failed to generate a response. ({e})"
    # Decode only the newly generated tokens
    response = "Error: failed to generate a response."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output token count: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error while decoding the response: {e}")
            response = "Error: failed to decode the response."

    # Memory cleanup
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleanup complete.")

    return response.strip()
# --- Gradio interface setup ---
print("--- Setting up the Gradio interface ---")

examples = [
    ["Hello! Please introduce yourself."],
    ["What is the difference between artificial intelligence and machine learning?"],
    ["Walk me through the steps of training a deep learning model."],
    ["I'm planning a trip to Jeju Island; please suggest a 4-day, 3-night itinerary."],
]
# ChatInterface manages its own Chatbot component
demo = gr.ChatInterface(
    fn=predict,
    title=f"🤖 {MODEL_ID}",
    description=(
        f"**Model:** {MODEL_ID}\n"
    ),
    examples=examples,
    cache_examples=False,
    theme=gr.themes.Soft(),
)
# --- Run the application ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warmup because model loading failed.")

    print("--- Launching the Gradio app ---")
    demo.queue().launch(
        # share=True             # uncomment for a public link
        # server_name="0.0.0.0"  # uncomment for local-network access
    )