Update app.py
app.py (CHANGED)
@@ -60,9 +60,32 @@ def str_to_json(str_obj):
 
 
 
+import subprocess
+
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+
+from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
+from accelerate import Accelerator
+
+
+subprocess.run(
+    "pip install psutil",
+
+    shell=True,
+)
+
+
+
 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407")
-model = AutoModelForCausalLM.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407", device_map="auto"
+model = AutoModelForCausalLM.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407", device_map="auto",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.bfloat16,
+    # quantization_config=quantization_config,
+    attn_implementation="flash_attention_2",
+
+) # to("cuda:0")
 terminators = [
     tokenizer.eos_token_id,
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
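One note on the flash-attn install step in this hunk: passing env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"} to subprocess.run replaces the child process environment entirely rather than adding a single variable to it. A minimal variant sketch that merges the flag into the existing environment (an assumption about intent, not part of this commit):

import os
import subprocess

# Copy the current environment and add the flag, so PATH and any CUDA-related
# variables stay visible to pip inside the child process.
env = os.environ.copy()
env["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"

subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env=env,
    shell=True,
    check=True,  # raise if the install fails instead of continuing silently
)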
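For context, the terminators list built after the model load is typically passed to model.generate as eos_token_id so generation stops on either the EOS token or <|eot_id|>. A minimal usage sketch, assuming the tokenizer, model, and terminators defined above; the message content and sampling parameters are illustrative, not taken from this commit:

import torch

messages = [{"role": "user", "content": "Hello!"}]

# Build prompt ids with the model's chat template and move them to the model's device.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,  # stop on either the EOS token or <|eot_id|>
        do_sample=True,
        temperature=0.7,
    )

# Decode only the newly generated tokens.
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))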