rishikumar20202023 committed
Commit 5ca3ab1 · verified · 1 Parent(s): f897284

Update app.py

Files changed (1): app.py (+24, -1)
app.py CHANGED
@@ -60,9 +60,32 @@ def str_to_json(str_obj):
 
 
 
+import subprocess
+
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+
+from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
+from accelerate import Accelerator
+
+
+subprocess.run(
+    "pip install psutil",
+
+    shell=True,
+)
+
+
+
 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407")
-model = AutoModelForCausalLM.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407", device_map="auto") # to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407", device_map="auto",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.bfloat16,
+    # quantization_config=quantization_config,
+    attn_implementation="flash_attention_2",
+
+) # to("cuda:0")
 terminators = [
     tokenizer.eos_token_id,
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
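
Taken together, the hunk installs flash-attn at startup (skipping its CUDA kernel build), pulls in accelerate helpers, and reloads the model in bfloat16 with FlashAttention-2. Below is a minimal runnable sketch of the section this commit produces, under two assumptions not shown in the hunk: torch and the transformers auto classes are imported earlier in app.py, and the pip subprocess should inherit the parent environment (the committed env={...} replaces it outright, which drops PATH for the child).

import os
import subprocess

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Install flash-attn without compiling its CUDA kernels at install time;
# FLASH_ATTENTION_SKIP_CUDA_BUILD tells its setup to skip the kernel build.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    check=True,
)

tokenizer = AutoTokenizer.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407")
model = AutoModelForCausalLM.from_pretrained(
    "HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407",
    device_map="auto",                        # let accelerate place weights across available devices
    low_cpu_mem_usage=True,                   # stream weights in instead of building a full CPU copy first
    torch_dtype=torch.bfloat16,               # half the memory footprint of float32
    attn_implementation="flash_attention_2",  # requires the flash-attn wheel installed above
)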
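The accelerate imports (init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch, Accelerator) are never used in this hunk; device_map="auto" performs the placement internally. For reference, the explicit route those imports enable would look roughly like the sketch below; the max_memory caps are illustrative values, not anything from the commit.

from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

# Build a weightless skeleton of the model, then compute a placement that
# honors per-device memory caps (the GiB figures here are made up).
config = AutoConfig.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407")
with init_empty_weights():
    skeleton = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(skeleton, max_memory={0: "20GiB", "cpu": "48GiB"})

# The explicit map can then replace device_map="auto" in from_pretrained.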
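The unchanged terminators list pairs the tokenizer's EOS id with the id of <|eot_id|>, a Llama-3-style end-of-turn token; it is worth verifying that the Mistral-Nemo tokenizer actually carries that token, since convert_tokens_to_ids falls back to the unknown-token id when it does not. Assuming app.py drives generation in the usual chat-template way (the generate call is not part of this diff), the list is consumed like so:

messages = [{"role": "user", "content": "Hello!"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,  # stop on either terminator id
)
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))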