rishikumar20202023 committed
Commit 5ca3ab1 · verified · 1 Parent(s): f897284

Update app.py

Files changed (1): app.py (+24, -1)
app.py CHANGED
@@ -60,9 +60,32 @@ def str_to_json(str_obj):
 
 
 
+import subprocess
+
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+
+from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
+from accelerate import Accelerator
+
+
+subprocess.run(
+    "pip install psutil",
+
+    shell=True,
+)
+
+
+
 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407")
-model = AutoModelForCausalLM.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407", device_map="auto") # to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407", device_map="auto",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.bfloat16,
+    # quantization_config=quantization_config,
+    attn_implementation="flash_attention_2",
+
+) # to("cuda:0")
 terminators = [
     tokenizer.eos_token_id,
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
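
Taken together, the hunk installs flash-attn at startup (skipping its CUDA kernel build), pulls in accelerate helpers, and reloads the model in bfloat16 with FlashAttention-2. Below is a minimal runnable sketch of the section this commit produces, under two assumptions not shown in the hunk: torch and the transformers auto classes are imported earlier in app.py, and the pip subprocess should inherit the parent environment (the committed env={...} replaces it outright, which drops PATH for the child).

import os
import subprocess

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Install flash-attn without compiling its CUDA kernels at install time;
# FLASH_ATTENTION_SKIP_CUDA_BUILD tells its setup to skip the kernel build.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    check=True,
)

tokenizer = AutoTokenizer.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407")
model = AutoModelForCausalLM.from_pretrained(
    "HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407",
    device_map="auto",                        # let accelerate place weights across available devices
    low_cpu_mem_usage=True,                   # stream weights in instead of building a full CPU copy first
    torch_dtype=torch.bfloat16,               # half the memory footprint of float32
    attn_implementation="flash_attention_2",  # requires the flash-attn wheel installed above
)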
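The accelerate imports (init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch, Accelerator) are never used in this hunk; device_map="auto" performs the placement internally. For reference, the explicit route those imports enable would look roughly like the sketch below; the max_memory caps are illustrative values, not anything from the commit.

from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

# Build a weightless skeleton of the model, then compute a placement that
# honors per-device memory caps (the GiB figures here are made up).
config = AutoConfig.from_pretrained("HumanLLMs/Human-Like-Mistral-Nemo-Instruct-2407")
with init_empty_weights():
    skeleton = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(skeleton, max_memory={0: "20GiB", "cpu": "48GiB"})

# The explicit map can then replace device_map="auto" in from_pretrained.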
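The unchanged terminators list pairs the tokenizer's EOS id with the id of <|eot_id|>, a Llama-3-style end-of-turn token; it is worth verifying that the Mistral-Nemo tokenizer actually carries that token, since convert_tokens_to_ids falls back to the unknown-token id when it does not. Assuming app.py drives generation in the usual chat-template way (the generate call is not part of this diff), the list is consumed like so:

messages = [{"role": "user", "content": "Hello!"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,  # stop on either terminator id
)
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))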