husseinhug321 committed
Commit 594ce5e · verified · 1 Parent(s): 4285c2d

Update llms.py

Files changed (1)
  1. llms.py +59 -70
llms.py CHANGED
@@ -1,70 +1,59 @@
- from dotenv import load_dotenv
- import logging
-
- import torch
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     BitsAndBytesConfig,
-     AutoTokenizer,
-     BitsAndBytesConfig,
-     pipeline,
- )
-
- from langchain_huggingface import HuggingFacePipeline
- from langchain.globals import set_debug
- from langchain.globals import set_verbose
-
- from config import HF_MODEL_ID
- from config import LLM_VERBOSE
-
- set_verbose(LLM_VERBOSE)
- set_debug(LLM_VERBOSE)
-
- logger = logging.getLogger(__name__)
- load_dotenv()
-
- cuda_check = torch.cuda.is_available()
- logger.info(f"torch.cuda.is_available : {cuda_check}")
- print(f"> torch.cuda.is_available : {cuda_check}")
-
- # Load Llama3 model and tokenizer
- model_id = HF_MODEL_ID
-
- tokenizer = AutoTokenizer.from_pretrained(model_id)
-
- # BitsAndBytesConfig int-4 config
- # device_map = {"": 0}
- device_map = "auto"
- compute_dtype = getattr(torch, "float16")
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_use_double_quant=False,
-     bnb_4bit_quant_type="nf4",
-     # bnb_4bit_compute_dtype=torch.bfloat16
-     bnb_4bit_compute_dtype=compute_dtype,
-     # bnb_4bit_use_double_quant=False,
- )
-
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     device_map=device_map,
-     # attn_implementation="flash_attention_2",
-     quantization_config=bnb_config,
- )
-
- model.generation_config.pad_token_id = tokenizer.eos_token_id
-
- pipe = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     max_new_tokens=50,
-     return_full_text=False,
-     num_return_sequences=1,
-     eos_token_id=tokenizer.eos_token_id,
-     temperature=0.0001,
-     do_sample=True,
- )
-
- llm = HuggingFacePipeline(pipeline=pipe)
+ from dotenv import load_dotenv
+ import logging
+
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     pipeline,
+ )
+
+ from langchain_huggingface import HuggingFacePipeline
+ from langchain.globals import set_debug
+ from langchain.globals import set_verbose
+
+ from config import HF_MODEL_ID
+ from config import LLM_VERBOSE
+
+ set_verbose(LLM_VERBOSE)
+ set_debug(LLM_VERBOSE)
+
+ logger = logging.getLogger(__name__)
+ load_dotenv()
+
+ cuda_check = torch.cuda.is_available()
+ logger.info(f"torch.cuda.is_available : {cuda_check}")
+ print(f"> torch.cuda.is_available : {cuda_check}")
+
+ # Load Llama3 model and tokenizer
+ model_id = HF_MODEL_ID
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ # BitsAndBytesConfig int-4 config
+ # device_map = {"": 0}
+ device_map = "auto"
+ compute_dtype = getattr(torch, "float16")
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map=device_map,
+     # attn_implementation="flash_attention_2",
+     # quantization_config=bnb_config,
+ )
+
+ model.generation_config.pad_token_id = tokenizer.eos_token_id
+
+ pipe = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     max_new_tokens=50,
+     return_full_text=False,
+     num_return_sequences=1,
+     eos_token_id=tokenizer.eos_token_id,
+     temperature=0.0001,
+     do_sample=True,
+ )
+
+ llm = HuggingFacePipeline(pipeline=pipe)
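
For context on how the module fits together: the committed llms.py only constructs the `llm` object. Below is a minimal usage sketch of how that object might be consumed from another module via LangChain's LCEL composition. The prompt text, import of `llm`, and chain wiring are illustrative assumptions for this commit page, not code from the repository.

# Hypothetical consumer of the llm defined in llms.py (not part of this commit).
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from llms import llm  # the HuggingFacePipeline instance built above

# Illustrative prompt; the real project likely defines its own templates.
prompt = PromptTemplate.from_template("Answer briefly: {question}")

# prompt -> model -> plain-string output
chain = prompt | llm | StrOutputParser()

if __name__ == "__main__":
    print(chain.invoke({"question": "What does 4-bit quantization trade off?"}))

Piping through StrOutputParser keeps downstream callers working with plain strings rather than raw pipeline output, which matches return_full_text=False in the committed pipeline config.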