from dotenv import load_dotenv
import logging

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from langchain_huggingface import HuggingFacePipeline
from langchain.globals import set_debug, set_verbose

from config import HF_MODEL_ID, LLM_VERBOSE

set_verbose(LLM_VERBOSE)
set_debug(LLM_VERBOSE)

logger = logging.getLogger(__name__)

load_dotenv()

cuda_check = torch.cuda.is_available()
logger.info(f"torch.cuda.is_available : {cuda_check}")
print(f"> torch.cuda.is_available : {cuda_check}")

# Load Llama3 model and tokenizer
model_id = HF_MODEL_ID
tokenizer = AutoTokenizer.from_pretrained(model_id)

# BitsAndBytes 4-bit (NF4) quantization config
# device_map = {"": 0}
device_map = "auto"
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    # attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
)
# Avoid pad-token warnings when the tokenizer defines no dedicated pad token
model.generation_config.pad_token_id = tokenizer.eos_token_id

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    return_full_text=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    temperature=0.0001,  # near-zero temperature makes sampling effectively greedy
    do_sample=True,
)

llm = HuggingFacePipeline(pipeline=pipe)
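
# Minimal usage sketch (assumption: this module is normally imported for its `llm`
# object, and run directly only as a quick smoke test; the prompt below is illustrative).
# HuggingFacePipeline is a LangChain Runnable, so .invoke() takes a prompt string
# and returns the generated text as a string.
if __name__ == "__main__":
    sample = llm.invoke("Briefly explain what 4-bit NF4 quantization does.")
    print(f"> sample generation: {sample}")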