# from langchain import HuggingFaceHub, LLMChain
from langchain.llms import HuggingFacePipeline, HuggingFaceTextGenInference
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chat_models import ChatOpenAI
from langchain_groq import ChatGroq
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)


def get_llm_hf_online(inference_api_url=""):
    """Get an LLM served by the Hugging Face Inference API."""
    if not inference_api_url:  # fall back to the default API URL
        inference_api_url = (
            "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
        )

    llm = HuggingFaceTextGenInference(
        verbose=True,  # provides detailed logs of the operation
        max_new_tokens=1024,  # maximum number of tokens that can be generated
        top_p=0.95,  # nucleus-sampling threshold controlling randomness of generation
        temperature=0.1,
        inference_server_url=inference_api_url,
        timeout=10,  # timeout (seconds) for the connection to the inference server
    )

    return llm


def get_llm_hf_local(model_path):
    """Get a local LLM loaded through a Hugging Face transformers pipeline."""
    model = LlamaForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # max_length is deprecated in favor of max_new_tokens
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,  # better setting?
        model_kwargs={"temperature": 0.1},  # better setting?
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    return llm


def get_llm_openai_chat(model_name, inference_server_url):
    """Get an LLM served behind an OpenAI-compatible chat endpoint."""
    llm = ChatOpenAI(
        model=model_name,
        openai_api_key="EMPTY",  # placeholder key for self-hosted OpenAI-compatible servers
        openai_api_base=inference_server_url,
        max_tokens=1024,  # better setting?
        temperature=0,
    )

    return llm


def get_groq_chat(model_name="llama-3.1-70b-versatile"):
    """Get a chat LLM served by the Groq API."""
    llm = ChatGroq(temperature=0, model_name=model_name)
    return llm
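

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: shows how the
    # factory functions above might be called. It assumes GROQ_API_KEY is set
    # in the environment for get_groq_chat(), that the default Hugging Face
    # Inference API endpoint is reachable for get_llm_hf_online(), and that
    # "/models/my-llama" below is a hypothetical local checkpoint path.
    chat = get_groq_chat()  # hosted Groq chat model; returns an AIMessage from invoke()
    print(chat.invoke("Say hello in one word.").content)

    llm = get_llm_hf_online()  # Hugging Face Inference API; returns a plain string
    print(llm.invoke("Say hello in one word."))

    # Local inference is heavier (model download, GPU memory), so it is left
    # commented out here.
    # local_llm = get_llm_hf_local("/models/my-llama")
    # print(local_llm.invoke("Say hello in one word."))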