# LISA-demo / llms.py
# from langchain import HuggingFaceHub, LLMChain
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    T5Tokenizer,
    T5ForConditionalGeneration,
    GPT2TokenizerFast,
)
from transformers import LlamaForCausalLM, LlamaTokenizer
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    PromptTemplate,
)
from langchain.chat_models import ChatOpenAI
# from langchain_openai import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.document_loaders import TextLoader, UnstructuredHTMLLoader, PyPDFLoader
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.llms import HuggingFaceHub
from dotenv import load_dotenv
from langchain.llms import HuggingFaceTextGenInference
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.conversation.memory import (
    ConversationBufferMemory,
    ConversationBufferWindowMemory,
)


def get_llm_hf_online(inference_api_url=""):
    """Get an LLM served through the Hugging Face Inference API."""
    if not inference_api_url:  # fall back to the default API URL
        inference_api_url = (
            "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
        )

    llm = HuggingFaceTextGenInference(
        # cache=None,  # optional: whether to use a cache
        verbose=True,  # provide detailed logs of the operation
        # callbacks=[StreamingStdOutCallbackHandler()],  # handle streamed output
        max_new_tokens=1024,  # maximum number of tokens to generate
        # top_k=2,  # number of top-k tokens considered during generation
        top_p=0.95,  # nucleus-sampling threshold controlling randomness
        typical_p=0.95,  # locally typical sampling mass
        temperature=0.1,  # low temperature keeps output close to the most probable words
        # repetition_penalty=None,  # penalty for repeated tokens
        # truncate=None,  # truncate the input tokens to the given size
        # stop_sequences=None,  # list of stop sequences for generation
        inference_server_url=inference_api_url,  # URL of the inference server
        timeout=10,  # connection timeout (seconds)
        # streaming=True,  # stream the answer
    )
    return llm
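
# Hedged usage sketch (not part of the original demo): a small helper showing
# how the factory above is meant to be called. It assumes the default Inference
# API endpoint is reachable without extra authentication headers; the prompt is
# purely illustrative.
def _example_hf_online():
    llm = get_llm_hf_online()  # uses the default zephyr-7b-beta endpoint
    return llm.predict("What is retrieval-augmented generation?")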


def get_llm_hf_local(model_path):
    """Get a local LLM wrapped in a LangChain HuggingFacePipeline."""
    # e.g. model_path = "/mnt/localstorage/yinghan/llm/orca_mini_v3_13b"
    # e.g. model_path = "/mnt/localstorage/yinghan/llm/zephyr-7b-beta"
    model = LlamaForCausalLM.from_pretrained(
        model_path, device_map="auto"
    )  # , load_in_8bit=True)
    # AutoModelForCausalLM.from_pretrained(...) works as well: it resolves the
    # architecture from the checkpoint config, so for Llama-family checkpoints
    # the two are equivalent.
    # model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")  # , load_in_8bit=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # build a text-generation pipeline
    # (max_length is deprecated in favour of max_new_tokens)
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,  # could be tuned
        model_kwargs={"temperature": 0.1},  # could be tuned
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm
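
# Hedged usage sketch (not part of the original demo): the checkpoint path
# below is a hypothetical placeholder; it must point to a local Hugging Face
# format model directory, with enough memory available to load the weights.
def _example_hf_local():
    llm = get_llm_hf_local("/path/to/zephyr-7b-beta")  # placeholder path
    return llm.predict("Explain in one sentence what an embedding is.")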


def get_llm_openai_chat(model_name, inference_server_url, langfuse_callback=None):
    """Get an LLM served behind an OpenAI-compatible API."""
    # Some defaults
    # model_name = "openchat/openchat_3.5"
    # inference_server_url = "http://localhost:8080/v1"
    llm = ChatOpenAI(
        model=model_name,
        openai_api_key="EMPTY",
        openai_api_base=inference_server_url,
        max_tokens=1024,  # could be tuned
        temperature=0,  # default is 0.7; could be tuned
        # callbacks=[langfuse_callback],
    )
    # The following is not required for building a normal llm:
    # use the Ragas LangchainLLM wrapper to create a RagasLLM instance
    # vllm = LangchainLLM(llm=llm)
    # return vllm
    return llm
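
# Hedged usage sketch (not part of the original demo): model name and base URL
# are placeholders; an OpenAI-compatible server (e.g. vLLM) must already be
# serving that model at the given address.
def _example_openai_chat():
    llm = get_llm_openai_chat(
        model_name="openchat/openchat_3.5",  # placeholder
        inference_server_url="http://localhost:8080/v1",  # placeholder
    )
    return llm.predict("Hello, which model are you?")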


def get_chat_vllm(model_name, inference_server_url, langfuse_callback=None):
    """Get a chat model served by vLLM through its OpenAI-compatible API."""
    # TODO: consolidate with get_llm_openai_chat; the two differ only in
    # max_tokens and temperature.
    # Some defaults
    # model_name = "openchat/openchat_3.5"
    # inference_server_url = "http://localhost:8080/v1"
    chat = ChatOpenAI(
        model=model_name,
        openai_api_key="EMPTY",
        openai_api_base=inference_server_url,
        max_tokens=512,  # could be tuned
        temperature=0.1,  # default is 0.7; could be tuned
        # callbacks=[langfuse_callback],
    )
    # The following is not required for building a normal llm:
    # use the Ragas LangchainLLM wrapper to create a RagasLLM instance
    # vllm = LangchainLLM(llm=chat)
    # return vllm
    return chat
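

# Hedged smoke test (not part of the original demo): runs only when this module
# is executed directly, and assumes a vLLM (or other OpenAI-compatible) server
# is serving the named model at the placeholder URL below.
if __name__ == "__main__":
    chat = get_chat_vllm(
        model_name="openchat/openchat_3.5",  # placeholder
        inference_server_url="http://localhost:8080/v1",  # placeholder
    )
    print(chat.predict("Give a one-sentence description of LangChain."))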