from dotenv import load_dotenv

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GPT2TokenizerFast,
    LlamaForCausalLM,
    LlamaTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    pipeline,
)

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.conversation.memory import (
    ConversationBufferMemory,
    ConversationBufferWindowMemory,
)
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, UnstructuredHTMLLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import HuggingFaceHub, HuggingFacePipeline, HuggingFaceTextGenInference
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.vectorstores import Chroma

def get_llm_hf_online(inference_api_url=""):
    """Get an LLM served over the Hugging Face text-generation-inference API."""
    if not inference_api_url:
        # Fall back to the hosted zephyr-7b-beta endpoint on the HF Inference API.
        inference_api_url = (
            "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
        )

    llm = HuggingFaceTextGenInference(
        inference_server_url=inference_api_url,
        max_new_tokens=1024,
        top_p=0.95,
        typical_p=0.95,
        temperature=0.1,
        timeout=10,
        verbose=True,
    )
    return llm

def get_llm_hf_local(model_path):
    """Get an LLM from a local Hugging Face checkpoint."""
    # Spread the model weights across available devices (GPU/CPU) automatically.
    model = LlamaForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Wrap the model in a transformers text-generation pipeline so LangChain can drive it.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        model_kwargs={"temperature": 0.1},
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm

def get_llm_openai_chat(model_name, inference_server_url, langfuse_callback=None):
    """Get an LLM served behind an OpenAI-compatible chat API."""
    # Forward the optional Langfuse callback so generations are traced.
    callbacks = [langfuse_callback] if langfuse_callback else None

    llm = ChatOpenAI(
        model=model_name,
        openai_api_key="EMPTY",
        openai_api_base=inference_server_url,
        max_tokens=1024,
        temperature=0,
        callbacks=callbacks,
    )
    return llm

def get_chat_vllm(model_name, inference_server_url, langfuse_callback=None):
    """Get a chat model served by vLLM through its OpenAI-compatible API."""
    # Forward the optional Langfuse callback so generations are traced.
    callbacks = [langfuse_callback] if langfuse_callback else None

    chat = ChatOpenAI(
        model=model_name,
        openai_api_key="EMPTY",
        openai_api_base=inference_server_url,
        max_tokens=512,
        temperature=0.1,
        callbacks=callbacks,
    )
    return chat
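
# A minimal usage sketch: how one of these helpers might be wired into a simple
# retrieval-QA chain. The document path, embedding model, and served model name
# below are hypothetical placeholders, not values taken from this repository.
if __name__ == "__main__":
    load_dotenv()

    # Load and chunk a local document (hypothetical path).
    docs = TextLoader("docs/example.txt").load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)

    # Embed the chunks and index them in a local Chroma store.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = Chroma.from_documents(chunks, embeddings)

    # Any of the helpers above could supply the LLM; here we assume an
    # OpenAI-compatible server (e.g. vLLM) is listening locally.
    llm = get_llm_openai_chat("my-served-model", "http://localhost:8000/v1")

    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())
    print(qa.run("What is this document about?"))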