import os
import traceback

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from langchain_community.llms import HuggingFacePipeline

app = FastAPI()
# Get the Hugging Face API token from environment variables (best practice: never hard-code it)
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if HUGGINGFACEHUB_API_TOKEN is None:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set.")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    token=HUGGINGFACEHUB_API_TOKEN,
)
# print(f"Tokenizer attributes: {dir(tokenizer)}")

# Pick the best available device and move the model there explicitly.
# device_map="auto" is dropped because combining it with model.to(device)
# can raise an error once accelerate has already dispatched the model.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
memory = ConversationBufferMemory()  # stores the running conversation history

# Wrap a transformers text-generation pipeline so LangChain can use it as an LLM
hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Conversation chain that feeds the buffered history back into each prompt
conversation = ConversationChain(llm=llm, memory=memory)
class QuestionRequest(BaseModel):
    question: str

class ChatResponse(BaseModel):
    response: str
# Route decorator added so the endpoint is actually registered; the "/chat" path is assumed.
@app.post("/chat", response_model=ChatResponse)
async def generate_text(request: QuestionRequest):
    try:
        response = conversation.predict(input=request.question)
        return {"response": response}
    except Exception as e:
        print("Error during generation:")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
# Alternative implementation without LangChain (kept for reference):
# try:
#     # Retrieve history
#     history = memory.load_memory_variables({})['history']
#     # Create prompt with history and current question
#     prompt = f"History:\n{history}\nQuestion: {request.question}\nAnswer:"
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs=inputs['input_ids'],  # Pass the 'input_ids' tensor
#             attention_mask=inputs['attention_mask'],
#             max_length=300,
#             num_beams=5,
#             no_repeat_ngram_size=2,
#             temperature=0.7,
#             top_k=50,
#             top_p=0.95,
#             do_sample=True,
#             eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
#             pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
#         )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return {"response": response}
# except Exception as e:
#     print("Error during generation:")
#     traceback.print_exc()
#     raise HTTPException(status_code=500, detail=str(e))
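
# A minimal sketch of how this app could be run and called locally. The module
# name ("app"), port 7860, route path "/chat", and example question are
# assumptions for illustration, not part of the original Space configuration.
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What is the capital of France?"}'
if __name__ == "__main__":
    import uvicorn

    # Running the server directly is optional; on Hugging Face Spaces the
    # platform entrypoint usually starts the app instead.
    uvicorn.run(app, host="0.0.0.0", port=7860)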