import os
from fastapi import FastAPI, HTTPException
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from pydantic import BaseModel
import traceback
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from langchain_community.llms import HuggingFacePipeline
app = FastAPI()
# Get the Hugging Face API token from environment variables (BEST PRACTICE)
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if HUGGINGFACEHUB_API_TOKEN is None:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set.")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    token=HUGGINGFACEHUB_API_TOKEN,
)
#print(f"Tokenizer attributes: {dir(tokenizer)}")
# Pick an explicit device for tensors created outside the pipeline (used by the
# manual fallback below). The model itself is already placed by device_map="auto",
# so it is not moved again here.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Conversation memory keeps the running chat history
memory = ConversationBufferMemory()
# Wrap a Hugging Face text-generation pipeline for LangChain; max_new_tokens bounds the
# reply length so responses are not cut off at the pipeline's small default limit.
llm = HuggingFacePipeline(
    pipeline=pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
)
# The conversation chain combines the LLM with the buffer memory
conversation = ConversationChain(llm=llm, memory=memory)
class QuestionRequest(BaseModel):
    question: str

class ChatResponse(BaseModel):
    response: str
@app.post("/api/generate")
async def generate_text(request: QuestionRequest):
    try:
        response = conversation.predict(input=request.question)
        return {"response": response}
    except Exception as e:
        print("Error during generation:")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
# Fallback below: manual generation without the LangChain conversation chain
# try:
#     # Retrieve conversation history from memory
#     history = memory.load_memory_variables({})['history']
#     # Build a prompt from the history and the current question
#     prompt = f"History:\n{history}\nQuestion: {request.question}\nAnswer:"
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs=inputs['input_ids'],  # pass the 'input_ids' tensor
#             attention_mask=inputs['attention_mask'],
#             max_length=300,
#             num_beams=5,
#             no_repeat_ngram_size=2,
#             temperature=0.7,
#             top_k=50,
#             top_p=0.95,
#             do_sample=True,
#             eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
#             pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
#         )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return {"response": response}
# except Exception as e:
#     print("Error during generation:")
#     traceback.print_exc()
#     raise HTTPException(status_code=500, detail=str(e))
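# Minimal local-run sketch for quick testing (assumptions: this file is named app.py and
# port 8000 is free; a Hugging Face Space normally supplies its own entrypoint instead).
# After `python app.py`, the endpoint can be exercised with e.g.:
#   curl -X POST http://localhost:8000/api/generate \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What is the capital of France?"}'
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)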