import os
import traceback

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from langchain_community.llms import HuggingFacePipeline

app = FastAPI()

# Get the Hugging Face API token from environment variables
# (best practice: never hard-code credentials).
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if HUGGINGFACEHUB_API_TOKEN is None:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set.")

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    token=HUGGINGFACEHUB_API_TOKEN,
)
# print(f"Tokenizer attributes: {dir(tokenizer)}")

# Pick the best available device. Note: with device_map="auto" the model is
# already dispatched by accelerate, so we must NOT call model.to(device);
# the variable is kept only for the manual generation path further below.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Conversation memory so follow-up questions can reference earlier turns.
memory = ConversationBufferMemory()

# Wrap a Hugging Face text-generation pipeline for LangChain.
# max_new_tokens and return_full_text=False are added defaults so the reply
# has room to grow and does not echo the full prompt back to the client.
llm = HuggingFacePipeline(
    pipeline=pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        return_full_text=False,
    )
)

# Chain that feeds the buffered history plus the new question to the LLM.
conversation = ConversationChain(llm=llm, memory=memory)


class QuestionRequest(BaseModel):
    question: str


class ChatResponse(BaseModel):
    response: str


@app.post("/api/generate", response_model=ChatResponse)
async def generate_text(request: QuestionRequest):
    try:
        response = conversation.predict(input=request.question)
        return {"response": response}
    except Exception as e:
        print("Error during generation:")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))

# Alternative implementation without LangChain (kept for reference):
# try:
#     # Retrieve history
#     history = memory.load_memory_variables({})['history']
#     # Create prompt with history and current question
#     prompt = f"History:\n{history}\nQuestion: {request.question}\nAnswer:"
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs=inputs['input_ids'],  # pass the 'input_ids' tensor
#             attention_mask=inputs['attention_mask'],
#             max_length=300,
#             num_beams=5,
#             no_repeat_ngram_size=2,
#             temperature=0.7,
#             top_k=50,
#             top_p=0.95,
#             do_sample=True,
#             eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
#             pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
#         )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return {"response": response}
# except Exception as e:
#     print("Error during generation:")
#     traceback.print_exc()
#     raise HTTPException(status_code=500, detail=str(e))
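
# Minimal way to run and exercise the service (an illustrative sketch: the
# host, port, and example question below are assumptions, not part of the
# original setup; requires `pip install uvicorn`).
if __name__ == "__main__":
    import uvicorn

    # Serve the FastAPI app locally.
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is up:
#   curl -X POST http://localhost:8000/api/generate \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What is the capital of France?"}'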