from fastapi import FastAPI
from llama_cpp import Llama

# Load a quantized Llama 3.2 1B Instruct model (GGUF) from the Hugging Face Hub
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-1b-instruct-q8_0.gguf",
)

# Sanity check: run a single chat completion at startup and print the reply
check = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "What is the capital of France?"}
    ]
)
print(check['choices'][0]['message']['content'])

## create a new FastAPI app instance
app = FastAPI()

@app.get("/")
def home():
    return {"message": "Hello World"}

# Define a function to handle the GET request at `/generate`
@app.get("/generate")
def generate(text: str):
    ## use the model to generate a chat completion from the given input text
    output = llm.create_chat_completion(
        messages=[
            {"role": "user", "content": text}
        ]
    )
    ## return the generated text in a JSON response
    return {"output": output['choices'][0]['message']['content']}
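
# A minimal sketch of how to run and query this app, assuming the file is
# saved as app.py (the filename, host, and port here are assumptions, not
# fixed by the code above):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/generate?text=What%20is%20the%20capital%20of%20France%3F"
#
# The curl call should return a JSON body like {"output": "..."} containing
# the model's reply to the `text` query parameter.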