sreejith8100 committed on
Commit 657c17b · 1 Parent(s): 2184968

initial commit

Files changed (5)
  1. Dockerfile +22 -0
  2. client.py +63 -0
  3. endpoint_handler.py +95 -0
  4. main.py +50 -0
  5. requirements.txt +19 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
+
+ RUN apt-get update && apt-get install -y wget
+ RUN useradd -m -u 1000 user
+
+ USER user
+ WORKDIR /app
+
+ ENV PATH="/home/user/.local/bin:$PATH"
+ ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
+ ENV TORCH_CUDA_ARCH_LIST="8.0+PTX"
+
+ RUN wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.4/flash_attn-2.7.3+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ RUN pip install ./flash_attn-2.7.3+cu121torch2.3-cp310-cp310-linux_x86_64.whl && rm flash_attn-2.7.3+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+
+ COPY --chown=user requirements.txt .
+ RUN pip install --upgrade pip setuptools wheel
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY --chown=user . .
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
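The wheel installed above is prebuilt for CPython 3.10, torch 2.3 and CUDA 12.1, which is what the base image ships. A minimal sanity-check sketch to run inside the container (the expected version strings are assumptions read off the wheel filename):

import sys
import torch

print(sys.version_info[:2])    # wheel is cp310, so expect (3, 10)
print(torch.__version__)       # wheel targets torch 2.3.x
print(torch.version.cuda)      # wheel targets CUDA 12.1

import flash_attn              # a Python/torch/CUDA mismatch usually fails right here
print(flash_attn.__version__)  # expect 2.7.3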
client.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import base64
+ import json
+ import requests
+ from PIL import Image
+ import urllib3
+
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+ orig_path = r"D:\grcpsample\content\extracted\images_8184456bc26d4e7f9c5237c350fe20ba\page_4.png"
+ resized_path = r"D:\grcpsample\content\extracted\images_8184456bc26d4e7f9c5237c350fe20ba\page_4_small.jpg"
+
+ # 1. Load and downscale
+ img = Image.open(orig_path)
+ max_side = 800  # tweak to 600 or 512 if still too large
+ scale = max_side / max(img.size)
+ if scale < 1.0:
+     new_size = (int(img.width * scale), int(img.height * scale))
+     img = img.resize(new_size, Image.LANCZOS)
+
+ # 2. Save as JPEG at 70% quality
+ img.save(resized_path, format="JPEG", quality=70)
+
+ # 3. Print new on-disk size
+ new_size_kb = os.path.getsize(resized_path) / 1024
+ print(f"Resized JPEG size: {new_size_kb:.2f} KB")  # aim for ≤ 500 KB
+
+ # 4. Base64-encode and print that size
+ with open(resized_path, "rb") as f:
+     img_bytes = f.read()
+ b64 = base64.b64encode(img_bytes).decode("utf-8")
+ b64_kb = len(b64.encode("utf-8")) / 1024
+ print(f"Base64 size: {b64_kb:.2f} KB")  # expect ~1.33× raw size
+
+ # 5. Build payload and measure final JSON
+ payload = {
+     "inputs": {
+         "image": b64,
+         "question": "What is in the image?",
+         "stream": True
+     }
+ }
+ json_payload = json.dumps(payload)
+ final_kb = len(json_payload.encode("utf-8")) / 1024
+ print(f"Final JSON payload: {final_kb:.2f} KB")  # want < ~700 KB
+
+ # 6. POST to the Space
+ # Note: huggingface.co/spaces/... serves the web page; the app itself is exposed on the *.hf.space host (underscores in the Space name become hyphens)
+ url = "https://sreejith8100-llm-model.hf.space/predict"
+ headers = {"Content-Type": "application/json"}
+
+ try:
+     with requests.post(url, data=json_payload, headers=headers, stream=True, verify=False) as resp:
+         resp.raise_for_status()
+         for line in resp.iter_lines(decode_unicode=True):
+             if line.startswith("data: "):
+                 chunk = json.loads(line[len("data: "):])
+                 if chunk.get("output"):
+                     print(chunk["output"], end="", flush=True)
+ except requests.HTTPError as e:
+     body = e.response.text if e.response is not None else ""
+     print(f"HTTP error: {e}, body:\n{body}")
+     raise
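The ~1.33× factor checked in step 4 is plain base64 overhead: every 3 raw bytes are encoded as 4 ASCII characters, plus up to two padding characters. A minimal illustration:

import base64

raw = bytes(500 * 1024)         # stand-in for a ~500 KB JPEG
encoded = base64.b64encode(raw)
print(len(encoded) / len(raw))  # ~1.333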
endpoint_handler.py ADDED
@@ -0,0 +1,95 @@
+ import torch
+ from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
+ from io import BytesIO
+ import base64
+ from huggingface_hub import login
+ import os
+
+ class EndpointHandler:
+     def __init__(self, model_dir=None):
+         print("[Init] Initializing EndpointHandler...")
+         self.load_model()
+
+     def load_model(self):
+         hf_token = os.getenv("HF_TOKEN")
+         model_path = "openbmb/MiniCPM-o-2_6"  # use model repo name directly
+
+         if hf_token:
+             print("[Auth] Logging into Hugging Face Hub with token...")
+             login(token=hf_token)
+
+         print(f"[Model Load] Loading model from: {model_path}")
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+             self.model = AutoModel.from_pretrained(
+                 model_path,
+                 trust_remote_code=True,
+                 attn_implementation='sdpa',
+                 torch_dtype='auto',  # safer on Spaces
+                 init_vision=True,
+                 init_audio=False,
+                 init_tts=False
+             ).eval().cuda()
+             print("[Model Load] Model successfully loaded and moved to CUDA.")
+         except Exception as e:
+             print(f"[Model Load Error] {e}")
+             raise RuntimeError(f"Failed to load model: {e}")
+
+     def load_image(self, image_base64):
+         try:
+             print("[Image Load] Decoding base64 image...")
+             image_bytes = base64.b64decode(image_base64)
+             image = Image.open(BytesIO(image_bytes)).convert("RGB")
+             print("[Image Load] Image successfully decoded and converted to RGB.")
+             return image
+         except Exception as e:
+             print(f"[Image Load Error] {e}")
+             raise ValueError(f"Failed to open image from base64 string: {e}")
+
+     def predict(self, request):
+         print(f"[Predict] Received request: {request}")
+
+         image_base64 = request.get("inputs", {}).get("image")
+         question = request.get("inputs", {}).get("question")
+         stream = request.get("inputs", {}).get("stream", False)
+
+         if not image_base64 or not question:
+             print("[Predict Error] Missing 'image' or 'question' in the request.")
+             return {"error": "Missing 'image' or 'question' in inputs."}
+
+         try:
+             image = self.load_image(image_base64)
+             msgs = [{"role": "user", "content": [image, question]}]
+
+             print(f"[Predict] Asking model with question: {question}")
+             print("[Predict] Starting chat inference...")
+
+             res = self.model.chat(
+                 image=None,
+                 msgs=msgs,
+                 tokenizer=self.tokenizer,
+                 sampling=True,
+                 stream=stream
+             )
+
+             if stream:
+                 # Wrap the loop in a nested generator: a bare `yield` here
+                 # would turn predict() itself into a generator, so the
+                 # non-stream `return` below would never reach the caller.
+                 def token_stream():
+                     for new_text in res:
+                         yield {"output": new_text}
+                 return token_stream()
+             else:
+                 generated_text = "".join(res)
+                 print("[Predict] Inference complete.")
+                 return {"output": generated_text}
+
+         except Exception as e:
+             print(f"[Predict Error] {e}")
+             return {"error": str(e)}
+
+     def __call__(self, data):
+         print("[__call__] Invoked handler with data.")
+         return self.predict(data)
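For debugging the handler in isolation, a minimal smoke test might look like the sketch below. It assumes a CUDA GPU is available and that the MiniCPM-o weights can be downloaded; the generated blank image is just a placeholder for a real page scan.

import base64
from io import BytesIO

from PIL import Image

from endpoint_handler import EndpointHandler

# Tiny in-memory JPEG as a stand-in for a real input image.
buf = BytesIO()
Image.new("RGB", (64, 64), "white").save(buf, format="JPEG")
b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

handler = EndpointHandler()  # loads the model; needs a GPU
result = handler.predict({
    "inputs": {"image": b64, "question": "Describe the image.", "stream": False}
})
print(result)  # expect {"output": "..."} on success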
main.py ADDED
@@ -0,0 +1,50 @@
+ from fastapi import FastAPI
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from pydantic import BaseModel
+ import types
+ import json
+
+ from endpoint_handler import EndpointHandler  # your handler file
+
+ app = FastAPI()
+
+ handler = None
+
+ @app.on_event("startup")
+ async def load_handler():
+     global handler
+     handler = EndpointHandler()
+
+ class PredictInput(BaseModel):
+     image: str  # base64-encoded image string
+     question: str
+     stream: bool = False
+
+ class PredictRequest(BaseModel):
+     inputs: PredictInput
+
+ @app.get("/")
+ async def root():
+     return {"message": "FastAPI app is running on Hugging Face"}
+
+ @app.post("/predict")
+ async def predict_endpoint(payload: PredictRequest):
+     print(f"[Request] Received question: {payload.inputs.question}")
+
+     data = {
+         "inputs": {
+             "image": payload.inputs.image,
+             "question": payload.inputs.question,
+             "stream": payload.inputs.stream
+         }
+     }
+
+     result = handler.predict(data)
+
+     if isinstance(result, types.GeneratorType):
+         def event_stream():
+             for chunk in result:
+                 yield f"data: {json.dumps(chunk)}\n\n"
+         return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+     return JSONResponse(content=result)
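The streaming path can be exercised in-process with FastAPI's TestClient, without deploying the Space. A sketch, assuming a local test image named page.jpg and a machine that can actually load the model (entering the client as a context manager runs the startup event):

import base64
import json

from fastapi.testclient import TestClient

from main import app

with TestClient(app) as client:  # startup event loads the handler
    with open("page.jpg", "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    payload = {"inputs": {"image": b64, "question": "What is in the image?", "stream": True}}
    with client.stream("POST", "/predict", json=payload) as resp:
        for line in resp.iter_lines():
            if line.startswith("data: "):
                print(json.loads(line[len("data: "):]).get("output", ""), end="")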
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ Pillow==10.1.0
+ torch==2.3.1
+ torchaudio==2.3.1
+ torchvision==0.18.1
+ transformers==4.44.2
+ librosa==0.9.0
+ soundfile==0.12.1
+ vector-quantize-pytorch==1.18.5
+ vocos==0.1.0
+ decord
+ moviepy
+ einops
+ accelerate
+ openbmb
+ fastapi
+ uvicorn[standard]
+ timm>=0.6.13
+ sentencepiece>=0.1.99
+ python-multipart