sreejith8100 committed on
Commit 657c17b · 1 Parent(s): 2184968

initial commit

Files changed (5)
  1. Dockerfile +22 -0
  2. client.py +63 -0
  3. endpoint_handler.py +95 -0
  4. main.py +50 -0
  5. requirements.txt +19 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
+
+ RUN apt-get update && apt-get install -y wget
+ RUN useradd -m -u 1000 user
+
+ USER user
+ WORKDIR /app
+
+ ENV PATH="/home/user/.local/bin:$PATH"
+ ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
+ ENV TORCH_CUDA_ARCH_LIST="8.0+PTX"
+
+ RUN wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.4/flash_attn-2.7.3+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ RUN pip install ./flash_attn-2.7.3+cu121torch2.3-cp310-cp310-linux_x86_64.whl && rm flash_attn-2.7.3+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+
+ COPY --chown=user requirements.txt .
+ RUN pip install --upgrade pip setuptools wheel
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY --chown=user . .
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
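The wheel installed above is prebuilt for CPython 3.10, torch 2.3 and CUDA 12.1, which is what the base image ships. A minimal sanity-check sketch to run inside the container (the expected version strings are assumptions read off the wheel filename):

import sys
import torch

print(sys.version_info[:2])    # wheel is cp310, so expect (3, 10)
print(torch.__version__)       # wheel targets torch 2.3.x
print(torch.version.cuda)      # wheel targets CUDA 12.1

import flash_attn              # a Python/torch/CUDA mismatch usually fails right here
print(flash_attn.__version__)  # expect 2.7.3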
client.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import base64
+ import json
+ import requests
+ from PIL import Image
+ import urllib3
+
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+ orig_path = r"D:\grcpsample\content\extracted\images_8184456bc26d4e7f9c5237c350fe20ba\page_4.png"
+ resized_path = r"D:\grcpsample\content\extracted\images_8184456bc26d4e7f9c5237c350fe20ba\page_4_small.jpg"
+
+ # 1. Load and downscale
+ img = Image.open(orig_path)
+ max_side = 800  # tweak to 600 or 512 if still too large
+ scale = max_side / max(img.size)
+ if scale < 1.0:
+     new_size = (int(img.width * scale), int(img.height * scale))
+     img = img.resize(new_size, Image.LANCZOS)
+
+ # 2. Save as JPEG at 70% quality
+ img.save(resized_path, format="JPEG", quality=70)
+
+ # 3. Print new on-disk size
+ new_size_kb = os.path.getsize(resized_path) / 1024
+ print(f"Resized JPEG size: {new_size_kb:.2f} KB")  # aim for ≤ 500 KB
+
+ # 4. Base64-encode and print that size
+ with open(resized_path, "rb") as f:
+     img_bytes = f.read()
+ b64 = base64.b64encode(img_bytes).decode("utf-8")
+ b64_kb = len(b64.encode("utf-8")) / 1024
+ print(f"Base64 size: {b64_kb:.2f} KB")  # expect ~1.33× raw size
+
+ # 5. Build payload and measure final JSON
+ payload = {
+     "inputs": {
+         "image": b64,
+         "question": "What is in the image?",
+         "stream": True
+     }
+ }
+ json_payload = json.dumps(payload)
+ final_kb = len(json_payload.encode("utf-8")) / 1024
+ print(f"Final JSON payload: {final_kb:.2f} KB")  # want < ~700 KB
+
+ # 6. POST to the Space
+ # Note: huggingface.co/spaces/... serves the web page; the app itself is exposed on the *.hf.space host (underscores in the Space name become hyphens)
+ url = "https://sreejith8100-llm-model.hf.space/predict"
+ headers = {"Content-Type": "application/json"}
+
+ try:
+     with requests.post(url, data=json_payload, headers=headers, stream=True, verify=False) as resp:
+         resp.raise_for_status()
+         for line in resp.iter_lines(decode_unicode=True):
+             if line.startswith("data: "):
+                 chunk = json.loads(line[len("data: "):])
+                 if chunk.get("output"):
+                     print(chunk["output"], end="", flush=True)
+ except requests.HTTPError as e:
+     body = e.response.text if e.response is not None else ""
+     print(f"HTTP error: {e}, body:\n{body}")
+     raise
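The ~1.33× factor checked in step 4 is plain base64 overhead: every 3 raw bytes are encoded as 4 ASCII characters, plus up to two padding characters. A minimal illustration:

import base64

raw = bytes(500 * 1024)         # stand-in for a ~500 KB JPEG
encoded = base64.b64encode(raw)
print(len(encoded) / len(raw))  # ~1.333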
endpoint_handler.py ADDED
@@ -0,0 +1,95 @@
+ import torch
+ from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
+ from io import BytesIO
+ import base64
+ from huggingface_hub import login
+ import os
+
+ class EndpointHandler:
+     def __init__(self, model_dir=None):
+         print("[Init] Initializing EndpointHandler...")
+         self.load_model()
+
+     def load_model(self):
+         hf_token = os.getenv("HF_TOKEN")
+         model_path = "openbmb/MiniCPM-o-2_6"  # use model repo name directly
+
+         if hf_token:
+             print("[Auth] Logging into Hugging Face Hub with token...")
+             login(token=hf_token)
+
+         print(f"[Model Load] Loading model from: {model_path}")
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+             self.model = AutoModel.from_pretrained(
+                 model_path,
+                 trust_remote_code=True,
+                 attn_implementation='sdpa',
+                 torch_dtype='auto',  # safer on Spaces
+                 init_vision=True,
+                 init_audio=False,
+                 init_tts=False
+             ).eval().cuda()
+             print("[Model Load] Model successfully loaded and moved to CUDA.")
+         except Exception as e:
+             print(f"[Model Load Error] {e}")
+             raise RuntimeError(f"Failed to load model: {e}")
+
+     def load_image(self, image_base64):
+         try:
+             print("[Image Load] Decoding base64 image...")
+             image_bytes = base64.b64decode(image_base64)
+             image = Image.open(BytesIO(image_bytes)).convert("RGB")
+             print("[Image Load] Image successfully decoded and converted to RGB.")
+             return image
+         except Exception as e:
+             print(f"[Image Load Error] {e}")
+             raise ValueError(f"Failed to open image from base64 string: {e}")
+
+     def predict(self, request):
+         print(f"[Predict] Received request: {request}")
+
+         image_base64 = request.get("inputs", {}).get("image")
+         question = request.get("inputs", {}).get("question")
+         stream = request.get("inputs", {}).get("stream", False)
+
+         if not image_base64 or not question:
+             print("[Predict Error] Missing 'image' or 'question' in the request.")
+             return {"error": "Missing 'image' or 'question' in inputs."}
+
+         try:
+             image = self.load_image(image_base64)
+             msgs = [{"role": "user", "content": [image, question]}]
+
+             print(f"[Predict] Asking model with question: {question}")
+             print("[Predict] Starting chat inference...")
+
+             res = self.model.chat(
+                 image=None,
+                 msgs=msgs,
+                 tokenizer=self.tokenizer,
+                 sampling=True,
+                 stream=stream
+             )
+
+             if stream:
+                 # Wrap the loop in a nested generator: a bare `yield` here
+                 # would turn predict() itself into a generator, so the
+                 # non-stream `return` below would never reach the caller.
+                 def token_stream():
+                     for new_text in res:
+                         yield {"output": new_text}
+                 return token_stream()
+             else:
+                 generated_text = "".join(res)
+                 print("[Predict] Inference complete.")
+                 return {"output": generated_text}
+
+         except Exception as e:
+             print(f"[Predict Error] {e}")
+             return {"error": str(e)}
+
+     def __call__(self, data):
+         print("[__call__] Invoked handler with data.")
+         return self.predict(data)
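For debugging the handler in isolation, a minimal smoke test might look like the sketch below. It assumes a CUDA GPU is available and that the MiniCPM-o weights can be downloaded; the generated blank image is just a placeholder for a real page scan.

import base64
from io import BytesIO

from PIL import Image

from endpoint_handler import EndpointHandler

# Tiny in-memory JPEG as a stand-in for a real input image.
buf = BytesIO()
Image.new("RGB", (64, 64), "white").save(buf, format="JPEG")
b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

handler = EndpointHandler()  # loads the model; needs a GPU
result = handler.predict({
    "inputs": {"image": b64, "question": "Describe the image.", "stream": False}
})
print(result)  # expect {"output": "..."} on success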
main.py ADDED
@@ -0,0 +1,50 @@
+ from fastapi import FastAPI
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from pydantic import BaseModel
+ import types
+ import json
+
+ from endpoint_handler import EndpointHandler  # your handler file
+
+ app = FastAPI()
+
+ handler = None
+
+ @app.on_event("startup")
+ async def load_handler():
+     global handler
+     handler = EndpointHandler()
+
+ class PredictInput(BaseModel):
+     image: str  # base64-encoded image string
+     question: str
+     stream: bool = False
+
+ class PredictRequest(BaseModel):
+     inputs: PredictInput
+
+ @app.get("/")
+ async def root():
+     return {"message": "FastAPI app is running on Hugging Face"}
+
+ @app.post("/predict")
+ async def predict_endpoint(payload: PredictRequest):
+     print(f"[Request] Received question: {payload.inputs.question}")
+
+     data = {
+         "inputs": {
+             "image": payload.inputs.image,
+             "question": payload.inputs.question,
+             "stream": payload.inputs.stream
+         }
+     }
+
+     result = handler.predict(data)
+
+     if isinstance(result, types.GeneratorType):
+         def event_stream():
+             for chunk in result:
+                 yield f"data: {json.dumps(chunk)}\n\n"
+         return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+     return JSONResponse(content=result)
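The streaming path can be exercised in-process with FastAPI's TestClient, without deploying the Space. A sketch, assuming a local test image named page.jpg and a machine that can actually load the model (entering the client as a context manager runs the startup event):

import base64
import json

from fastapi.testclient import TestClient

from main import app

with TestClient(app) as client:  # startup event loads the handler
    with open("page.jpg", "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    payload = {"inputs": {"image": b64, "question": "What is in the image?", "stream": True}}
    with client.stream("POST", "/predict", json=payload) as resp:
        for line in resp.iter_lines():
            if line.startswith("data: "):
                print(json.loads(line[len("data: "):]).get("output", ""), end="")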
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ Pillow==10.1.0
+ torch==2.3.1
+ torchaudio==2.3.1
+ torchvision==0.18.1
+ transformers==4.44.2
+ librosa==0.9.0
+ soundfile==0.12.1
+ vector-quantize-pytorch==1.18.5
+ vocos==0.1.0
+ decord
+ moviepy
+ einops
+ accelerate
+ openbmb
+ fastapi
+ uvicorn[standard]
+ timm>=0.6.13
+ sentencepiece>=0.1.99
+ python-multipart