AmeyaKawthalkar committed on
Commit c4fd269 · verified · 1 Parent(s): 075c1af

Update app.py

Files changed (1)
  1. app.py +79 -54
app.py CHANGED
@@ -1,7 +1,7 @@
 # app.py
 # FastAPI backend for a Hugging Face Space (CPU tier)
 # • Only MedGemma-4B-IT, no Parakeet, no tool-calling
-# • Reads HF_TOKEN from Space secrets, uses /data/.cache as writable cache
+# • Reads HF_TOKEN from Space secrets, uses /tmp for writable cache
 # • /chat endpoint expects {"messages":[{"role":"user","content": "..."}]}
 
 import os, pathlib, uuid
@@ -17,16 +17,16 @@ from transformers import pipeline
 # ------------------------------------------------------------
 # 1. Configure cache + authentication BEFORE loading models
 # ------------------------------------------------------------
-HOME_DIR = pathlib.Path.home()
-CACHE_DIR = HOME_DIR / ".cache" / "huggingface"
-CACHE_DIR.mkdir(parents=True, exist_ok=True)  # ← always writable
 
-os.environ["HF_HOME"] = str(CACHE_DIR)
+# Use /tmp for cache in HF Spaces (always writable)
+CACHE_DIR = pathlib.Path("/tmp/hf_cache")
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+os.environ["HF_HOME"] = str(CACHE_DIR)
 os.environ["TRANSFORMERS_CACHE"] = str(CACHE_DIR)
 
 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")  # fine-grained read token
 
-
 # ------------------------------------------------------------
 # 2. Simple Pydantic request model
 # ------------------------------------------------------------
@@ -47,17 +47,21 @@ medgemma_pipe = None
 def get_medgemma():
     global medgemma_pipe
     if medgemma_pipe is None:
-        print("Loading MedGemma-4B-IT …")
-        medgemma_pipe = pipeline(
-            "text-generation",
-            model="google/medgemma-4b-it",
-            torch_dtype=DTYPE,
-            device=0 if torch.cuda.is_available() else -1,
-            token=HF_TOKEN,  # authenticate to gated repo :contentReference[oaicite:5]{index=5}
-            cache_dir=CACHE_DIR,
-            trust_remote_code=True,
-        )
-        print("✅ MedGemma loaded")
+        try:
+            print("Loading MedGemma-4B-IT …")
+            medgemma_pipe = pipeline(
+                "text-generation",
+                model="google/medgemma-4b-it",
+                torch_dtype=DTYPE,
+                device=0 if torch.cuda.is_available() else -1,
+                token=HF_TOKEN,  # authenticate to gated repo
+                cache_dir=CACHE_DIR,
+                trust_remote_code=True,
+            )
+            print("✅ MedGemma loaded successfully")
+        except Exception as e:
+            print(f"❌ Error loading MedGemma: {e}")
+            medgemma_pipe = None
     return medgemma_pipe
 
 # ------------------------------------------------------------
@@ -87,67 +91,88 @@ SYSTEM_PROMPT = (
 # ------------------------------------------------------------
 @app.post("/chat")
 async def chat(request: Request):
-    body = await request.json()
-    payload = ChatCompletionRequest(**body)
-    user_msg = payload.messages[-1].content or ""
-    prompt = f"{SYSTEM_PROMPT}\n\n{user_msg}\n\nRadiology Report:\n"
+    try:
+        body = await request.json()
+        payload = ChatCompletionRequest(**body)
+        user_msg = payload.messages[-1].content or ""
+        prompt = f"{SYSTEM_PROMPT}\n\n{user_msg}\n\nRadiology Report:\n"
+
+        pipe = get_medgemma()
+        if pipe is None:
+            return JSONResponse(
+                {
+                    "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
+                    "choices": [{
+                        "message": {
+                            "role": "assistant",
+                            "content": "MedGemma model is unavailable. "
+                                       "Check your gated-model access and HF_TOKEN.",
+                        }
+                    }],
+                },
+                status_code=503,
+            )
+
+        try:
+            result = pipe(
+                prompt,
+                max_new_tokens=256,
+                do_sample=True,
+                temperature=0.7,
+                pad_token_id=pipe.tokenizer.eos_token_id,
+                return_full_text=False,
+            )
+            assistant_text = result[0]["generated_text"].strip() if result else "No response."
+        except Exception as e:
+            print("Generation error:", e)
+            assistant_text = "Error generating response. Please retry later."
 
-    pipe = get_medgemma()
-    if pipe is None:
         return JSONResponse(
             {
                 "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                 "choices": [{
                     "message": {
                         "role": "assistant",
-                        "content": "MedGemma model is unavailable. "
-                                   "Check your gated-model access and HF_TOKEN.",
+                        "content": assistant_text,
                     }
-                }],
-            },
-            status_code=503,
+                }]
+            }
         )
-
-    try:
-        result = pipe(
-            prompt,
-            max_new_tokens=256,
-            do_sample=True,
-            temperature=0.7,
-            pad_token_id=pipe.tokenizer.eos_token_id,
-            return_full_text=False,
-        )
-        assistant_text = result[0]["generated_text"].strip() if result else "No response."
     except Exception as e:
-        print("Generation error:", e)
-        assistant_text = "Error generating response. Please retry later."
-
-    return JSONResponse(
-        {
-            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
-            "choices": [{
-                "message": {
-                    "role": "assistant",
-                    "content": assistant_text,
-                }
-            }]
-        }
-    )
+        print(f"Chat endpoint error: {e}")
+        return JSONResponse(
+            {
+                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
+                "choices": [{
+                    "message": {
+                        "role": "assistant",
+                        "content": "Server error. Please try again later.",
+                    }
+                }]
+            },
+            status_code=500
+        )
 
 # ------------------------------------------------------------
 # 7. Health endpoint
 # ------------------------------------------------------------
+@app.get("/")
+async def root():
+    return {"status": "healthy", "message": "MedGemma API is running"}
+
 @app.get("/health")
 async def health():
     return {
         "status": "ok",
         "model_loaded": medgemma_pipe is not None,
         "hf_token_present": bool(HF_TOKEN),
+        "cache_dir": str(CACHE_DIR),
     }
 
 # ------------------------------------------------------------
-# 8. For local dev (wont run inside Space runtime)
+# 8. For local dev (won't run inside Space runtime)
 # ------------------------------------------------------------
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
+
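
For reference, a minimal client-side sketch of how the updated endpoints could be exercised once the app is running locally (the commit itself only changes app.py). The base URL below is a placeholder assuming the local-dev uvicorn server on port 7860, and the example assumes the requests package is available; the payload shape and response fields follow the handlers shown in the diff above.

# client_example.py — hypothetical usage sketch, not part of the commit
import requests

BASE_URL = "http://localhost:7860"  # placeholder; a deployed Space would use its own URL

# /health (and the new "/" route) report whether MedGemma has been loaded
print(requests.get(f"{BASE_URL}/health", timeout=30).json())

# /chat expects {"messages": [{"role": "user", "content": "..."}]}
payload = {
    "messages": [
        {"role": "user", "content": "Draft a report for a frontal chest X-ray with a right lower lobe opacity."}
    ]
}
resp = requests.post(f"{BASE_URL}/chat", json=payload, timeout=300)

# The endpoint returns an OpenAI-style completion object (HTTP 503 if the
# model could not be loaded); the report text is in choices[0].message.content.
print(resp.status_code)
print(resp.json()["choices"][0]["message"]["content"])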