howtomakepplragequit committed
Commit 51a8151 · verified · 1 Parent(s): d445d1e

Upload 2 files

Files changed (2)
  1. Dockerfile +16 -8
  2. main.py +20 -18
Dockerfile CHANGED
@@ -1,13 +1,21 @@
- FROM python:3.10-slim

- RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

- RUN pip install --upgrade pip
- RUN pip install torch transformers fastapi uvicorn bitsandbytes accelerate

- WORKDIR /app
- COPY main.py .
- ENV HF_HOME=/app/hf_cache
- RUN mkdir -p /app/hf_cache

  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

+ FROM python:3.10

+ # Set working directory
+ WORKDIR /app

+ # Copy local files
+ COPY . .

+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Set Hugging Face cache directory to writable one
+ ENV HF_HOME=/data
+ RUN mkdir -p /data && chmod 777 /data

+ # Expose the port
+ EXPOSE 7860
+
+ # Run the FastAPI app with Uvicorn
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+
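The new image installs its dependencies from a requirements.txt that is not part of this commit; presumably it lists at least the packages the old image installed directly and that main.py needs (torch, transformers, fastapi, uvicorn, pydantic). Below is a minimal smoke-test sketch (not part of the commit, assuming it is run inside the built container) to check that HF_HOME=/data really redirects Hugging Face downloads to the writable /data directory:

import os

os.environ["HF_HOME"] = "/data"  # mirrors ENV HF_HOME=/data; must be set before importing transformers

from transformers import AutoTokenizer

# Pulling any artifact should now populate /data/hub instead of the default
# ~/.cache/huggingface location, which may not be writable in the container.
AutoTokenizer.from_pretrained("howtomakepplragequit/phi2-lora-instruct")
print(os.listdir("/data"))  # expect a "hub" directory after the download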
main.py CHANGED
@@ -1,25 +1,27 @@
- from fastapi import FastAPI, Request
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- import torch
  import os

- os.environ["HF_HOME"] = "/app/hf_cache"
-
- app = FastAPI()

  model_name = "howtomakepplragequit/phi2-lora-instruct"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-     device_map="auto"
- )
- pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

  @app.post("/generate")
- async def generate(request: Request):
-     data = await request.json()
-     prompt = data.get("prompt", "")
-     formatted = f"### Instruction:\n{prompt}\n\n### Response:\n"
-     result = pipe(formatted, max_new_tokens=200)[0]["generated_text"]
-     return {"response": result.split("### Response:")[-1].strip()}

  import os
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

+ # Force Hugging Face cache to a writable dir
+ os.environ["HF_HOME"] = "/data"

  model_name = "howtomakepplragequit/phi2-lora-instruct"
+
+ # Load tokenizer and model
  tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ # Create pipeline
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+ # FastAPI app setup
+ app = FastAPI()
+
+ class Prompt(BaseModel):
+     prompt: str

  @app.post("/generate")
+ def generate_text(data: Prompt):
+     output = generator(data.prompt, max_length=200, do_sample=True)[0]["generated_text"]
+     return {"response": output}
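For reference, a rough client-side sketch for exercising the new /generate endpoint (assumptions: the container is running and reachable at http://localhost:7860, and the requests package is available on the calling side). Note that the new handler passes max_length=200, which counts the prompt tokens as well, whereas the previous version used max_new_tokens=200.

import requests

# Hypothetical call against the endpoint defined in main.py above.
resp = requests.post(
    "http://localhost:7860/generate",
    json={"prompt": "Explain what a LoRA adapter is in one sentence."},
    timeout=300,  # CPU-only generation with a ~2.7B model can take a while
)
resp.raise_for_status()
print(resp.json()["response"])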