Gatsby767 committed
Commit d86b11c · verified · 1 Parent(s): 58f0efd

Update app.py

Files changed (1): app.py (+9 -13)
app.py CHANGED
@@ -10,29 +10,27 @@ import app_math as app_math  # keeping your existing import
 HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-dtype = torch.float16 if device == "cuda" else torch.float32
-
+# Automatically map model across available devices (GPU/CPU)
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_ID,
-    token=HF_TOKEN,  # uses your HF token if needed
+    token=HF_TOKEN,
 )
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    torch_dtype=dtype,
+    device_map="auto",  # << key change
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
     low_cpu_mem_usage=True,
-    token=HF_TOKEN,  # uses your HF token if needed
+    token=HF_TOKEN,
 )
-model.to(device)
 
-# Ensure pad token is set for generation
+# Ensure pad token is set
 if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
 
 def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
-    # Build chat messages with system + history + latest user message
+    # Build chat messages
     messages = [{"role": "system", "content": system_message}]
     for u, a in history:
         if u:
@@ -47,7 +45,7 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
         add_generation_prompt=True,
         tokenize=True,
         return_tensors="pt",
-    ).to(device)
+    ).to(model.device)
 
     # Stream generation
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
@@ -63,7 +61,6 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
         "streamer": streamer,
     }
 
-    # Run generation in a background thread so we can yield tokens as they arrive
     thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
     thread.start()
 
@@ -74,8 +71,6 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
 
 
 # ---- Gradio UI ----
-# For information on how to customize the ChatInterface, peruse the gradio docs:
-# https://www.gradio.app/docs/chatinterface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -89,3 +84,4 @@ demo = gr.ChatInterface(
 if __name__ == "__main__":
     demo.launch()
 
+
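
For context, a minimal self-contained sketch of the loading pattern this commit switches to: device_map="auto" lets accelerate place the weights across available GPU/CPU memory, so the manual model.to(device) call goes away and inputs are moved to model.device instead. This is a sketch, not the full app: it assumes torch, transformers, and accelerate are installed, and the prompt text and max_new_tokens value are illustrative.

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",  # accelerate places the weights across devices automatically
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)

# No model.to(device): with device_map="auto", move the inputs to the model instead.
inputs = tokenizer("Hello!", return_tensors="pt").to(model.device)  # illustrative prompt
output = model.generate(**inputs, max_new_tokens=16)  # illustrative token budget
print(tokenizer.decode(output[0], skip_special_tokens=True))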