Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -89,12 +89,14 @@ SYSTEM_PROMPT = load_system_prompt(MODEL_ID, "SYSTEM_PROMPT.txt")
 # If you prefer a hardcoded system prompt, you can use:
 # SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, and ends with an ASCII cat."
 
-# Set the device explicitly
+# Set the device explicitly
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize the Mistral LLM via vllm.
 # Note: Running this model on GPU may require very high VRAM.
-
+# The 'enforce_eager=True' parameter disables asynchronous output,
+# which avoids the NotImplementedError on platforms that do not support it.
+llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)
 
 # -----------------------------------------------------------------------------
 # Main Generation Function
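
For context, here is a minimal, self-contained sketch of how the patched initialization might be exercised with vllm. The MODEL_ID value, sampling parameters, and chat messages below are illustrative assumptions, not values taken from the Space's app.py:

import torch
from vllm import LLM, SamplingParams

# Placeholder model id; the Space defines its own MODEL_ID and system prompt.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Per the commit's comment, enforce_eager=True disables asynchronous output
# processing, avoiding the NotImplementedError on unsupported platforms.
llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)

# Illustrative sampling settings and a single chat turn.
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)
messages = [
    {"role": "system", "content": "You answer straight to the point."},
    {"role": "user", "content": "What does enforce_eager change at runtime?"},
]

outputs = llm.chat(messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)

Since enforce_eager=True also skips CUDA graph capture in favor of eager-mode PyTorch, the change trades some throughput for compatibility, which is usually an acceptable trade-off for a Zero-GPU demo Space.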