Update app.py
app.py CHANGED
@@ -1,12 +1,10 @@
 import streamlit as st
 import torch
-from transformers import
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers.cache_utils import DynamicCache
 import os
 from time import time
 import pandas as pd
-
-import os
 from huggingface_hub import login
 
 HF_TOKEN = os.getenv("NEX_MODEL") # Updated key name for clarity
@@ -14,7 +12,8 @@ HF_TOKEN = os.getenv("NEX_MODEL") # Updated key name for clarity
 if not HF_TOKEN:
     raise ValueError("Hugging Face token not found. Please set the 'NEX_MODEL' environment variable.")
 
-
+
+
 # ==============================
 # Helper: Human-readable bytes
 def sizeof_fmt(num, suffix="B"):
@@ -82,27 +81,37 @@ def calculate_cache_size(cache):
     return total_memory /(1024*1024)
 
 @st.cache_resource
-def load_model_and_tokenizer(
-    model_name = "
-
-
-
+def load_model_and_tokenizer():
+    model_name = "GeneZC/MiniChat-1.5-3B"
+
+
+    tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto",
         trust_remote_code=True
         ,token=HF_TOKEN
     )
-
-    # Load the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
+    model = AutoModelForCausalLM.from_pretrained(
         model_name,
-
-
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto",
+        trust_remote_code=True
         ,token=HF_TOKEN
     )
-    return
-
+    return model, tokenizer
+def calculate_cache_size(cache):
+    """
+    Calculate the total memory used by the key-value cache (past_key_values) in megabytes.
+    Args:
+        cache: The past_key_values object (usually a tuple of (key, value) pairs per layer).
+    Returns:
+        Total memory in megabytes.
+    """
+    total_memory = 0
+    for layer_cache in cache:
+        key_tensor, value_tensor = layer_cache
+        total_memory += key_tensor.element_size() * key_tensor.nelement()
+        total_memory += value_tensor.element_size() * value_tensor.nelement()
+    return total_memory / (1024 * 1024) # Convert to MB
 def clone_cache(cache):
     new_cache = DynamicCache()
     for key, value in zip(cache.key_cache, cache.value_cache):
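The calculate_cache_size helper added above sums element_size() * nelement() over every per-layer key and value tensor and converts the total from bytes to megabytes. A minimal sketch of the same arithmetic on synthetic tensors (the layer count, shapes and dtype below are invented for illustration, not taken from app.py):

import torch

# Hypothetical KV cache: 2 layers of (key, value), each (batch, heads, seq_len, head_dim) in float16
fake_cache = [
    (torch.zeros(1, 32, 1024, 128, dtype=torch.float16),
     torch.zeros(1, 32, 1024, 128, dtype=torch.float16))
    for _ in range(2)
]

total_bytes = 0
for key_tensor, value_tensor in fake_cache:
    total_bytes += key_tensor.element_size() * key_tensor.nelement()
    total_bytes += value_tensor.element_size() * value_tensor.nelement()

# 2 layers * 2 tensors * (1*32*1024*128) elements * 2 bytes = 32.00 MB
print(f"{total_bytes / (1024 * 1024):.2f} MB")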
@@ -117,7 +126,16 @@ def load_document_and_cache(file_path):
     with open(file_path, 'r') as file:
         doc_text = file.read()
     doc_text_count = len(doc_text)
-
+    max_length = int(1.3 * (doc_text_count * 0.3 + 1))
+
+    # Cap the value at 16824
+    if max_length > 16824:
+        max_length = 16824
+    print(f" model_max_length set to: {max_length}")
+
+    model, tokenizer = load_model_and_tokenizer()
+    tokenizer.model_max_length=max_length
+
     system_prompt = f"""
     <|system|>
     You are a helpful assistant. Provide concise, factual answers based only on the provided context.
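The max_length heuristic added above amounts to roughly 0.3 tokens per document character plus a 30% safety margin, capped at 16824 and then written into tokenizer.model_max_length. A standalone sketch of the same formula (the character counts are only examples):

def estimate_max_length(doc_text_count: int, cap: int = 16824) -> int:
    # ~0.3 tokens per character, padded by 30%, then capped
    max_length = int(1.3 * (doc_text_count * 0.3 + 1))
    return min(max_length, cap)

print(estimate_max_length(10_000))   # 3901
print(estimate_max_length(100_000))  # 16824 (hits the cap)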
@@ -194,11 +212,10 @@ if uploaded_file:
     print(f"π Document Preview Display Time: {t_end3 - t_start3:.2f} s")
     t_start4 = time()
     # PART 4: Show Basic Info
-
-    #cache_size = os.path.getsize("temp_cache.pth") / 1024 if os.path.exists("temp_cache.pth") else "N/A"
+    s_cache=calculate_cache_size(cache)
     t_end4 = time()
     log.append(f"π doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s")
-    print(f"π doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s")
+    print(f"π doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s||||||| size of the cache : {s_cache} MB")
     #st.info(
     # f"Document Chars: {len(doc_text)} | Size: {doc_size_kb:.2f} KB | "
     # f"Cache Size: {cache_size if cache_size == 'N/A' else f'{cache_size:.2f} KB'}"
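calculate_cache_size(cache) unpacks each element of cache into a (key, value) pair, which is the legacy tuple-of-tuples past_key_values layout. If the cache measured here is a DynamicCache, as the clone_cache helper suggests, one way to make that assumption explicit is to convert it before measuring; this is a hedged sketch, not something the diff itself does:

def cache_size_mb(cache):
    # DynamicCache keeps tensors in its key_cache / value_cache lists;
    # to_legacy_cache() re-exposes them as ((key, value), ...) per layer,
    # which is exactly what calculate_cache_size iterates over.
    if hasattr(cache, "to_legacy_cache"):
        cache = cache.to_legacy_cache()
    return calculate_cache_size(cache)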
@@ -222,10 +239,10 @@ if uploaded_file:
 
     # PART 4.2: Tokenize Prompt
     t_start6 = time()
-
+
     full_prompt = f"""
     <|user|>
-    Question: {query}
+    Question: Please provide a clear and concise answer to the question .{query}
    <|assistant|>
     """.strip()
     input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
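The user turn is assembled with literal <|user|> / <|assistant|> markers in an f-string. If the MiniChat tokenizer were to ship a chat template, tokenizer.apply_chat_template could build an equivalent turn; the sketch below is an optional alternative under that assumption, not what app.py does, and query is a hypothetical value:

model, tokenizer = load_model_and_tokenizer()          # helper from the diff above
query = "What does the document say about pricing?"    # hypothetical query
messages = [{"role": "user",
             "content": f"Question: Please provide a clear and concise answer to the question .{query}"}]
# Only valid if the tokenizer actually defines a chat template; otherwise keep the manual f-string.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids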
@@ -249,7 +266,7 @@ if uploaded_file:
 
     st.success("Answer:")
     st.write(response)
-
+    print(f"***************************************************************************************")
     # Final Info Display
     st.info(
     # f"Document Chars: {len(doc_text)} | Size: {doc_size_kb:.2f} KB | "
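One note on the loader change as a whole: load_model_and_tokenizer() stays behind @st.cache_resource and is now called from inside load_document_and_cache, so the MiniChat model and tokenizer are built once per Streamlit process and reused on every rerun; only the first document upload pays the load cost. A minimal sketch of that caching pattern (the function and return value are illustrative, not from app.py):

import streamlit as st

@st.cache_resource  # one shared instance per process, reused across reruns and sessions
def get_heavy_resource():
    # stand-in for loading the 3B model; the body runs only on the first call
    return {"loaded": True}

resource = get_heavy_resource()  # later calls return the cached object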