kouki321 committed on
Commit 88d5af8 · verified · 1 Parent(s): 9dfced6

Update app.py

Files changed (1)
  1. app.py +42 -25
app.py CHANGED
@@ -1,12 +1,10 @@
 import streamlit as st
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers.cache_utils import DynamicCache
 import os
 from time import time
 import pandas as pd
-
-import os
 from huggingface_hub import login
 
 HF_TOKEN = os.getenv("NEX_MODEL") # Updated key name for clarity
@@ -14,7 +12,8 @@ HF_TOKEN = os.getenv("NEX_MODEL") # Updated key name for clarity
 if not HF_TOKEN:
     raise ValueError("Hugging Face token not found. Please set the 'NEX_MODEL' environment variable.")
 
-login(token=HF_TOKEN)
+
+
 # ==============================
 # Helper: Human-readable bytes
 def sizeof_fmt(num, suffix="B"):
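Note: the global login(token=HF_TOKEN) call is dropped; authentication instead relies on the token being passed to each from_pretrained call, as the hunks below show. A minimal sketch of that pattern, assuming a placeholder model id (not the one used in this commit):

# Sketch only: per-call authentication instead of a global huggingface_hub.login().
# "org/some-model" is a hypothetical placeholder model id.
import os
from transformers import AutoTokenizer

HF_TOKEN = os.getenv("NEX_MODEL")
tokenizer = AutoTokenizer.from_pretrained(
    "org/some-model",
    token=HF_TOKEN,  # forwarded to the Hub for this call, no prior login() needed
)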
@@ -82,27 +81,37 @@ def calculate_cache_size(cache):
     return total_memory /(1024*1024)
 
 @st.cache_resource
-def load_model_and_tokenizer(doc_text_count):
-    model_name = "google/gemma-3-4b-it" # Configure quantization for 4-bit loading
-
-    # Load the pre-trained model with quantization
-    model = AutoModelForCausalLM.from_pretrained(
+def load_model_and_tokenizer():
+    model_name = "GeneZC/MiniChat-1.5-3B"
+
+
+    tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto",
         trust_remote_code=True
         ,token=HF_TOKEN
     )
-
-    # Load the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
+    model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        trust_remote_code=True,
-        model_max_length=1.3*round(doc_text_count * 0.3 + 1)
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto",
+        trust_remote_code=True
         ,token=HF_TOKEN
     )
-    return tokenizer, model
-
+    return model, tokenizer
+def calculate_cache_size(cache):
+    """
+    Calculate the total memory used by the key-value cache (past_key_values) in megabytes.
+    Args:
+        cache: The past_key_values object (usually a tuple of (key, value) pairs per layer).
+    Returns:
+        Total memory in megabytes.
+    """
+    total_memory = 0
+    for layer_cache in cache:
+        key_tensor, value_tensor = layer_cache
+        total_memory += key_tensor.element_size() * key_tensor.nelement()
+        total_memory += value_tensor.element_size() * value_tensor.nelement()
+    return total_memory / (1024 * 1024) # Convert to MB
 def clone_cache(cache):
     new_cache = DynamicCache()
     for key, value in zip(cache.key_cache, cache.value_cache):
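Note: load_model_and_tokenizer() no longer takes doc_text_count, so @st.cache_resource can keep reusing one model/tokenizer instance instead of reloading whenever the document length changes, and the added calculate_cache_size() simply sums the bytes of every key/value tensor. A minimal, self-contained sketch of the same arithmetic, with made-up tensor shapes:

# Illustrative only: how a (key, value)-per-layer cache adds up to a MB figure.
import torch

def cache_size_mb(cache):
    total = 0
    for key_tensor, value_tensor in cache:
        total += key_tensor.element_size() * key_tensor.nelement()
        total += value_tensor.element_size() * value_tensor.nelement()
    return total / (1024 * 1024)

# Hypothetical shapes: 2 layers, batch 1, 8 heads, 512 cached tokens, head_dim 64, fp16.
dummy = [(torch.zeros(1, 8, 512, 64, dtype=torch.float16),
          torch.zeros(1, 8, 512, 64, dtype=torch.float16)) for _ in range(2)]
print(f"{cache_size_mb(dummy):.2f} MB")  # 4 tensors * 0.5 MB each -> 2.00 MB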
@@ -117,7 +126,16 @@ def load_document_and_cache(file_path):
     with open(file_path, 'r') as file:
         doc_text = file.read()
     doc_text_count = len(doc_text)
-    model, tokenizer = load_model_and_tokenizer(doc_text_count)
+    max_length = int(1.3 * (doc_text_count * 0.3 + 1))
+
+    # Cap the value at 16824
+    if max_length > 16824:
+        max_length = 16824
+    print(f" model_max_length set to: {max_length}")
+
+    model, tokenizer = load_model_and_tokenizer()
+    tokenizer.model_max_length=max_length
+
     system_prompt = f"""
 <|system|>
 You are a helpful assistant. Provide concise, factual answers based only on the provided context.
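Note: the tokenizer limit is now derived from the document length (roughly 0.3 tokens per character plus 30% headroom), capped at 16824, and assigned to tokenizer.model_max_length after loading. A small worked example of that heuristic with arbitrary input sizes:

# Worked example of the max_length heuristic added in this hunk.
def estimate_max_length(doc_text_count: int, cap: int = 16824) -> int:
    max_length = int(1.3 * (doc_text_count * 0.3 + 1))
    return min(max_length, cap)

print(estimate_max_length(10_000))   # 3901  (10,000 chars * 0.3 + 1, then * 1.3)
print(estimate_max_length(200_000))  # 16824 (78,001 before the cap)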
@@ -194,11 +212,10 @@ if uploaded_file:
     print(f"👀 Document Preview Display Time: {t_end3 - t_start3:.2f} s")
     t_start4 = time()
     # PART 4: Show Basic Info
-    #doc_size_kb = os.path.getsize(temp_file_path) / 1024
-    #cache_size = os.path.getsize("temp_cache.pth") / 1024 if os.path.exists("temp_cache.pth") else "N/A"
+    s_cache=calculate_cache_size(cache)
     t_end4 = time()
     log.append(f"👀 doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s")
-    print(f"👀 doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s")
+    print(f"👀 doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s||||||| size of the cache : {s_cache} MB")
     #st.info(
     #    f"Document Chars: {len(doc_text)} | Size: {doc_size_kb:.2f} KB | "
     #    f"Cache Size: {cache_size if cache_size == 'N/A' else f'{cache_size:.2f} KB'}"
@@ -222,10 +239,10 @@ if uploaded_file:
 
     # PART 4.2: Tokenize Prompt
     t_start6 = time()
-    model, tokenizer = load_model_and_tokenizer(doc_text_count)
+
     full_prompt = f"""
 <|user|>
-Question: {query}
+Question: Please provide a clear and concise answer to the question .{query}
 <|assistant|>
 """.strip()
     input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
@@ -249,7 +266,7 @@ if uploaded_file:
 
     st.success("Answer:")
     st.write(response)
-
+    print(f"***************************************************************************************")
     # Final Info Display
     st.info(
     #    f"Document Chars: {len(doc_text)} | Size: {doc_size_kb:.2f} KB | "