kouki321 committed
Commit 22c9862 · verified · 1 Parent(s): 175a15e

Update app.py

Files changed (1)
  1. app.py +20 -11
app.py CHANGED
@@ -74,20 +74,28 @@ def calculate_cache_size(cache):
     return total_memory /(1024*1024)
 
 @st.cache_resource
-def load_model_and_tokenizer(doc_text_count):
-    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        trust_remote_code=True,
-        model_max_length=1.3*round(doc_text_count * 0.3 + 1)
+def load_quantized_model_and_tokenizer():
+    model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Configure quantization for 4-bit loading
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,  # Enable 4-bit quantization
+        bnb_4bit_compute_dtype=torch.float16,  # Set computation precision
+        bnb_4bit_quant_type="nf4",  # Use Normal Float 4 (NF4) quantization
+        bnb_4bit_use_double_quant=True,  # Enable double quantization
     )
+    # Load the pre-trained model with quantization
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto",
-        trust_remote_code=True
+        device_map="auto",  # Automatically allocate model to devices
+        quantization_config=quantization_config,
+        token=hf_token,
+    )
+
+    # Load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name,
+        token=hf_token,
     )
-    return model, tokenizer
+    return tokenizer, model
 
 def clone_cache(cache):
     new_cache = DynamicCache()
@@ -106,7 +114,8 @@ def load_document_and_cache(file_path):
     model, tokenizer = load_model_and_tokenizer(doc_text_count)
     system_prompt = f"""
     <|system|>
-    Answer concisely and precisely. You are an assistant who provides concise factual answers.
+    You are a helpful assistant. Provide concise, factual answers based only on the provided context.
+    If the information is not available, respond with: "I'm sorry, I don't have enough information to answer that."
     <|user|>
     Context:
     {doc_text}
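
For context, a minimal call-site sketch of how the new loader might be used (not part of the commit). It assumes bitsandbytes and accelerate are installed alongside transformers, that BitsAndBytesConfig is imported in app.py, and that hf_token holds a valid Hugging Face access token. Note that the new function takes no arguments and returns (tokenizer, model), so the call model, tokenizer = load_model_and_tokenizer(doc_text_count) that appears as unchanged context in the second hunk would presumably need to be updated to match. The question and assistant tags below are placeholders, since the rest of the prompt template is not shown in this diff.

# Sketch only: assumes bitsandbytes/accelerate are installed, hf_token is defined,
# and load_quantized_model_and_tokenizer() is the function added in this commit.
tokenizer, model = load_quantized_model_and_tokenizer()

# Assemble a prompt in the same chat-style template app.py builds, then generate.
prompt = (
    "<|system|>\n"
    "You are a helpful assistant. Provide concise, factual answers based only on the provided context.\n"
    "<|user|>\n"
    "Context:\n"
    "...document text here...\n"          # placeholder for {doc_text}
    "Question: What is the document about?\n"  # hypothetical question
    "<|assistant|>\n"                      # assumed closing tag, not shown in the diff
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

With 4-bit NF4 weights and float16 compute, the 7B model should fit on a single consumer GPU, which appears to be the motivation for swapping the FP16 DeepSeek distill for a quantized Mistral-7B-Instruct here.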