kouki321 committed
Commit 35c42fb · verified · 1 Parent(s): 441a0b0

Update app.py

Files changed (1):
  app.py: +2 -8
app.py CHANGED
@@ -1,6 +1,6 @@
 import streamlit as st
 import torch
-from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers.cache_utils import DynamicCache
 import os
 from time import time
@@ -84,17 +84,11 @@ def calculate_cache_size(cache):
 @st.cache_resource
 def load_model_and_tokenizer(doc_text_count):
     model_name = "google/gemma-3-4b-it"  # Configure quantization for 4-bit loading
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,  # Enable 4-bit quantization
-        bnb_4bit_compute_dtype=torch.float16,  # Set computation precision
-        bnb_4bit_quant_type="nf4",  # Use Normal Float 4 (NF4) quantization
-        bnb_4bit_use_double_quant=True,  # Enable double quantization
-    )
+
     # Load the pre-trained model with quantization
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="auto",  # Automatically allocate model to devices
-        quantization_config=quantization_config,
         model_max_length=1.3*round(doc_text_count * 0.3 + 1),
         token=HF_TOKEN
     )
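
For context, a minimal sketch of what the model-loading path looks like after this commit. Everything outside the lines shown in the diff is an assumption: HF_TOKEN is read from the environment here, and the committed call's model_max_length kwarg (normally a tokenizer setting, tokenizer.model_max_length) plus the tokenizer loading and the function's return value are only noted in comments rather than reproduced.

import os
import streamlit as st
from transformers import AutoModelForCausalLM

HF_TOKEN = os.environ.get("HF_TOKEN")  # assumed; defined elsewhere in app.py

@st.cache_resource
def load_model_and_tokenizer(doc_text_count):
    model_name = "google/gemma-3-4b-it"
    # The 4-bit BitsAndBytesConfig removed by this commit is no longer passed,
    # so the model now loads in its default (unquantized) precision.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # automatically place layers on available devices
        token=HF_TOKEN,
    )
    # The committed call also passes model_max_length=1.3*round(doc_text_count * 0.3 + 1);
    # that is normally a tokenizer attribute rather than a model kwarg, so it is
    # omitted from this runnable sketch.
    # ... tokenizer loading and the function's return value are not shown in this diff.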