Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import streamlit as st
 import torch
-from transformers import
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers.cache_utils import DynamicCache
 import os
 from time import time
@@ -84,17 +84,11 @@ def calculate_cache_size(cache):
 @st.cache_resource
 def load_model_and_tokenizer(doc_text_count):
     model_name = "google/gemma-3-4b-it" # Configure quantization for 4-bit loading
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True, # Enable 4-bit quantization
-        bnb_4bit_compute_dtype=torch.float16, # Set computation precision
-        bnb_4bit_quant_type="nf4", # Use Normal Float 4 (NF4) quantization
-        bnb_4bit_use_double_quant=True, # Enable double quantization
-    )
+
     # Load the pre-trained model with quantization
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="auto", # Automatically allocate model to devices
-        quantization_config=quantization_config,
         model_max_length=1.3*round(doc_text_count * 0.3 + 1),
         token=HF_TOKEN
     )
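
For context, a minimal sketch of what load_model_and_tokenizer looks like after this commit: the truncated transformers import is completed, and the 4-bit BitsAndBytesConfig block is dropped, so the model now loads without quantization. The HF_TOKEN lookup, the tokenizer handling, and the fp16 dtype below are assumptions; they are not part of this diff.

# Sketch of the post-commit load path, under the assumptions above.
import os
import torch
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM

HF_TOKEN = os.environ.get("HF_TOKEN")  # assumption: token comes from the environment

@st.cache_resource
def load_model_and_tokenizer(doc_text_count):
    model_name = "google/gemma-3-4b-it"
    # model_max_length is consumed by tokenizers, so this sketch applies it
    # there; int() keeps the 1.3x length budget a valid integer length.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        model_max_length=int(1.3 * round(doc_text_count * 0.3 + 1)),
        token=HF_TOKEN,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",          # spread layers over available devices
        torch_dtype=torch.float16,  # assumption: fp16 now that 4-bit loading is gone
        token=HF_TOKEN,
    )
    return model, tokenizer

One note on the committed version: model_max_length is a tokenizer setting rather than a model-loading argument, and passing it to AutoModelForCausalLM.from_pretrained appears to leave it as an inert config attribute, which is why the sketch moves it onto the tokenizer.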