Update app.py
app.py
CHANGED
@@ -197,6 +197,9 @@ with st.sidebar:
         model = load_xception_model()
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+        # Explicitly move model to device
+        model = model.to(device)
+
         if model is not None:
             st.session_state.xception_model = model
             st.session_state.device = device
@@ -253,14 +256,34 @@ with st.sidebar:
     if not st.session_state.llm_model_loaded:
         if st.button("📥 Load Vision LLM", type="primary"):
             # Load LLM model
-            model, tokenizer = load_llm_model()
-            if model is not None and tokenizer is not None:
-                st.session_state.llm_model = model
-                st.session_state.tokenizer = tokenizer
-                st.session_state.llm_model_loaded = True
-                st.success("✅ Vision LLM loaded!")
-            else:
-                st.error("❌ Failed to load Vision LLM.")
+            try:
+                with st.spinner("Loading LLM vision model... This may take a few minutes. Please be patient..."):
+                    # Check for GPU
+                    has_gpu = check_gpu()
+
+                    # Load base model and tokenizer using Unsloth
+                    base_model_id = "unsloth/llama-3.2-11b-vision-instruct"
+                    model, tokenizer = FastVisionModel.from_pretrained(
+                        base_model_id,
+                        load_in_4bit=True,
+                    )
+
+                    # Load the adapter
+                    adapter_id = "saakshigupta/deepfake-explainer-new"
+                    model = PeftModel.from_pretrained(model, adapter_id)
+
+                    # Set to inference mode
+                    FastVisionModel.for_inference(model)
+
+                    if model is not None and tokenizer is not None:
+                        st.session_state.llm_model = model
+                        st.session_state.tokenizer = tokenizer
+                        st.session_state.llm_model_loaded = True
+                        st.success("✅ Vision LLM loaded!")
+                    else:
+                        st.error("❌ Failed to load Vision LLM.")
+            except Exception as e:
+                st.error(f"Error loading LLM model: {str(e)}")
     else:
         st.success("✅ Vision LLM loaded")
 
@@ -519,33 +542,6 @@ def fix_cross_attention_mask(inputs):
     inputs['cross_attention_mask'] = new_mask
     return inputs
 
-# Load model function
-@st.cache_resource
-def load_llm_model():
-    with st.spinner("Loading LLM vision model... This may take a few minutes. Please be patient..."):
-        try:
-            # Check for GPU
-            has_gpu = check_gpu()
-
-            # Load base model and tokenizer using Unsloth
-            base_model_id = "unsloth/llama-3.2-11b-vision-instruct"
-            model, tokenizer = FastVisionModel.from_pretrained(
-                base_model_id,
-                load_in_4bit=True,
-            )
-
-            # Load the adapter
-            adapter_id = "saakshigupta/deepfake-explainer-new"
-            model = PeftModel.from_pretrained(model, adapter_id)
-
-            # Set to inference mode
-            FastVisionModel.for_inference(model)
-
-            return model, tokenizer
-        except Exception as e:
-            st.error(f"Error loading model: {str(e)}")
-            return None, None
-
 # Analyze image function
 def analyze_image_with_llm(image, gradcam_overlay, face_box, pred_label, confidence, question, model, tokenizer, temperature=0.7, max_tokens=500, custom_instruction=""):
     # Create a prompt that includes GradCAM information
@@ -876,7 +872,8 @@ def main():
         device = st.session_state.device
         model = st.session_state.xception_model
 
-        # Ensure model is in eval mode
+        # Ensure model is in eval mode and on the correct device
+        model = model.to(device)
         model.eval()
 
         # Move tensor to device
@@ -918,7 +915,7 @@ def main():
         st.subheader("GradCAM Visualization")
         try:
             cam, overlay, comparison, detected_face_box = process_image_with_xception_gradcam(
-                image, model, device, pred_class
+                image, model.to(device), device, pred_class
             )
 
             if comparison:
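The change repeated across the hunks above is one device-placement pattern: compute a `torch.device`, move the model onto it with `.to(device)`, and keep inputs on the same device. A minimal self-contained sketch of that pattern (using a stand-in module, since `load_xception_model` is defined elsewhere in the app):

```python
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stand-in for the app's load_xception_model(); any nn.Module behaves the same
model = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(8, 2),
)
model = model.to(device)  # moves all parameters and buffers to `device`
model.eval()              # inference mode: disables dropout, fixes batch-norm stats

# The input must live on the same device, or PyTorch raises
# "Expected all tensors to be on the same device"
x = torch.randn(1, 3, 299, 299, device=device)
with torch.no_grad():
    logits = model(x)
```

Calling `.to(device)` on a model that is already on that device is a cheap no-op (`Module.to` returns the module itself), which is why the GradCAM call site can pass `model.to(device)` defensively.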
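The LLM loading that previously lived in the cached `load_llm_model()` helper is now performed inline in the button handler. Outside Streamlit, the same load path can be exercised as a standalone script; the sketch below only reuses calls and IDs that appear in the diff (4-bit loading requires a CUDA GPU with bitsandbytes available):

```python
from unsloth import FastVisionModel
from peft import PeftModel

# Base vision-language model, quantized to 4-bit at load time
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/llama-3.2-11b-vision-instruct",
    load_in_4bit=True,
)

# Apply the fine-tuned LoRA adapter on top of the base weights
model = PeftModel.from_pretrained(model, "saakshigupta/deepfake-explainer-new")

# Switch Unsloth into its optimized inference mode
FastVisionModel.for_inference(model)
```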
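Dropping `@st.cache_resource` in favor of `st.session_state` also changes the caching scope: a cached resource is shared across all sessions and reruns, while session state is per browser session and survives only reruns within it. The guard pattern the new handler uses, reduced to a runnable skeleton (`load_model` here is a hypothetical stand-in for the Unsloth/PEFT code above):

```python
import streamlit as st

def load_model():
    # Hypothetical stand-in for the real Unsloth/PEFT loading code
    return "model", "tokenizer"

if "llm_model_loaded" not in st.session_state:
    st.session_state.llm_model_loaded = False

if not st.session_state.llm_model_loaded:
    if st.button("📥 Load Vision LLM", type="primary"):
        try:
            with st.spinner("Loading LLM vision model..."):
                model, tokenizer = load_model()
                st.session_state.llm_model = model
                st.session_state.tokenizer = tokenizer
                st.session_state.llm_model_loaded = True
                st.success("✅ Vision LLM loaded!")
        except Exception as e:
            st.error(f"Error loading LLM model: {str(e)}")
else:
    st.success("✅ Vision LLM loaded")
```

Because the try/except now wraps the handler itself rather than living inside a cached function, a failed load surfaces in the UI on the rerun where the button was clicked, and the user can simply click the button again.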
|