Spaces:

s2049
/

LetterboxdRecommender

Sleeping

App Files Files Community

s2049 committed on May 10

Commit

afb6dea

verified ·

1 Parent(s): b90f766

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -113

app.py CHANGED Viewed

@@ -4,36 +4,34 @@ from bs4 import BeautifulSoup
 import os
 import re
 import random
-from dotenv import load_dotenv
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import torch
 import gradio as gr
 import time
-# Opt-in to future pandas behavior to potentially silence the downcasting warning
-# pd.set_option('future.no_silent_downcasting', True) # You can uncomment this if you wish
 # --- Configuration ---
-load_dotenv()
-TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "442a13f1865d8936f95aa20737e6f6f5")
-HF_TOKEN = os.environ.get("HF_TOKEN") # CRUCIAL for gated models
-# CORRECTED MODEL NAME
-MODEL_NAME = "ALLaM-AI/ALLaM-7B-Instruct-preview"
 BASE_TMDB_URL = "https://api.themoviedb.org/3"
 POSTER_BASE_URL = "https://image.tmdb.org/t/p/w500"
 NUM_RECOMMENDATIONS_TO_DISPLAY = 5
 MIN_RATING_FOR_SEED = 3.5
-MIN_VOTE_COUNT_TMDB = 100
-# --- Global Variables ---
 df_profile_global = None
 df_watchlist_global = None
 df_reviews_global = None
 df_diary_global = None
 df_ratings_global = None
-df_watched_global = None
 uri_to_movie_map_global = {}
 all_watched_titles_global = set()
@@ -48,7 +46,7 @@ llm_tokenizer = None
 def clean_html(raw_html):
     if pd.isna(raw_html) or raw_html is None: return ""
     text = str(raw_html)
-    text = re.sub(r'<br\s*/?>', '\n', text)
     soup = BeautifulSoup(text, "html.parser")
     return soup.get_text(separator=" ", strip=True)
@@ -67,7 +65,9 @@ def get_movie_uri_map(dfs_dict):
                             year = int(row['Year'])
                             uri_map[uri] = (str(row['Name']), year)
                             processed_uris.add(uri)
-                        except ValueError: pass
     return uri_map
 def load_all_data():
@@ -76,15 +76,17 @@ def load_all_data():
     global watchlist_titles_global, favorite_film_details_global, seed_movies_global
     try:
         df_profile_global = pd.read_csv("profile.csv")
         df_watchlist_global = pd.read_csv("watchlist.csv")
         df_reviews_global = pd.read_csv("reviews.csv")
         df_diary_global = pd.read_csv("diary.csv")
         df_ratings_global = pd.read_csv("ratings.csv")
-        _df_watched_log = pd.read_csv("watched.csv")
     except FileNotFoundError as e:
-        print(f"ERROR: CSV file not found: {e}.")
-        return False
     dfs_for_uri_map = {
         "reviews.csv": df_reviews_global, "diary.csv": df_diary_global,
@@ -114,17 +116,14 @@ def load_all_data():
     consolidated.drop(columns=['Rating_simple'], inplace=True)
     watched_log_subset = _df_watched_log[['Letterboxd URI', 'Name', 'Year']].copy()
-    watched_log_subset['from_watched_log'] = True # This column is an object/boolean dtype
     consolidated = pd.merge(consolidated, watched_log_subset, on=['Letterboxd URI', 'Name', 'Year'], how='outer')
-    # Address the FutureWarning directly or use pd.set_option
-    # This ensures 'from_watched_log' becomes boolean after fillna
     consolidated['from_watched_log'] = consolidated['from_watched_log'].fillna(False).astype(bool)
     consolidated['Review Text'] = consolidated['Review Text'].fillna('').apply(clean_html)
     consolidated['Year'] = pd.to_numeric(consolidated['Year'], errors='coerce').astype('Int64')
-    consolidated.dropna(subset=['Name', 'Year'], inplace=True)
     consolidated.drop_duplicates(subset=['Name', 'Year'], keep='first', inplace=True)
     df_watched_global = consolidated
@@ -142,7 +141,7 @@ def load_all_data():
                 except ValueError: pass
     favorite_film_details_global = []
-    if df_profile_global is not None and 'Favorite Films' in df_profile_global.columns:
         fav_uris_str = df_profile_global.iloc[0]['Favorite Films']
         if pd.notna(fav_uris_str):
             fav_uris = [uri.strip() for uri in fav_uris_str.split(',')]
@@ -155,61 +154,76 @@ def load_all_data():
                     favorite_film_details_global.append({'name': name, 'year': year, 'rating': rating, 'review_text': review, 'uri': uri})
     seed_movies_global.extend(favorite_film_details_global)
-    highly_rated_df = df_watched_global[df_watched_global['Rating'] >= MIN_RATING_FOR_SEED]
-    favorite_uris = {fav['uri'] for fav in favorite_film_details_global if 'uri' in fav}
-    for _, row in highly_rated_df.iterrows():
-        if row['Letterboxd URI'] not in favorite_uris:
-            seed_movies_global.append({
-                'name': row['Name'], 'year': row['Year'], 'rating': row['Rating'],
-                'review_text': row['Review Text'], 'uri': row['Letterboxd URI']
-            })
-    temp_df = pd.DataFrame(seed_movies_global)
-    if not temp_df.empty: # Check if DataFrame is not empty before dropping duplicates
-        temp_df.drop_duplicates(subset=['name', 'year'], keep='first', inplace=True)
-        seed_movies_global = temp_df.to_dict('records')
     else:
-        seed_movies_global = [] # Ensure it's an empty list if temp_df was empty
     random.shuffle(seed_movies_global)
     return True
 def initialize_llm():
     global llm_pipeline, llm_tokenizer
-    if llm_pipeline is None:
-        print(f"Initializing LLM: {MODEL_NAME}")
         if not HF_TOKEN:
-            print("WARNING: HF_TOKEN not found. Access to gated models like ALLaM will fail.")
-            # Optionally, you could prevent the attempt to load if no token,
-            # or let it try and fail, as it currently does.
-            # return # uncomment to stop here if no token
         try:
-            llm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, token=HF_TOKEN)
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
                 torch_dtype=torch.float16,
-                device_map="auto",
-                load_in_8bit=True,
                 trust_remote_code=True,
                 token=HF_TOKEN
             )
             if llm_tokenizer.pad_token is None:
                 llm_tokenizer.pad_token = llm_tokenizer.eos_token
-                model.config.pad_token_id = model.config.eos_token_id
             llm_pipeline = pipeline(
-                "text-generation", model=model, tokenizer=llm_tokenizer
             )
-            print(f"LLM ({MODEL_NAME}) Initialized Successfully.")
         except Exception as e:
-            print(f"Error initializing LLM ({MODEL_NAME}): {e}")
             llm_pipeline = None
 # --- TMDB API Functions ---
 def search_tmdb_movie_details(title, year):
-    if not TMDB_API_KEY or TMDB_API_KEY == "YOUR_TMDB_API_KEY_FALLBACK":
-        print("TMDB API Key not properly configured.")
         return None
     try:
         search_url = f"{BASE_TMDB_URL}/search/movie"
@@ -232,14 +246,14 @@ def search_tmdb_movie_details(title, year):
                 'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
                 'popularity': movie.get('popularity')
             }
-        time.sleep(0.25)
-    except requests.RequestException as e: print(f"Error searching TMDB for {title} ({year}): {e}")
-    except Exception as ex: print(f"Unexpected error in search_tmdb_movie_details for {title} ({year}): {ex}")
     return None
 def get_tmdb_recommendations(movie_id, page=1):
-    if not TMDB_API_KEY or TMDB_API_KEY == "YOUR_TMDB_API_KEY_FALLBACK":
-        print("TMDB API Key not properly configured.")
         return []
     recommendations = []
     try:
@@ -258,21 +272,23 @@ def get_tmdb_recommendations(movie_id, page=1):
                     'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
                     'popularity': movie.get('popularity')
                 })
-        time.sleep(0.25)
-    except requests.RequestException as e: print(f"Error getting TMDB recommendations for movie ID {movie_id}: {e}")
-    except Exception as ex: print(f"Unexpected error in get_tmdb_recommendations for movie ID {movie_id}: {ex}")
     return recommendations
 # --- LLM Explanation ---
 def generate_saudi_explanation(recommended_movie_title, seed_movie_title, seed_movie_context=""):
     global llm_pipeline, llm_tokenizer
     if llm_pipeline is None or llm_tokenizer is None:
-        return "للأسف، نموذج الذكاء الاصطناعي مو جاهز الحين. حاول مرة ثانية بعد شوي."
     max_context_len = 150
     seed_movie_context_short = (seed_movie_context[:max_context_len] + "...") if len(seed_movie_context) > max_context_len else seed_movie_context
-    # Check ALLaM model card for specific prompt format. Using [INST] as it's common for Instruct models.
     prompt_template = f"""<s>[INST] أنت ناقد أفلام سعودي خبير ودمك خفيف جداً. مهمتك هي كتابة توصية لفيلم جديد بناءً على فيلم سابق أعجب المستخدم.
     المستخدم أعجب بالفيلم هذا: "{seed_movie_title}".
     وكان تعليقه أو سبب إعجابه (إذا متوفر): "{seed_movie_context_short}"
@@ -293,7 +309,7 @@ def generate_saudi_explanation(recommended_movie_title, seed_movie_title, seed_m
             prompt_template, do_sample=True, top_k=20, top_p=0.9, num_return_sequences=1,
             eos_token_id=llm_tokenizer.eos_token_id,
             pad_token_id=llm_tokenizer.pad_token_id if llm_tokenizer.pad_token_id is not None else llm_tokenizer.eos_token_id,
-            max_new_tokens=150
         )
         explanation = sequences[0]['generated_text'].split("[/INST]")[-1].strip()
         explanation = explanation.replace("<s>", "").replace("</s>", "").strip()
@@ -301,50 +317,61 @@ def generate_saudi_explanation(recommended_movie_title, seed_movie_title, seed_m
         explanation = re.sub(r"كنموذج لغوي.*?\s*,?\s*", "", explanation, flags=re.IGNORECASE)
         if not explanation or explanation.lower().startswith("أنت ناقد أفلام") or len(explanation) < 20 :
             return f"شكلك بتنبسط على فيلم '{recommended_movie_title}' لأنه يشبه جو فيلم '{seed_movie_title}' اللي حبيته! عطيه تجربة."
         return explanation
     except Exception as e:
-        print(f"Error during LLM generation with {MODEL_NAME}: {e}")
         return f"يا كابتن، شكلك بتحب '{recommended_movie_title}'، خاصة إنك استمتعت بـ'{seed_movie_title}'. جربه وعطنا رأيك!"
 # --- Recommendation Logic ---
-def get_recommendations(progress=gr.Progress()):
-    if not TMDB_API_KEY or (TMDB_API_KEY == "442a13f1865d8936f95aa20737e6f6f5" and not os.environ.get("TMDB_API_KEY")):
-        print("Warning: Using fallback TMDB API Key.")
     if not TMDB_API_KEY:
-        return "<p style='color:red; text-align:right;'>خطأ: مفتاح TMDB API مو موجود.</p>"
-    if not all([df_profile_global is not None, df_watched_global is not None, seed_movies_global]):
-        return "<p style='color:red; text-align:right;'>خطأ: فشل في تحميل بيانات المستخدم.</p>"
-    # Ensure LLM is initialized before trying to use it
-    if llm_pipeline is None:
-        initialize_llm() # Attempt to initialize if not already done
-        if llm_pipeline is None: # Check again if initialization failed
-             return "<p style='color:red; text-align:right;'>خطأ: فشل في تهيئة نموذج الذكاء الاصطناعي. تأكد من وجود HF_TOKEN وأن لديك صلاحية الوصول للنموذج.</p>"
     progress(0.1, desc="نجمع أفلامك المفضلة...")
     potential_recs = {}
-    seeds_to_process = seed_movies_global[:25]
     for i, seed_movie in enumerate(seeds_to_process):
-        progress(0.1 + (i / len(seeds_to_process)) * 0.4, desc=f"نبحث عن توصيات بناءً على: {seed_movie['name']}")
-        seed_tmdb_details = search_tmdb_movie_details(seed_movie['name'], seed_movie['year'])
         if seed_tmdb_details and seed_tmdb_details.get('id'):
             tmdb_recs = get_tmdb_recommendations(seed_tmdb_details['id'])
             for rec in tmdb_recs:
                 try:
-                    rec_tuple = (str(rec['title']), int(rec['year']))
                     if rec.get('id') and rec_tuple not in all_watched_titles_global and rec_tuple not in watchlist_titles_global:
-                        if rec['id'] not in potential_recs:
                             potential_recs[rec['id']] = {
-                                'movie_info': rec, 'seed_movie_title': seed_movie['name'],
                                 'seed_movie_context': seed_movie.get('review_text', '') or seed_movie.get('comment_text', '')
                             }
-                except (ValueError, TypeError): continue # Catch TypeError if year is None
     if not potential_recs:
-        return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً. 😉</p>"
     sorted_recs_list = sorted(potential_recs.values(), key=lambda x: x['movie_info'].get('popularity', 0), reverse=True)
     final_recommendations_data = []
     displayed_ids = set()
     for rec_data in sorted_recs_list:
@@ -352,74 +379,93 @@ def get_recommendations(progress=gr.Progress()):
         if rec_data['movie_info']['id'] not in displayed_ids:
             final_recommendations_data.append(rec_data)
             displayed_ids.add(rec_data['movie_info']['id'])
     if not final_recommendations_data:
-         return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً بعد الفلترة. 😉</p>"
-    output_html = "<div>"
     progress(0.6, desc="نجهز لك الشرح باللغة العامية...")
     for i, rec_data in enumerate(final_recommendations_data):
         progress(0.6 + (i / len(final_recommendations_data)) * 0.4, desc=f"نكتب شرح لفيلم: {rec_data['movie_info']['title']}")
         explanation = generate_saudi_explanation(
             rec_data['movie_info']['title'], rec_data['seed_movie_title'], rec_data['seed_movie_context']
         )
         poster_url = rec_data['movie_info']['poster_path']
-        if not poster_url or "placeholder.com" in poster_url:
-            poster_url = f"https://via.placeholder.com/300x450.png?text={rec_data['movie_info']['title'].replace(' ', '+')}"
         output_html += f"""
-        <div style="display: flex; flex-direction: row-reverse; align-items: flex-start; margin-bottom: 25px; border-bottom: 1px solid #ddd; padding-bottom:15px; background-color: #f9f9f9; border-radius: 8px; padding: 15px;">
             <img src="{poster_url}" alt="{rec_data['movie_info']['title']}" style="width: 150px; max-width:30%; height: auto; margin-left: 20px; border-radius: 5px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);">
             <div style="text-align: right; direction: rtl; flex-grow: 1;">
                 <h3 style="margin-top:0; color: #c70039;">{rec_data['movie_info']['title']} ({rec_data['movie_info']['year']})</h3>
                 <p style="font-size: 1.1em; color: #333; line-height: 1.6;">{explanation}</p>
-                <p style="font-size: 0.9em; color: #777; margin-top: 10px;"><em>رشحنا لك هذا الفيلم لأنك حبيت: <strong style="color:#555;">{rec_data['seed_movie_title']}</strong></em></p>
             </div>
         </div>"""
     output_html += "</div>"
     return gr.HTML(output_html)
 # --- Gradio Interface ---
-css = """
 body { font-family: 'Tajawal', sans-serif; }
-.gradio-container { font-family: 'Tajawal', sans-serif !important; direction: rtl; }
 footer { display: none !important; }
-.gr-button { background-color: #c70039 !important; color: white !important; font-size: 1.2em !important; padding: 10px 20px !important; border-radius: 8px !important; }
-.gr-button:hover { background-color: #a3002f !important; }
-h1, h3 { color: #900c3f !important; }
-""" # Removed .gr-input and .gr-output as they aren't used directly for styling here
 data_loaded_successfully = load_all_data()
 if data_loaded_successfully:
-    print("All user data loaded and preprocessed successfully.")
-    # LLM will be initialized on first click if not already
 else:
-    print("Failed to load user data. The app might not function correctly.")
-with gr.Blocks(theme=gr.themes.Soft(primary_hue="red", secondary_hue="pink"), css=css) as iface:
     gr.Markdown(
         """
-        <div style="text-align: center;">
-            <h1 style="color: #c70039; font-size: 2.5em;">🎬 رفيقك السينمائي 🍿</h1>
             <p style="font-size: 1.2em; color: #555;">يا هلا بك! اضغط الزر تحت وخلنا نعطيك توصيات أفلام على كيف كيفك، مع شرح بالعامية ليش ممكن تدخل مزاجك.</p>
         </div>"""
     )
-    recommend_button = gr.Button("عطني توصيات أفلام!")
-    with gr.Column():
-        output_recommendations = gr.HTML(label="توصياتك النارية 🔥")
-    # Call initialize_llm once when the interface is defined if data loaded successfully
-    # This way, it tries to load the LLM when the app starts, not just on the first click.
     if data_loaded_successfully:
-        initialize_llm() # Moved initialization here
-    recommend_button.click(fn=get_recommendations, inputs=[], outputs=[output_recommendations])
     gr.Markdown(
         """
-        <div style="text-align: center; margin-top: 30px; font-size: 0.9em; color: #777;">
-            <p>استمتع بالمشاهدة! 🎥</p>
         </div>"""
     )
 if __name__ == "__main__":
-    if not TMDB_API_KEY or (TMDB_API_KEY == "442a13f1865d8936f95aa20737e6f6f5" and not os.environ.get("TMDB_API_KEY")):
-        print("\nWARNING: TMDB_API_KEY is using the hardcoded fallback or is missing.")
-    iface.launch(debug=True) # Set debug=False for production or normal HF Space operation

 import os
 import re
 import random
+from dotenv import load_dotenv # For local testing with a .env file
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import torch
 import gradio as gr
 import time
 # --- Configuration ---
+load_dotenv() # Loads HF_TOKEN and TMDB_API_KEY from .env for local testing
+# SECRETS - These will be read from Hugging Face Space Secrets when deployed
+TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
+HF_TOKEN = os.environ.get("HF_TOKEN") # Essential for gated models like ALLaM
+MODEL_NAME = "ALLaM-AI/ALLaM-7B-Instruct-preview" # Target ALLaM model
 BASE_TMDB_URL = "https://api.themoviedb.org/3"
 POSTER_BASE_URL = "https://image.tmdb.org/t/p/w500"
 NUM_RECOMMENDATIONS_TO_DISPLAY = 5
 MIN_RATING_FOR_SEED = 3.5
+MIN_VOTE_COUNT_TMDB = 100 # Minimum votes on TMDB for a movie to be considered
+# --- Global Variables for Data & Model (Load once) ---
 df_profile_global = None
 df_watchlist_global = None
 df_reviews_global = None
 df_diary_global = None
 df_ratings_global = None
+df_watched_global = None # This will be a consolidated df
 uri_to_movie_map_global = {}
 all_watched_titles_global = set()
 def clean_html(raw_html):
     if pd.isna(raw_html) or raw_html is None: return ""
     text = str(raw_html)
+    text = re.sub(r'<br\s*/?>', '\n', text) # Convert <br> to newlines
     soup = BeautifulSoup(text, "html.parser")
     return soup.get_text(separator=" ", strip=True)
                             year = int(row['Year'])
                             uri_map[uri] = (str(row['Name']), year)
                             processed_uris.add(uri)
+                        except ValueError:
+                            # Silently skip if year is not a valid integer for URI mapping
+                            pass
     return uri_map
 def load_all_data():
     global watchlist_titles_global, favorite_film_details_global, seed_movies_global
     try:
+        # Assumes CSV files are in the root of the Hugging Face Space
         df_profile_global = pd.read_csv("profile.csv")
+        # df_comments_global = pd.read_csv("comments.csv") # Not directly used in recs logic
         df_watchlist_global = pd.read_csv("watchlist.csv")
         df_reviews_global = pd.read_csv("reviews.csv")
         df_diary_global = pd.read_csv("diary.csv")
         df_ratings_global = pd.read_csv("ratings.csv")
+        _df_watched_log = pd.read_csv("watched.csv") # Raw log of watched films
     except FileNotFoundError as e:
+        print(f"CRITICAL ERROR: CSV file not found: {e}. Ensure all CSVs are uploaded to the HF Space root.")
+        return False # Indicate failure to load data
     dfs_for_uri_map = {
         "reviews.csv": df_reviews_global, "diary.csv": df_diary_global,
     consolidated.drop(columns=['Rating_simple'], inplace=True)
     watched_log_subset = _df_watched_log[['Letterboxd URI', 'Name', 'Year']].copy()
+    watched_log_subset['from_watched_log'] = True
     consolidated = pd.merge(consolidated, watched_log_subset, on=['Letterboxd URI', 'Name', 'Year'], how='outer')
     consolidated['from_watched_log'] = consolidated['from_watched_log'].fillna(False).astype(bool)
     consolidated['Review Text'] = consolidated['Review Text'].fillna('').apply(clean_html)
     consolidated['Year'] = pd.to_numeric(consolidated['Year'], errors='coerce').astype('Int64')
+    consolidated.dropna(subset=['Name', 'Year'], inplace=True) # Ensure essential fields are present
     consolidated.drop_duplicates(subset=['Name', 'Year'], keep='first', inplace=True)
     df_watched_global = consolidated
                 except ValueError: pass
     favorite_film_details_global = []
+    if df_profile_global is not None and 'Favorite Films' in df_profile_global.columns and not df_profile_global.empty:
         fav_uris_str = df_profile_global.iloc[0]['Favorite Films']
         if pd.notna(fav_uris_str):
             fav_uris = [uri.strip() for uri in fav_uris_str.split(',')]
                     favorite_film_details_global.append({'name': name, 'year': year, 'rating': rating, 'review_text': review, 'uri': uri})
     seed_movies_global.extend(favorite_film_details_global)
+    if not df_watched_global.empty: # Ensure df_watched_global is not empty
+        highly_rated_df = df_watched_global[df_watched_global['Rating'] >= MIN_RATING_FOR_SEED]
+        favorite_uris = {fav['uri'] for fav in favorite_film_details_global if 'uri' in fav}
+        for _, row in highly_rated_df.iterrows():
+            if row['Letterboxd URI'] not in favorite_uris:
+                seed_movies_global.append({
+                    'name': row['Name'], 'year': row['Year'], 'rating': row['Rating'],
+                    'review_text': row['Review Text'], 'uri': row['Letterboxd URI']
+                })
+    if seed_movies_global: # Only process if seed_movies_global is not empty
+        temp_df = pd.DataFrame(seed_movies_global)
+        if not temp_df.empty:
+            temp_df.drop_duplicates(subset=['name', 'year'], keep='first', inplace=True)
+            seed_movies_global = temp_df.to_dict('records')
     else:
+        seed_movies_global = []
     random.shuffle(seed_movies_global)
     return True
 def initialize_llm():
     global llm_pipeline, llm_tokenizer
+    if llm_pipeline is None: # Proceed only if pipeline is not already initialized
+        print(f"Attempting to initialize LLM: {MODEL_NAME}")
         if not HF_TOKEN:
+            print("CRITICAL ERROR: HF_TOKEN environment variable not set. Cannot access gated model.")
+            return # Stop initialization if token is missing
         try:
+            llm_tokenizer = AutoTokenizer.from_pretrained(
+                MODEL_NAME,
+                trust_remote_code=True,
+                token=HF_TOKEN,
+                use_fast=False # Using slow tokenizer as per previous debugging for SentencePiece
+            )
+            print(f"Tokenizer for {MODEL_NAME} loaded.")
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
                 torch_dtype=torch.float16,
+                device_map="auto", # Automatically map to available device
+                load_in_8bit=True,  # Enable 8-bit quantization; requires bitsandbytes
                 trust_remote_code=True,
                 token=HF_TOKEN
             )
+            print(f"Model {MODEL_NAME} loaded.")
             if llm_tokenizer.pad_token is None:
+                print("Tokenizer pad_token is None, setting to eos_token.")
                 llm_tokenizer.pad_token = llm_tokenizer.eos_token
+                if model.config.pad_token_id is None: # Also update model config if needed
+                    model.config.pad_token_id = model.config.eos_token_id
+                    print(f"Model config pad_token_id set to: {model.config.pad_token_id}")
             llm_pipeline = pipeline(
+                "text-generation",
+                model=model,
+                tokenizer=llm_tokenizer
             )
+            print(f"LLM pipeline for {MODEL_NAME} initialized successfully.")
         except Exception as e:
+            print(f"ERROR during LLM initialization ({MODEL_NAME}): {e}")
+            # Ensure these are reset if initialization fails partway
             llm_pipeline = None
+            llm_tokenizer = None
 # --- TMDB API Functions ---
 def search_tmdb_movie_details(title, year):
+    if not TMDB_API_KEY:
+        print("CRITICAL ERROR: TMDB_API_KEY not configured.")
         return None
     try:
         search_url = f"{BASE_TMDB_URL}/search/movie"
                 'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
                 'popularity': movie.get('popularity')
             }
+        time.sleep(0.3) # Slightly increased delay for API calls
+    except requests.RequestException as e: print(f"TMDB API Error (search) for {title} ({year}): {e}")
+    except Exception as ex: print(f"Unexpected error in TMDB search for {title} ({year}): {ex}")
     return None
 def get_tmdb_recommendations(movie_id, page=1):
+    if not TMDB_API_KEY:
+        print("CRITICAL ERROR: TMDB_API_KEY not configured.")
         return []
     recommendations = []
     try:
                     'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
                     'popularity': movie.get('popularity')
                 })
+        time.sleep(0.3) # Slightly increased delay
+    except requests.RequestException as e: print(f"TMDB API Error (recommendations) for movie ID {movie_id}: {e}")
+    except Exception as ex: print(f"Unexpected error in TMDB recommendations for movie ID {movie_id}: {ex}")
     return recommendations
 # --- LLM Explanation ---
 def generate_saudi_explanation(recommended_movie_title, seed_movie_title, seed_movie_context=""):
     global llm_pipeline, llm_tokenizer
     if llm_pipeline is None or llm_tokenizer is None:
+        print("LLM pipeline or tokenizer not available for explanation generation.")
+        return "للأسف، نموذج الذكاء الاصطناعي مو جاهز حالياً. حاول مرة ثانية بعد شوي."
     max_context_len = 150
     seed_movie_context_short = (seed_movie_context[:max_context_len] + "...") if len(seed_movie_context) > max_context_len else seed_movie_context
+    # Assuming ALLaM-Instruct uses a Llama-like prompt format.
+    # ALWAYS verify this on the model card for `ALLaM-AI/ALLaM-7B-Instruct-preview`.
     prompt_template = f"""<s>[INST] أنت ناقد أفلام سعودي خبير ودمك خفيف جداً. مهمتك هي كتابة توصية لفيلم جديد بناءً على فيلم سابق أعجب المستخدم.
     المستخدم أعجب بالفيلم هذا: "{seed_movie_title}".
     وكان تعليقه أو سبب إعجابه (إذا متوفر): "{seed_movie_context_short}"
             prompt_template, do_sample=True, top_k=20, top_p=0.9, num_return_sequences=1,
             eos_token_id=llm_tokenizer.eos_token_id,
             pad_token_id=llm_tokenizer.pad_token_id if llm_tokenizer.pad_token_id is not None else llm_tokenizer.eos_token_id,
+            max_new_tokens=160 # Increased slightly more
         )
         explanation = sequences[0]['generated_text'].split("[/INST]")[-1].strip()
         explanation = explanation.replace("<s>", "").replace("</s>", "").strip()
         explanation = re.sub(r"كنموذج لغوي.*?\s*,?\s*", "", explanation, flags=re.IGNORECASE)
         if not explanation or explanation.lower().startswith("أنت ناقد أفلام") or len(explanation) < 20 :
+            print(f"LLM explanation for '{recommended_movie_title}' was too short or poor. Falling back.")
             return f"شكلك بتنبسط على فيلم '{recommended_movie_title}' لأنه يشبه جو فيلم '{seed_movie_title}' اللي حبيته! عطيه تجربة."
         return explanation
     except Exception as e:
+        print(f"ERROR during LLM generation with {MODEL_NAME}: {e}")
         return f"يا كابتن، شكلك بتحب '{recommended_movie_title}'، خاصة إنك استمتعت بـ'{seed_movie_title}'. جربه وعطنا رأيك!"
 # --- Recommendation Logic ---
+def get_recommendations(progress=gr.Progress(track_tqdm=True)):
     if not TMDB_API_KEY:
+        return "<p style='color:red; text-align:right;'>خطأ: مفتاح TMDB API مو موجود أو غير صحيح. الرجاء التأكد من إضافته كـ Secret بشكل صحيح في إعدادات الـ Space.</p>"
+    if not all([df_profile_global is not None, df_watched_global is not None, seed_movies_global is not None]): # seed_movies_global can be empty list
+        return "<p style='color:red; text-align:right;'>خطأ: فشل في تحميل بيانات المستخدم. تأكد من رفع ملفات CSV بشكل صحيح.</p>"
+    if llm_pipeline is None: # Ensure LLM is ready
+        initialize_llm() # Try to initialize if it wasn't at startup
+        if llm_pipeline is None:
+             return "<p style='color:red; text-align:right;'>خطأ: فشل في تهيئة نموذج الذكاء الاصطناعي. تأكد من وجود HF_TOKEN صحيح وأن لديك صلاحية الوصول للنموذج المحدد.</p>"
+    if not seed_movies_global: # Check if seed_movies list is empty after loading
+        return "<p style='text-align:right;'>ما لقينا أفلام مفضلة أو مقيمة تقييم عالي كفاية عشان نبني عليها توصيات. حاول تقيّم بعض الأفلام!</p>"
     progress(0.1, desc="نجمع أفلامك المفضلة...")
     potential_recs = {}
+    # Limit number of seeds to process to avoid excessive API calls / long processing
+    seeds_to_process = seed_movies_global[:20] if len(seed_movies_global) > 20 else seed_movies_global
     for i, seed_movie in enumerate(seeds_to_process):
+        progress(0.1 + (i / len(seeds_to_process)) * 0.4, desc=f"نبحث عن توصيات بناءً على: {seed_movie.get('name', 'فيلم غير معروف')}")
+        seed_tmdb_details = search_tmdb_movie_details(seed_movie.get('name'), seed_movie.get('year'))
         if seed_tmdb_details and seed_tmdb_details.get('id'):
             tmdb_recs = get_tmdb_recommendations(seed_tmdb_details['id'])
             for rec in tmdb_recs:
                 try:
+                    # Ensure year is a valid integer for tuple creation
+                    year_val = int(rec['year']) if rec.get('year') and str(rec['year']).isdigit() else None
+                    if year_val is None: continue # Skip if year is invalid
+                    rec_tuple = (str(rec['title']), year_val)
                     if rec.get('id') and rec_tuple not in all_watched_titles_global and rec_tuple not in watchlist_titles_global:
+                        if rec['id'] not in potential_recs: # Add if new
                             potential_recs[rec['id']] = {
+                                'movie_info': rec,
+                                'seed_movie_title': seed_movie.get('name'),
                                 'seed_movie_context': seed_movie.get('review_text', '') or seed_movie.get('comment_text', '')
                             }
+                except (ValueError, TypeError) as e:
+                    # print(f"Skipping recommendation due to data issue: {rec.get('title')} - {e}")
+                    continue
     if not potential_recs:
+        return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً بناءً على أفلامك المفضلة. يمكن شفت كل شيء رهيب! 😉</p>"
+    # Sort recommendations by TMDB popularity
     sorted_recs_list = sorted(potential_recs.values(), key=lambda x: x['movie_info'].get('popularity', 0), reverse=True)
     final_recommendations_data = []
     displayed_ids = set()
     for rec_data in sorted_recs_list:
         if rec_data['movie_info']['id'] not in displayed_ids:
             final_recommendations_data.append(rec_data)
             displayed_ids.add(rec_data['movie_info']['id'])
     if not final_recommendations_data:
+         return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً بعد الفلترة. يمكن شفت كل شيء رهيب! 😉</p>"
+    output_html = "<div style='padding: 10px;'>" # Main container with some padding
     progress(0.6, desc="نجهز لك الشرح باللغة العامية...")
     for i, rec_data in enumerate(final_recommendations_data):
         progress(0.6 + (i / len(final_recommendations_data)) * 0.4, desc=f"نكتب شرح لفيلم: {rec_data['movie_info']['title']}")
         explanation = generate_saudi_explanation(
             rec_data['movie_info']['title'], rec_data['seed_movie_title'], rec_data['seed_movie_context']
         )
         poster_url = rec_data['movie_info']['poster_path']
+        # Fallback for missing posters
+        if not poster_url or "No+Poster" in poster_url or "placeholder.com" in poster_url :
+            poster_url = f"https://via.placeholder.com/300x450.png?text={requests.utils.quote(rec_data['movie_info']['title'])}"
         output_html += f"""
+        <div style="display: flex; flex-direction: row-reverse; align-items: flex-start; margin-bottom: 25px; border-bottom: 1px solid #ddd; padding-bottom:15px; background-color: #f9f9f9; border-radius: 8px; padding: 15px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
             <img src="{poster_url}" alt="{rec_data['movie_info']['title']}" style="width: 150px; max-width:30%; height: auto; margin-left: 20px; border-radius: 5px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);">
             <div style="text-align: right; direction: rtl; flex-grow: 1;">
                 <h3 style="margin-top:0; color: #c70039;">{rec_data['movie_info']['title']} ({rec_data['movie_info']['year']})</h3>
                 <p style="font-size: 1.1em; color: #333; line-height: 1.6;">{explanation}</p>
+                <p style="font-size: 0.9em; color: #555; margin-top: 10px;"><em><strong style="color:#c70039;">السبب:</strong> حبيّت فيلم <strong style="color:#333;">{rec_data['seed_movie_title']}</strong></em></p>
             </div>
         </div>"""
     output_html += "</div>"
     return gr.HTML(output_html)
 # --- Gradio Interface ---
 # Custom stylesheet handed to gr.Blocks(css=...) below: applies the Tajawal
 # font, lays the container out right-to-left with a 900px max width, hides
 # the page footer, and recolors buttons and headings.
+css_theme = """
 body { font-family: 'Tajawal', sans-serif; }
+.gradio-container { font-family: 'Tajawal', sans-serif !important; direction: rtl; max-width: 900px !important; margin: auto !important; }
 footer { display: none !important; }
+.gr-button { background-color: #c70039 !important; color: white !important; font-size: 1.2em !important; padding: 12px 24px !important; border-radius: 8px !important; font-weight: bold; }
+.gr-button:hover { background-color: #a3002f !important; box-shadow: 0 2px 5px rgba(0,0,0,0.2); }
+h1 { color: #900c3f !important; }
+.gr-html-output h3 { color: #c70039 !important; } /* Style h3 within the HTML output specifically */
+"""
+# Load the user's data once when the module is executed; the result gates LLM initialization further down.
 data_loaded_successfully = load_all_data()
 if data_loaded_successfully:
+    print("User data loaded successfully.")
+    # The LLM is deliberately not initialized here: initialize_llm() is called
+    # later, inside the gr.Blocks context, and only when the data loaded.
 else:
+    print("CRITICAL: Failed to load user data. App functionality will be limited.")
+# NOTE(review): the model is therefore loaded at startup rather than on first use.
+# Presumably acceptable on Spaces, but it may be resource-intensive; confirm.
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="red", secondary_hue="pink", font=[gr.themes.GoogleFont("Tajawal"), "sans-serif"]), css=css_theme) as iface:
     # Components are created in page order: header, trigger button,
     # HTML results area, then footer.
     gr.Markdown(
         """
+        <div style="text-align: center; margin-bottom:20px;">
+            <h1 style="color: #c70039; font-size: 2.8em; font-weight: bold; margin-bottom:5px;">🎬 رفيقك السينمائي 🍿</h1>
             <p style="font-size: 1.2em; color: #555;">يا هلا بك! اضغط الزر تحت وخلنا نعطيك توصيات أفلام على كيف كيفك، مع شرح بالعامية ليش ممكن تدخل مزاجك.</p>
         </div>"""
     )
+    recommend_button = gr.Button("عطني توصيات أفلام جديدة!")
+    with gr.Column(elem_id="recommendation-output-column"): # elem_id is a hook for column-specific styling
+        output_recommendations = gr.HTML(label="👇 توصياتك النارية وصلت 👇")
+    # Load the LLM only if the user data loaded; this runs while the UI is being built, before launch().
     if data_loaded_successfully:
+        initialize_llm()
     # Clicking the button calls get_recommendations with no inputs and renders
     # its return value in the HTML component above.
+    recommend_button.click(fn=get_recommendations, inputs=None, outputs=[output_recommendations], show_progress="full")
     gr.Markdown(
         """
+        <div style="text-align: center; margin-top: 40px; padding-top: 20px; border-top: 1px solid #eee; font-size: 0.9em; color: #777;">
+            <p>نتمنى لك مشاهدة ممتعة مع رفيقك السينمائي! 🎥✨</p>
         </div>"""
     )
 if __name__ == "__main__":
+    # When run as a script, warn (without aborting) about missing secrets before launching the UI.
     # NOTE(review): TMDB_API_KEY, HF_TOKEN and MODEL_NAME are expected to be
     # module-level names defined earlier in the file; verify they still exist.
+    if not TMDB_API_KEY:
+        print("\nCRITICAL WARNING: TMDB_API_KEY environment variable is NOT SET.")
+        print("TMDB API calls will fail. Please set it in your .env file or system environment.\n")
+    if not HF_TOKEN:
+        print("\nCRITICAL WARNING: HF_TOKEN environment variable is NOT SET.")
+        print(f"LLM initialization for gated models like {MODEL_NAME} will fail. Please set it.\n")
+    iface.launch(debug=True) # NOTE: debug=True is meant for local testing; set it to False for production