import os
import re
import json
import logging
import hashlib
from pathlib import Path
from typing import List, Tuple, Dict, Optional

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from requests.exceptions import HTTPError

from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# -----------------------------------------------------------------------------
# Configuration & Environment Variables
# -----------------------------------------------------------------------------
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
load_dotenv()  # Load .env file for local development
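# Illustrative .env contents for local development (values are placeholders):
#   OPENAI_API_KEY=sk-...
#   HF_TOKEN=hf_...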

# --- API Keys ---
# Base key is for potentially pre-processing fixed files (if needed)
# User key is required for processing *new* dynamic files
BASE_OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # Used for pre-processing base files if needed
HF_TOKEN = os.getenv("HF_TOKEN")

# --- Constants ---
DATASET_ID = "rasoul-nikbakht/TSpec-LLM"
DATA_SUBDIR = "3GPP-clean"
EMBEDDING_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"
MAX_DYNAMIC_FILES = 3
ESTIMATED_COST_PER_FILE_CENTS = 2 # Rough estimate

# --- File Paths ---
SCRIPT_DIR = Path(__file__).parent
CACHE_DIR = SCRIPT_DIR / "cached_embeddings"
BASE_KNOWLEDGE_INDEX_PATH = CACHE_DIR / "base_knowledge.faiss"
USER_DATA_PATH = SCRIPT_DIR / "user_data.json"
CACHE_MANIFEST_PATH = SCRIPT_DIR / "cache_manifest.json"

# Ensure cache directory exists
CACHE_DIR.mkdir(exist_ok=True)

# --- Fixed Base Knowledge Files ---
# Relative paths within the dataset repo (without DATA_SUBDIR)
FIXED_FILES = [
    "Rel-16/38_series/38901-g10.md",
    "Rel-16/38_series/38821-g20.md",
    "Rel-15/36_series/36777-f00_1.md",
    "Rel-15/36_series/36777-f00_2.md",
]
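# At download time each entry is prefixed with DATA_SUBDIR,
# e.g. "Rel-16/38_series/38901-g10.md" -> "3GPP-clean/Rel-16/38_series/38901-g10.md".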

# -----------------------------------------------------------------------------
# Global Variables & In-Memory Stores (Load at startup)
# -----------------------------------------------------------------------------
base_knowledge_index: Optional[FAISS] = None
user_data: Dict[str, List[str]] = {} # {email: [list_of_processed_files]}
cache_manifest: Dict[str, str] = {} # {repo_relative_path: local_faiss_path}

# -----------------------------------------------------------------------------
# Helper Functions
# -----------------------------------------------------------------------------

def sanitize_path_for_filename(repo_path: str) -> str:
    """Creates a safe filename from a repository path."""
    # Remove base dir prefix if present
    if repo_path.startswith(f"{DATA_SUBDIR}/"):
         repo_path = repo_path[len(f"{DATA_SUBDIR}/"):]
    # Replace slashes and invalid chars; use hashing for very long paths if needed
    sanitized = re.sub(r'[\\/*?:"<>|]', '_', repo_path)
    # Optional: Limit length and add hash if too long
    if len(sanitized) > 100:
        hash_suffix = hashlib.md5(repo_path.encode()).hexdigest()[:8]
        sanitized = sanitized[:90] + "_" + hash_suffix
    return sanitized + ".faiss"

def is_valid_email(email: str) -> bool:
    """Basic regex check for email format."""
    # This is a simple check, not foolproof validation
    pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    return re.match(pattern, email) is not None

def load_user_data():
    """Loads user email and associated file data from JSON."""
    global user_data
    if USER_DATA_PATH.exists():
        try:
            with open(USER_DATA_PATH, 'r') as f:
                user_data = json.load(f)
            logging.info(f"Loaded user data for {len(user_data)} users from {USER_DATA_PATH}")
        except json.JSONDecodeError:
            logging.error(f"Error decoding JSON from {USER_DATA_PATH}. Starting with empty user data.")
            user_data = {}
        except Exception as e:
            logging.error(f"Failed to load user data: {e}", exc_info=True)
            user_data = {}
    else:
        logging.info("User data file not found. Starting fresh.")
        user_data = {}

def save_user_data():
    """Saves user email and associated file data to JSON."""
    try:
        with open(USER_DATA_PATH, 'w') as f:
            json.dump(user_data, f, indent=4)
        # logging.info(f"Saved user data to {USER_DATA_PATH}") # Can be noisy
    except Exception as e:
        logging.error(f"Failed to save user data: {e}", exc_info=True)

def load_cache_manifest():
    """Loads the manifest of locally cached embeddings."""
    global cache_manifest
    if CACHE_MANIFEST_PATH.exists():
        try:
            with open(CACHE_MANIFEST_PATH, 'r') as f:
                cache_manifest = json.load(f)
            logging.info(f"Loaded cache manifest with {len(cache_manifest)} entries from {CACHE_MANIFEST_PATH}")
            # Optional: Verify that the referenced FAISS files actually exist
            # keys_to_remove = [k for k, v in cache_manifest.items() if not Path(v).exists()]
            # if keys_to_remove:
            #     logging.warning(f"Removing {len(keys_to_remove)} stale entries from cache manifest.")
            #     for k in keys_to_remove: del cache_manifest[k]
            #     save_cache_manifest()  # Save cleaned manifest
        except json.JSONDecodeError:
            logging.error(f"Error decoding JSON from {CACHE_MANIFEST_PATH}. Starting with empty manifest.")
            cache_manifest = {}
        except Exception as e:
            logging.error(f"Failed to load cache manifest: {e}", exc_info=True)
            cache_manifest = {}
    else:
        logging.info("Cache manifest file not found. Starting fresh.")
        cache_manifest = {}

def save_cache_manifest():
    """Saves the manifest of locally cached embeddings."""
    try:
        with open(CACHE_MANIFEST_PATH, 'w') as f:
            json.dump(cache_manifest, f, indent=4)
        # logging.info(f"Saved cache manifest to {CACHE_MANIFEST_PATH}") # Can be noisy
    except Exception as e:
        logging.error(f"Failed to save cache manifest: {e}", exc_info=True)

def download_and_process_file(repo_relative_path: str, api_key_for_embedding: str) -> Optional[FAISS]:
    """Downloads, chunks, embeds a single file, returning a FAISS index."""
    if not HF_TOKEN:
         logging.error("HF_TOKEN is missing. Cannot download from gated dataset.")
         # Don't raise gr.Error here, handle return value in caller
         return None
    if not api_key_for_embedding:
        logging.error("OpenAI API Key is missing. Cannot create embeddings.")
        return None

    full_repo_path = f"{DATA_SUBDIR}/{repo_relative_path}"
    logging.info(f"Processing file: {repo_relative_path}")

    # --- Download ---
    try:
        local_path_str = hf_hub_download(
            repo_id=DATASET_ID,
            filename=full_repo_path,
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir="./hf_cache"
        )
        local_path = Path(local_path_str)
        logging.info(f"Downloaded {repo_relative_path} to: {local_path}")
    except EntryNotFoundError:
        logging.error(f"File not found in repository: {full_repo_path}")
        raise gr.Error(f"File not found in repository: '{repo_relative_path}'. Please check the path.")
    except HTTPError as e:
        if e.response is not None and e.response.status_code in {401, 403}:
            logging.error(f"Hugging Face authentication/authorization failed (Status {e.response.status_code}).")
            raise gr.Error("Hugging Face authentication failed. Check HF_TOKEN and dataset license acceptance.")
        else:
            logging.error(f"HTTP error during download: {e}")
            raise gr.Error(f"Failed to download file due to an HTTP error: {e}")
    except Exception as e:
        logging.error(f"An unexpected error occurred during download for {repo_relative_path}: {e}", exc_info=True)
        raise gr.Error(f"Download error for {repo_relative_path}: {e}")

    # --- Load and Chunk ---
    try:
        text = local_path.read_text(encoding="utf-8", errors="replace")
        headers_to_split_on = [("#", "H1"), ("##", "H2"), ("###", "H3"), ("####", "H4")]
        splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
        docs = splitter.split_text(text)

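        # Heuristic fallback: if header splitting produced nothing, or a single
        # oversized chunk (> 5000 chars), re-split into fixed-size character chunks.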
        if not docs or (len(docs) == 1 and len(docs[0].page_content) > 5000):
            logging.warning(f"MarkdownHeaderTextSplitter yielded few/large chunks for {repo_relative_path}, using RecursiveCharacterTextSplitter.")
            fallback_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=150, separators=["\n\n", "\n", ". ", ", ", " ", ""]
            )
            docs = fallback_splitter.create_documents([text])

        if not docs:
            logging.warning(f"File '{repo_relative_path}' resulted in zero documents after splitting.")
            return None # Cannot create index from no documents
        logging.info(f"Split {repo_relative_path} into {len(docs)} documents.")

    except Exception as e:
        logging.error(f"Failed to read/split file {local_path}: {e}", exc_info=True)
        raise gr.Error(f"Error processing content of {repo_relative_path}: {e}")

    # --- Embed and Create Vector Store ---
    try:
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=api_key_for_embedding)
        vectordb = FAISS.from_documents(docs, embeddings)
        logging.info(f"Created FAISS index for {repo_relative_path}.")
        return vectordb
    except Exception as e:
        # Catch potential OpenAI API errors specifically if possible
        logging.error(f"Failed during embedding/vector store creation for {repo_relative_path}: {e}", exc_info=True)
        # Check for common errors based on string matching (less robust but helpful)
        if "AuthenticationError" in str(e) or "Incorrect API key" in str(e):
             raise gr.Error(f"OpenAI Authentication Error for {repo_relative_path}. Check your API Key. Details: {e}")
        elif "RateLimitError" in str(e):
             raise gr.Error(f"OpenAI Rate Limit Error for {repo_relative_path}. Details: {e}")
        else:
             raise gr.Error(f"Embedding/VectorStore Error for {repo_relative_path}: {e}")


def get_or_create_dynamic_index(repo_relative_path: str, user_api_key: str) -> Optional[FAISS]:
    """Loads a dynamic index from cache or creates+caches it if new."""
    global cache_manifest # Allow modification
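    # cache_manifest maps a repo-relative path to the cached .faiss file on disk, e.g.
    # {"Rel-17/23_series/23501-h50.md": "<CACHE_DIR>/Rel-17_23_series_23501-h50.md.faiss"}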

    if repo_relative_path in cache_manifest:
        local_faiss_path_str = cache_manifest[repo_relative_path]
        local_faiss_path = Path(local_faiss_path_str)
        if local_faiss_path.exists():
            try:
                # Need embeddings object to load; use user's key as they initiated the session
                embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=user_api_key)
                index = FAISS.load_local(str(local_faiss_path.parent), embeddings, index_name=local_faiss_path.stem, allow_dangerous_deserialization=True)
                logging.info(f"Loaded cached index for {repo_relative_path} from {local_faiss_path_str}")
                return index
            except Exception as e:
                logging.error(f"Failed to load cached index {local_faiss_path_str}: {e}. Will try to re-create.", exc_info=True)
                # Remove potentially corrupted entry from manifest
                del cache_manifest[repo_relative_path]
                save_cache_manifest()
        else:
            logging.warning(f"Cache manifest points to non-existent file: {local_faiss_path_str}. Removing entry.")
            del cache_manifest[repo_relative_path]
            save_cache_manifest()

    # --- If not cached or loading failed, create it ---
    logging.info(f"Cache miss or load failure for {repo_relative_path}. Processing anew.")
    if not user_api_key:
         raise gr.Error(f"Cannot process new file '{repo_relative_path}' without an OpenAI API Key.")

    new_index = download_and_process_file(repo_relative_path, user_api_key)

    if new_index:
        # Save the newly created index
        try:
            sanitized_name = sanitize_path_for_filename(repo_relative_path)
            save_path = CACHE_DIR / sanitized_name
            # FAISS.save_local() writes <index_name>.faiss and <index_name>.pkl inside folder_path
            new_index.save_local(folder_path=str(CACHE_DIR), index_name=save_path.stem)
            full_saved_path = str(CACHE_DIR / (save_path.stem + ".faiss")) # Path to the actual .faiss file

            # Update manifest
            cache_manifest[repo_relative_path] = full_saved_path
            save_cache_manifest()
            logging.info(f"Saved new index for {repo_relative_path} to {full_saved_path} and updated manifest.")
            return new_index
        except Exception as e:
            logging.error(f"Failed to save new index for {repo_relative_path}: {e}", exc_info=True)
            # Don't raise here, maybe it works in memory for the session
            return new_index # Return in-memory index even if saving failed
    else:
        # download_and_process_file failed, error already raised or logged
        return None

# -----------------------------------------------------------------------------
# Pre-processing Base Knowledge (Run once at startup if needed)
# -----------------------------------------------------------------------------
def preprocess_base_knowledge():
    """Creates and saves the base knowledge FAISS index if it doesn't exist by processing files individually and merging."""
    global base_knowledge_index
    if BASE_KNOWLEDGE_INDEX_PATH.exists():
        try:
            if not BASE_OPENAI_API_KEY:
                 logging.error("Base OpenAI API Key missing. Cannot load base knowledge index.")
                 return
            embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=BASE_OPENAI_API_KEY)
            base_knowledge_index = FAISS.load_local(
                str(BASE_KNOWLEDGE_INDEX_PATH.parent),
                embeddings,
                index_name=BASE_KNOWLEDGE_INDEX_PATH.stem,
                allow_dangerous_deserialization=True
            )
            logging.info(f"Successfully loaded base knowledge index from {BASE_KNOWLEDGE_INDEX_PATH}")
            return # Successfully loaded, no need to rebuild
        except Exception as e:
            logging.error(f"Failed to load base knowledge index from {BASE_KNOWLEDGE_INDEX_PATH}: {e}. Will attempt to rebuild.", exc_info=True)
            base_knowledge_index = None
            # Optionally delete corrupted files:
            # try:
            #     if BASE_KNOWLEDGE_INDEX_PATH.exists(): BASE_KNOWLEDGE_INDEX_PATH.unlink()
            #     pkl_path = BASE_KNOWLEDGE_INDEX_PATH.with_suffix(".pkl")
            #     if pkl_path.exists(): pkl_path.unlink()
            # except OSError as rm_err:
            #     logging.error(f"Failed to delete potentially corrupted index files: {rm_err}")


    if base_knowledge_index is None:
        logging.info("Base knowledge index not found or failed to load. Starting pre-processing...")
        if not BASE_OPENAI_API_KEY:
            logging.error("Cannot pre-process base knowledge: BASE_OPENAI_API_KEY is not set.")
            raise RuntimeError("OpenAI API Key needed for initial base knowledge processing is not configured.")
        if not HF_TOKEN:
            logging.error("Cannot pre-process base knowledge: HF_TOKEN is not set.")
            raise RuntimeError("Hugging Face Token needed for initial base knowledge processing is not configured.")

        individual_indices : List[FAISS] = [] # Store index for each base file

        for file_path in FIXED_FILES:
            try:
                # Process each file individually to get its FAISS index
                # This ensures embedding requests are per-file, not one giant batch
                index = download_and_process_file(file_path, BASE_OPENAI_API_KEY)
                if index:
                     individual_indices.append(index)
                     # Note: document count is now per-file in logs from download_and_process_file
                     logging.info(f"Successfully processed base file: {file_path}")
                else:
                     logging.warning(f"Skipping base file {file_path} due to processing error (returned None index).")

            except Exception as e:
                # If download_and_process_file raises an error (e.g., download failed, API key invalid)
                logging.error(f"Failed processing base file {file_path}: {e}", exc_info=True)
                # Decide whether to stop or continue; let's stop to avoid partial base index
                raise RuntimeError(f"Failed to process base file {file_path}. Cannot create complete base knowledge index.") from e

        if not individual_indices:
            logging.error("No individual indices were successfully created for the base knowledge. Cannot proceed.")
            raise RuntimeError("Failed to process any base files successfully.")

        try:
            logging.info(f"Merging {len(individual_indices)} individual indices into the final base knowledge index...")
            # Start with the first index
            base_knowledge_index = individual_indices[0]
            # Merge the rest
            if len(individual_indices) > 1:
                for index_to_merge in individual_indices[1:]:
                    base_knowledge_index.merge_from(index_to_merge)

            total_vectors = base_knowledge_index.index.ntotal
            logging.info(f"Final base knowledge index created with {total_vectors} total vectors.")

            # Save the final merged index
            base_knowledge_index.save_local(folder_path=str(CACHE_DIR), index_name=BASE_KNOWLEDGE_INDEX_PATH.stem)
            logging.info(f"Successfully saved merged base knowledge index to {BASE_KNOWLEDGE_INDEX_PATH}")

        except Exception as e:
            logging.error(f"Failed to merge individual indices or save the final base knowledge index: {e}", exc_info=True)
            # Set base_knowledge_index back to None so app knows it failed
            base_knowledge_index = None
            raise RuntimeError("Failed to merge or save the final base knowledge index.") from e
# -----------------------------------------------------------------------------
# Gradio Chat Function
# -----------------------------------------------------------------------------

GradioChatMessages = List[Dict[str, str]] # [{'role': 'user', 'content': 'hi'}, ...]

def chat_llm(
    user_email: str,
    user_openai_key: str,
    dynamic_files_str: str,
    question: str,
    history: GradioChatMessages
) -> Tuple[GradioChatMessages, str, str]: # History, Clear Question Box, Status Update
    """
    Gradio callback function. Performs RAG QA for one turn.
    Uses base knowledge + dynamically loaded/cached files.
    """
    status_update = ""
    if not history: history = [] # Initialize history

    # --- Input Validation ---
    if not user_email or not is_valid_email(user_email):
        raise gr.Error("Please enter a valid email address.")
    if not question or not question.strip():
        raise gr.Error("Please enter a question.")

    # Parse and validate dynamic file paths
    dynamic_files = [f.strip() for f in dynamic_files_str.split(',') if f.strip()]
    if len(dynamic_files) > MAX_DYNAMIC_FILES:
        raise gr.Error(f"Please select a maximum of {MAX_DYNAMIC_FILES} dynamic files per session.")
    if dynamic_files and not user_openai_key:
        raise gr.Error("Please provide your OpenAI API Key to process dynamic files.")

    # Log user interaction
    logging.info(f"Chat request from: {user_email}, Dynamic files: {dynamic_files}, Question: '{question[:50]}...'")

    # Use provided key or fallback to base key if available (only if no dynamic files)
    # If dynamic files are present, user_openai_key MUST be used and validated
    api_key_to_use = user_openai_key if dynamic_files else (user_openai_key or BASE_OPENAI_API_KEY)
    if not api_key_to_use:
         raise gr.Error("An OpenAI API Key is required for this operation (either user-provided or pre-configured).")


    session_indices : List[FAISS] = []
    processed_dynamic_files_this_session : List[str] = []
    newly_cached_files: List[str] = []

    # --- Retriever Setup ---
    # 1. Add Base Knowledge
    if base_knowledge_index:
        session_indices.append(base_knowledge_index)
        logging.debug("Added base knowledge index to session.")
    else:
        logging.error("Base knowledge index is not loaded. Cannot proceed.")
        raise gr.Error("Base knowledge index is unavailable. Please check logs.")

    # 2. Process Dynamic Files
    for file_path in dynamic_files:
        try:
            was_cached = file_path in cache_manifest
            dynamic_index = get_or_create_dynamic_index(file_path, api_key_to_use) # Use the determined API key
            if dynamic_index:
                session_indices.append(dynamic_index)
                processed_dynamic_files_this_session.append(file_path)
                if not was_cached: # If it wasn't in the manifest before get_or_create ran
                    newly_cached_files.append(file_path)
            # else: Error handled within get_or_create_dynamic_index by raising gr.Error

        except gr.Error as e:
            # Propagate Gradio errors to UI
             raise e
        except Exception as e:
             logging.error(f"Unexpected error processing dynamic file {file_path}: {e}", exc_info=True)
             raise gr.Error(f"Failed to process dynamic file {file_path}: {e}")


    # --- Combine Indices for Session (if dynamic files were added) ---
    if len(session_indices) > 1 : # Need to merge if dynamic files were added
        try:
            logging.info(f"Merging {len(session_indices)} indices for the session...")
            # Create a temporary merged index for this session.
            # Start from a copy of the first index (the base knowledge) so that
            # merge_from() below does not mutate the shared base_knowledge_index in place.
            session_master_index = FAISS.deserialize_from_bytes(
                session_indices[0].serialize_to_bytes(),
                session_indices[0].embeddings,
                allow_dangerous_deserialization=True,
            )
            # Merge the dynamic-file indices into the session copy
            for index_to_merge in session_indices[1:]:
                session_master_index.merge_from(index_to_merge)
            logging.info(f"Session index created with {session_master_index.index.ntotal} total vectors.")
            session_retriever = session_master_index.as_retriever(search_kwargs={"k": 5})
        except Exception as e:
            logging.error(f"Failed to merge session indices: {e}", exc_info=True)
            raise gr.Error(f"Error creating session knowledge base: {e}")
    elif session_indices: # Only base knowledge was used
         session_retriever = session_indices[0].as_retriever(search_kwargs={"k": 5})
    else:
         # Should have been caught earlier if base_knowledge_index was None
         raise gr.Error("No knowledge base available for retrieval.")


    # --- Setup LLM and RAG Chain ---
    try:
        llm = ChatOpenAI(model=LLM_MODEL, temperature=0.1, api_key=api_key_to_use, max_retries=1)

        template = """You are an assistant specializing in 3GPP technical specifications.
Answer the following question based *only* on the provided context document snippets from the specified files.
The context comes from the base knowledge files and potentially these user-provided files: {dynamic_files_list_str}
If the answer is not found in the context, state that you cannot answer based on the provided information. Be concise and accurate.

Context:
{context}

Question:
{question}

Answer:"""
        prompt = ChatPromptTemplate.from_template(template)

        # Function to format retrieved documents
        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        # RAG Chain
        rag_chain = (
            {"context": session_retriever | format_docs,
             "question": RunnablePassthrough(),
             "dynamic_files_list_str" : lambda x: ", ".join(dynamic_files) or "None"} # Pass dynamic files for context
            | prompt
            | llm
            | StrOutputParser()
        )
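        # The retriever's documents are concatenated into {context}, the raw question
        # fills {question}, and the dynamic-file list is stitched into the prompt header.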

        logging.info(f"Invoking RAG chain for question: '{question[:50]}...'")
        answer = rag_chain.invoke(question)
        logging.info(f"Received answer: '{answer[:100]}...'")

        # Update user data
        if user_email not in user_data: user_data[user_email] = []
        updated_files_for_user = set(user_data[user_email]) | set(processed_dynamic_files_this_session)
        user_data[user_email] = sorted(list(updated_files_for_user))
        save_user_data() # Save after successful interaction

        # Prepare status update message
        if newly_cached_files:
            status_update = f"Info: The following new files were processed and cached for future use: {', '.join(newly_cached_files)}."

    except Exception as e:
        logging.error(f"Error during RAG chain execution or user data update: {e}", exc_info=True)
        # Append error to chat instead of crashing
        history.append({"role": "user", "content": question})
        history.append({"role": "assistant", "content": f"An error occurred: {e}"})
        return history, question, "Error occurred. Check logs." # Keep question in box

    # --- Update History and Return ---
    history.append({"role": "user", "content": question})
    history.append({"role": "assistant", "content": answer})

    return history, "", status_update # Clear question box, provide status


# -----------------------------------------------------------------------------
# Gradio UI Definition
# -----------------------------------------------------------------------------

# --- UI Text Blocks ---

# Load persisted state before building the UI text below. This module-level code
# runs before the __main__ block, so the manifest must already be populated here,
# otherwise the "Available Cached Files" list always shows "None yet."
load_user_data()
load_cache_manifest()

# Construct the cached files list string separately
sorted_keys = sorted(cache_manifest.keys())
if sorted_keys:
    # Format each key as a markdown bullet point with backticks
    formatted_items = [f"*   `{key}`" for key in sorted_keys]
    # Join them with newlines
    file_list_str = "\n".join(formatted_items)
else:
    file_list_str = "*   None yet." # Message when no files are cached

# Now define the info string using the pre-formatted list
cached_files_info = f"""
**Available Cached Files:**
The following dynamically added files have already been processed and cached:
{file_list_str}
"""

# --- The rest of the UI text blocks (disclaimer_text, base_knowledge_info) remain the same ---
disclaimer_text = f"""
**Disclaimer & Usage Notes:**
*   **Research Preview:** This is a demonstration application for research purposes. Accuracy is not guaranteed.
*   **License:** By using this application, you agree to the terms and license of the underlying dataset (`{DATASET_ID}`). Please review the dataset's license terms on Hugging Face Hub.
*   **API Keys:** Your OpenAI API key is required to process *new* documents you specify. It is used only during your session (for embedding new documents and for generating answers) and is not stored persistently by this application.
*   **Caching:** Processed dynamic files are cached locally (embeddings only) to speed up future sessions.
*   **Estimated Cost:** Processing *new* files incurs OpenAI API costs (roughly ${ESTIMATED_COST_PER_FILE_CENTS / 100:.2f} per file for `{EMBEDDING_MODEL}`). Already cached files and the base knowledge incur no new embedding cost, although each question still makes one `{LLM_MODEL}` call.
*   **Data:** Your email is logged along with the files you process for usage tracking. See `{USER_DATA_PATH.name}`.
"""

base_knowledge_info = f"""
**Base Knowledge:**
The chatbot always has access to the following pre-processed 3GPP specification files:
*   `{FIXED_FILES[0]}`
*   `{FIXED_FILES[1]}`
*   `{FIXED_FILES[2]}`
*   `{FIXED_FILES[3]}`
"""


# --- Build UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="3GPP TSpec RAG Assistant") as demo:
    gr.Markdown("# πŸ“„ 3GPP TSpec RAG Assistant")

    with gr.Row():
        # --- Left Column (Chat Interface) ---
        with gr.Column(scale=7): # 70% width
            chatbot = gr.Chatbot(
                label="Chat Session",
                height=600,
                type="messages",
                show_copy_button=True,
            )
            question_inp = gr.Textbox(
                label="Your Question",
                placeholder="Ask a question about the selected documents...",
                lines=3
            )
            status_out = gr.Textbox(label="Status Updates", interactive=False)

        # --- Right Column (Controls & Info) ---
        with gr.Column(scale=3): # 30% width
            gr.Markdown("### Session Configuration")
            email_inp = gr.Textbox(label="Your Email Address", placeholder="Enter your email...")
            openai_key_inp = gr.Textbox(
                label="Your OpenAI API Key (Required for new files)",
                placeholder="Enter your OpenAI API key (sk-...)",
                type="password"
            )
            dynamic_files_inp = gr.Textbox(
                label=f"Dynamic Files (Optional, max {MAX_DYNAMIC_FILES}, comma-separated)",
                placeholder="e.g., Rel-17/23_series/23501-h50.md, Rel-18/...",
                lines=3
            )
            ask_btn = gr.Button("Ask Question", variant="primary")

            with gr.Accordion("Usage Information & Disclaimers", open=False):
                 gr.Markdown(disclaimer_text)
            with gr.Accordion("Base Knowledge Files", open=False):
                 gr.Markdown(base_knowledge_info)
            with gr.Accordion("Cached Dynamic Files", open=True):
                 # Use an HTML component to allow dynamic updates if needed later
                 # For now, just display the initial list
                 # cached_list_html = gr.HTML(value=f"<ul><li>{ '</li><li>'.join(sorted(list(cache_manifest.keys()))) or 'None' }</li></ul>")
                 # Simpler Markdown display:
                 cached_list_md = gr.Markdown(cached_files_info)


    # --- Event Handling ---
    ask_btn.click(
        fn=chat_llm,
        inputs=[email_inp, openai_key_inp, dynamic_files_inp, question_inp, chatbot],
        outputs=[chatbot, question_inp, status_out] # Update chat, clear question, show status
    )

    # Example Button (Optional - might be less useful with dynamic files)
    # gr.Examples(...)


# -----------------------------------------------------------------------------
# Application Entry Point & Initial Setup
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    print("Starting application setup...")
    # 1. User data and cache manifest were loaded at import time (before the UI text was built)
    print(f"Found {len(cache_manifest)} cached files.")

    # 2. Ensure base knowledge index is ready
    print("Checking base knowledge index...")
    try:
        preprocess_base_knowledge()
        print("Base knowledge index is ready.")
    except Exception as e:
         print(f"\n!!! CRITICAL ERROR during base knowledge setup: {e} !!!")
         print("The application cannot start without the base knowledge index.")
         print("Please ensure BASE_OPENAI_API_KEY and HF_TOKEN are correctly set in your environment or .env file and you have accepted the dataset license.")
         # Exit if base knowledge failed critically
         import sys
         sys.exit(1)

    # 3. Launch Gradio App
    print("Launching Gradio interface...")
    demo.launch(debug=True, mcp_server=True) # debug=True for detailed logs locally