from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from huggingface_hub import login
import transformers
from sentence_transformers import SentenceTransformer
from datasets import Dataset, Features, Value, Sequence
import faiss  # must be installed for Dataset.add_faiss_index
import os
import torch
import gradio as gr

ST_MODEL = "LazarusNLP/all-indo-e5-small-v4"
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
DOMAIN_DATA_DIR = "./data"
CACHE_DIR = "./cache"

# System prompt (in Indonesian): the assistant works for an electricity provider (PLN),
# answers questions about sexual harassment, and must reply briefly in Bahasa Indonesia
# using the retrieved context.
SYS_MSG = """
Kamu adalah asisten dalam sebuah perusahaan penyedia listrik (PLN) yang membantu menjawab pertanyaan seputar 'sexual harassment' dalam Bahasa Indonesia.
Jawab dengan singkat menggunakan konteks untuk menjawab pertanyaan dalam Bahasa Indonesia.
"""

# LOGIN HF Auth
# Read the API token from an environment variable (e.g. stored in secrets)
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Authenticate manually with the token
login(token=hf_token)

# ----------------------------------------------------------------------------------------------------------
# RAG PROCESS
TOP_K = 1

# Collect every .txt file from the domain data directory
domain_data = [os.path.join(DOMAIN_DATA_DIR, f) for f in os.listdir(DOMAIN_DATA_DIR) if f.endswith('.txt')]

pages = []
for file in domain_data:
    text_loader = TextLoader(file)
    file_pages = text_loader.load()
    pages.extend(file_pages)

# Split the loaded documents into overlapping chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=64,
    separators=["\n\n"]
)
documents = splitter.split_documents(pages)
content = [doc.page_content.strip() for doc in documents]

# Embed every chunk with the sentence-transformer model
ST = SentenceTransformer(ST_MODEL)
embeddings = ST.encode(content)

# Store chunks and embeddings in a datasets.Dataset and build a FAISS index over the embeddings
features = Features({
    'text': Value('string'),
    'embeddings': Sequence(Value('float32'))
})
data = {'text': content, 'embeddings': [embedding.tolist() for embedding in embeddings]}
dataset = Dataset.from_dict(data, features=features)
dataset.add_faiss_index(column='embeddings')


def retrieve(query, top_k=3):
    """Return the FAISS scores and the top_k most similar text chunks for a query."""
    query_embedding = ST.encode([query])  # shape (1, dim) is accepted by get_nearest_examples
    scores, retrieved_examples = dataset.get_nearest_examples('embeddings', query_embedding, k=top_k)
    return scores, retrieved_examples['text']
# END RAG
# ----------------------------------------------------------------------------------------------------------

# LLM
# Use 4-bit quantization to lower GPU memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, cache_dir=CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    # device_map="auto",
    quantization_config=bnb_config,
    cache_dir=CACHE_DIR
)

# Text-generation pipeline around the quantized model;
# chat_function below uses it for both chat templating and generation
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


def format_prompt(prompt, retrieved_documents, k):
    """Build the user prompt by appending the k retrieved context chunks to the question."""
    PROMPT = f"Pertanyaan:{prompt}\nKonteks:"
    for idx in range(k):
        PROMPT += f"{retrieved_documents[idx]}\n"
    return PROMPT


def chat_function(message, history, max_new_tokens=256, temperature=0.6):
    """Retrieve context for the user message, build a chat prompt, and generate an answer."""
    _, retrieved_doc = retrieve(message, TOP_K)
    formatted_prompt = format_prompt(message, retrieved_doc, TOP_K)
    messages = [
        {"role": "system", "content": SYS_MSG},
        {"role": "user", "content": formatted_prompt},
    ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    print(f"Prompt: {prompt}\n")
    # Stop on either the regular EOS token or Llama 3's end-of-turn token
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
    )
    # Strip the prompt from the generated text and return only the answer
    return outputs[0]["generated_text"][len(prompt):]


demo = gr.ChatInterface(
    chat_function,
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    chatbot=gr.Chatbot(height=400),
)

if __name__ == "__main__":
    demo.launch(share=True)