# Retrieval setup for the chatbot: load the domain text files, split them
# into chunks, embed the chunks and index them with FAISS.

import os

# Third-party imports keep their original relative order so import-time
# behaviour is unchanged; the two imports that used to sit mid-file are
# appended at the end of the group.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_community.document_loaders import TextLoader
from huggingface_hub import InferenceClient
import transformers
from sentence_transformers import SentenceTransformer
from datasets import Dataset, Features, Value, Sequence
import pandas as pd
import faiss
import torch
import gradio as gr
from huggingface_hub import login
from langchain.text_splitter import RecursiveCharacterTextSplitter

ST_MODEL = "LazarusNLP/all-indo-e5-small-v4"
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
DOMAIN_DATA_DIR = "./data"
# System prompt sent to the model; the literal is kept verbatim.
SYS_MSG = """ Kamu adalah asisten dalam sebuah perusahaan penyedia listrik (PLN) yang membantu menjawab pertanyaan seputar 'sexual harassment' dalam Bahasa Indonesia. Jawab dengan singkat menggunakan konteks untuk menjawab pertanyaan dalam Bahasa Indonesia. """
# Number of retrieved chunks placed into the prompt.
TOP_K = 1

hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
# Authenticate manually using the token.
# NOTE(review): hf_token is None when the environment variable is unset --
# confirm that the resulting login() behaviour is acceptable for deployment.
login(token=hf_token)

# Collect every .txt file in the data directory. Sorting makes the order of
# the indexed chunks deterministic (os.listdir returns names in arbitrary
# order).
domain_data = [
    os.path.join(DOMAIN_DATA_DIR, f)
    for f in sorted(os.listdir(DOMAIN_DATA_DIR))
    if f.endswith('.txt')
]

pages = []
for file in domain_data:
    pages.extend(TextLoader(file).load())

# Split only on blank lines ("\n\n"), i.e. at paragraph boundaries.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=64,
    separators=["\n\n"],
)
documents = splitter.split_documents(pages)
content = [doc.page_content.strip() for doc in documents]

# Embed every chunk with the sentence-transformer model.
ST = SentenceTransformer(ST_MODEL)
embeddings = ST.encode(content)

features = Features({
    'text': Value('string'),
    'embeddings': Sequence(Value('float32')),
})
data = {
    'text': content,
    'embeddings': [embedding.tolist() for embedding in embeddings],
}
dataset = Dataset.from_dict(data, features=features)
dataset.add_faiss_index(column='embeddings')


def retrieve(query, top_k=3):
    """Return the indexed chunks nearest to *query*.

    Args:
        query: Text to embed with the sentence-transformer model ``ST``.
        top_k: Number of nearest chunks requested from the FAISS index.

    Returns:
        A ``(scores, texts)`` pair: the scores reported by the index lookup
        and the ``text`` field of the matching chunks.
    """
    query_embedding = ST.encode([query])
    scores, retrieved_examples = dataset.get_nearest_examples(
        'embeddings', query_embedding, k=top_k
    )
    return scores, retrieved_examples['text']
# Use quantization to lower GPU usage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)

# `pipeline` imported from transformers is a factory function, not a pipeline
# object: it has no `.tokenizer` attribute and cannot generate text itself.
# Build the actual text-generation pipeline from the model and tokenizer
# loaded above.
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


def format_prompt(prompt, retrieved_documents, k):
    """Build the user message from the question and the retrieved context.

    Args:
        prompt: The user's question.
        retrieved_documents: Sequence of retrieved text chunks.
        k: Maximum number of chunks to include.

    Returns:
        The question under a "Pertanyaan:" label, then a "Konteks:" label
        followed by the first ``k`` chunks, each terminated by a newline.
    """
    # Slice rather than index over range(k): when fewer than k chunks were
    # retrieved this includes what is available instead of raising
    # IndexError. max() keeps a non-positive k meaning "no context".
    context = "".join(f"{doc}\n" for doc in retrieved_documents[:max(k, 0)])
    return f"Pertanyaan:{prompt}\nKonteks:{context}"


def chat_function(message, history, max_new_tokens=256, temperature=0.6):
    """Answer *message* using retrieved context and the language model.

    Args:
        message: The user's latest message.
        history: Accepted as the second positional argument of the chat
            callback; not used by this function.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature before the fixed offset below.

    Returns:
        The generated text with the prompt prefix removed.
    """
    _scores, retrieved_doc = retrieve(message, TOP_K)
    formatted_prompt = format_prompt(message, retrieved_doc, TOP_K)
    messages = [
        {"role": "system", "content": SYS_MSG},
        {"role": "user", "content": formatted_prompt},
    ]
    prompt = text_generator.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
    terminators = [
        text_generator.tokenizer.eos_token_id,
        text_generator.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    outputs = text_generator(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        # NOTE(review): the +0.1 offset presumably keeps the temperature
        # strictly positive when a caller passes 0 -- confirm the intent.
        temperature=temperature + 0.1,
        top_p=0.9,
    )
    # The generated text starts with the prompt; return only what follows.
    return outputs[0]["generated_text"][len(prompt):]


# For information on how to customize the ChatInterface, peruse the gradio
# docs: https://www.gradio.app/docs/chatinterface
demo = gr.ChatInterface(
    chat_function,
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    chatbot=gr.Chatbot(height=400),
)

if __name__ == "__main__":
    demo.launch(share=True)