hieu-nguyen2208 committed on
Commit
7d0be36
1 Parent(s): bf62ebb
Files changed (2)
  1. app.py +14 -0
  2. src/generation/llm.py +21 -22
app.py CHANGED
@@ -1,5 +1,19 @@
 import gradio as gr
 from src.chatbot import RestaurantChatbot
+import subprocess
+
+command = [
+    "pip",
+    "install",
+    "git+https://github.com/huggingface/transformers.git@096f25ae1f501a084d8ff2dcaf25fbc2bd60eba4"
+]
+
+# Run the command and print stdout/stderr if needed
+result = subprocess.run(command, capture_output=True, text=True)
+
+# Print the results
+print("STDOUT:", result.stdout)
+print("STDERR:", result.stderr)
 
 chatbot = RestaurantChatbot()
 chat_history = []
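Editorial note on the change above: the pip install runs at import time and its outcome is only printed, so a broken install would only surface later as a model-loading failure. A minimal sketch of a fail-fast variant (an illustration, not part of the commit; it also calls pip through sys.executable so the install targets the running interpreter):

```python
import subprocess
import sys

# Same pinned transformers revision as app.py, installed into the current interpreter
command = [
    sys.executable, "-m", "pip", "install",
    "git+https://github.com/huggingface/transformers.git@096f25ae1f501a084d8ff2dcaf25fbc2bd60eba4",
]

result = subprocess.run(command, capture_output=True, text=True)
print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)
if result.returncode != 0:
    # Fail fast instead of letting the missing dependency break model loading later
    raise RuntimeError("transformers install from the pinned revision failed")
```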
src/generation/llm.py CHANGED
@@ -2,9 +2,10 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from langchain_core.prompts import PromptTemplate
 import os
 from typing import List
+import torch
 
 class LLM:
-    def __init__(self, model_repo: str = "Qwen/Qwen2-1.5B-Instruct",
+    def __init__(self, model_repo: str = "microsoft/bitnet-b1.58-2B-4T",
                  local_path: str = "models"):
         """
         Initialize the LLM with Qwen2-1.5B-Instruct using Hugging Face Transformers.
@@ -17,19 +18,14 @@ class LLM:
 
         try:
             # Load the model
-            self.llm = AutoModelForCausalLM.from_pretrained(
-                model_repo,
-                device_map="auto",  # Automatically map to CPU
-                cache_dir=local_path,
-                trust_remote_code=True
-            )
-
-            # Load the tokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_repo,
-                cache_dir=local_path,
-                trust_remote_code=True
+            model_id = "microsoft/bitnet-b1.58-2B-4T"
+            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.bfloat16
             )
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            self.model.to(self.device)
             print(f"Model successfully loaded from {model_repo}")
         except Exception as e:
             raise RuntimeError(
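The pinned transformers revision installed in app.py and the new loading path above go together: microsoft/bitnet-b1.58-2B-4T is a recent architecture that an older released transformers build may not recognise. A hedged, standalone sketch of a preflight check (not part of the commit) that reports a clear error before LLM() attempts the full bfloat16 load:

```python
# Preflight sketch: only the small config file is fetched, so this is cheap.
import transformers
from transformers import AutoConfig

model_id = "microsoft/bitnet-b1.58-2B-4T"
try:
    AutoConfig.from_pretrained(model_id)
    print(f"transformers {transformers.__version__} recognises {model_id}")
except (ValueError, KeyError) as err:
    # Typically raised when the installed build does not know this model type
    raise RuntimeError(
        f"transformers {transformers.__version__} cannot resolve {model_id}; "
        "install the git revision pinned in app.py"
    ) from err
```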
 
 
@@ -88,19 +84,22 @@ class LLM:
             messages, tokenize=False, add_generation_prompt=True
         )
         # Tokenize input prompt
-            inputs = self.tokenizer(prompt_with_template, return_tensors="pt").to(self.llm.device)
+            inputs = self.tokenizer(prompt_with_template, return_tensors="pt").to(self.device)
             # Generate text
-            outputs = self.llm.generate(
-                **inputs,
-                max_new_tokens=max_length,
-                temperature=0.7,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id,
-            )
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=max_length,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id,
+            )
             # Decode the generated tokens
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             print("Response generated successfully!")
-            return response.split('assistant')[2]
+            # Adjust response parsing to handle the output structure
+            if "assistant" in response:
+                return response.split("assistant")[-1].strip()
+            return response
         except Exception as e:
             raise RuntimeError(f"Failed to generate response: {str(e)}")
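The new parsing branch is safer than the old `return response.split('assistant')[2]`, which raises IndexError when the decoded text contains fewer than two occurrences of "assistant", but splitting on that word is still fragile because it can appear in the user's message or in the reply itself. A hedged alternative, sketched as a hypothetical standalone helper (names and signature are illustrative, not from the repo), is to decode only the tokens generated after the prompt:

```python
from typing import List

import torch
from transformers import PreTrainedModel, PreTrainedTokenizer


def generate_reply(model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
                   messages: List[dict], max_new_tokens: int = 256) -> str:
    """Return only the newly generated text, with no string splitting."""
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Slice off the prompt tokens and decode only what the model produced
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
```

The same slice-then-decode approach would drop into the class's generation method using self.model, self.tokenizer and self.device.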