hieu-nguyen2208 committed on
Commit 6839a40 · 1 Parent(s): 4c43261
Files changed (2)
  1. app.py +0 -7
  2. src/generation/llm.py +22 -21
app.py CHANGED
@@ -1,12 +1,5 @@
 import gradio as gr
 from src.chatbot import RestaurantChatbot
-import subprocess
-
-command = [
-    "pip",
-    "install",
-    "git+https://github.com/huggingface/transformers.git@096f25ae1f501a084d8ff2dcaf25fbc2bd60eba4"
-]
 
 # Run the command and print stdout/stderr if needed
 result = subprocess.run(command, capture_output=True, text=True)
 
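Note: after this change app.py still ends with result = subprocess.run(command, capture_output=True, text=True) even though both import subprocess and the command list were removed, so importing the module now raises a NameError. Either those two leftover lines should be dropped as well, or, if installing the pinned transformers revision at startup is still wanted, a guarded form could be used. A minimal sketch, with the sys.executable invocation and the return-code check as suggestions rather than part of this commit:

import subprocess
import sys

import gradio as gr
from src.chatbot import RestaurantChatbot

# Install the pinned transformers revision with the same interpreter that runs the app.
command = [
    sys.executable, "-m", "pip", "install",
    "git+https://github.com/huggingface/transformers.git@096f25ae1f501a084d8ff2dcaf25fbc2bd60eba4",
]

# Run the command and surface stderr only if the install failed.
result = subprocess.run(command, capture_output=True, text=True)
if result.returncode != 0:
    print(result.stderr)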
src/generation/llm.py CHANGED
@@ -2,10 +2,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from langchain_core.prompts import PromptTemplate
 import os
 from typing import List
-import torch
 
 class LLM:
-    def __init__(self, model_repo: str = "microsoft/bitnet-b1.58-2B-4T",
+    def __init__(self, model_repo: str = "Qwen/Qwen2-1.5B-Instruct",
                  local_path: str = "models"):
         """
         Initialize the LLM with Qwen2-1.5B-Instruct using Hugging Face Transformers.
@@ -18,14 +17,19 @@ class LLM:
 
         try:
             # Load the model
-            model_id = "microsoft/bitnet-b1.58-2B-4T"
-            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                torch_dtype=torch.bfloat16
+            self.llm = AutoModelForCausalLM.from_pretrained(
+                model_repo,
+                device_map="auto",  # Automatically map to CPU
+                cache_dir=local_path,
+                trust_remote_code=True
+            )
+
+            # Load the tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_repo,
+                cache_dir=local_path,
+                trust_remote_code=True
             )
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            self.model.to(self.device)
             print(f"Model successfully loaded from {model_repo}")
         except Exception as e:
             raise RuntimeError(
@@ -84,22 +88,19 @@
                 messages, tokenize=False, add_generation_prompt=True
             )
             # Tokenize input prompt
-            inputs = self.tokenizer(prompt_with_template, return_tensors="pt").to(self.device)
+            inputs = self.tokenizer(prompt_with_template, return_tensors="pt").to(self.llm.device)
             # Generate text
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=max_length,
-                temperature=0.7,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id,
-            )
+            outputs = self.llm.generate(
+                **inputs,
+                max_new_tokens=max_length,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id,
+            )
             # Decode the generated tokens
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             print("Response generated successfully!")
-            # Adjust response parsing to handle the output structure
-            if "assistant" in response:
-                return response.split("assistant")[-1].strip()
-            return response
+            return response.split('assistant')[2]
         except Exception as e:
             raise RuntimeError(f"Failed to generate response: {str(e)}")
 
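Two notes on the new loading path. First, the inline comment on device_map="auto" says "Automatically map to CPU", but "auto" actually places the model on whatever devices accelerate detects (GPU first, then CPU). Second, passing any device_map requires the accelerate package; without it, from_pretrained raises an error rather than falling back to CPU. A minimal, hedged sketch of the same load with an explicit fallback; MODEL_REPO, CACHE_DIR and the fallback logic are illustrative, not part of the commit:

import importlib.util

from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_REPO = "Qwen/Qwen2-1.5B-Instruct"
CACHE_DIR = "models"

# device_map="auto" needs the accelerate package; fall back to a plain (CPU) load
# so the app can still start when accelerate is not installed.
extra_kwargs = {"device_map": "auto"} if importlib.util.find_spec("accelerate") else {}

model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    cache_dir=CACHE_DIR,
    trust_remote_code=True,
    **extra_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_REPO,
    cache_dir=CACHE_DIR,
    trust_remote_code=True,
)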
106