hieu-nguyen2208 committed on
Commit 6839a40 · 1 Parent(s): 4c43261
Files changed (2)
  1. app.py +0 -7
  2. src/generation/llm.py +22 -21
app.py CHANGED
@@ -1,12 +1,5 @@
 import gradio as gr
 from src.chatbot import RestaurantChatbot
-import subprocess
-
-command = [
-    "pip",
-    "install",
-    "git+https://github.com/huggingface/transformers.git@096f25ae1f501a084d8ff2dcaf25fbc2bd60eba4"
-]
 
 # Run the command and print stdout/stderr if needed
 result = subprocess.run(command, capture_output=True, text=True)
 
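Note: after this change app.py still ends with result = subprocess.run(command, capture_output=True, text=True) even though both import subprocess and the command list were removed, so importing the module now raises a NameError. Either those two leftover lines should be dropped as well, or, if installing the pinned transformers revision at startup is still wanted, a guarded form could be used. A minimal sketch, with the sys.executable invocation and the return-code check as suggestions rather than part of this commit:

import subprocess
import sys

import gradio as gr
from src.chatbot import RestaurantChatbot

# Install the pinned transformers revision with the same interpreter that runs the app.
command = [
    sys.executable, "-m", "pip", "install",
    "git+https://github.com/huggingface/transformers.git@096f25ae1f501a084d8ff2dcaf25fbc2bd60eba4",
]

# Run the command and surface stderr only if the install failed.
result = subprocess.run(command, capture_output=True, text=True)
if result.returncode != 0:
    print(result.stderr)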
src/generation/llm.py CHANGED
@@ -2,10 +2,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from langchain_core.prompts import PromptTemplate
 import os
 from typing import List
-import torch
 
 class LLM:
-    def __init__(self, model_repo: str = "microsoft/bitnet-b1.58-2B-4T",
+    def __init__(self, model_repo: str = "Qwen/Qwen2-1.5B-Instruct",
                  local_path: str = "models"):
         """
         Initialize the LLM with Qwen2-1.5B-Instruct using Hugging Face Transformers.
@@ -18,14 +17,19 @@ class LLM:
 
         try:
             # Load the model
-            model_id = "microsoft/bitnet-b1.58-2B-4T"
-            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                torch_dtype=torch.bfloat16
+            self.llm = AutoModelForCausalLM.from_pretrained(
+                model_repo,
+                device_map="auto",  # Automatically map to CPU
+                cache_dir=local_path,
+                trust_remote_code=True
+            )
+
+            # Load the tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_repo,
+                cache_dir=local_path,
+                trust_remote_code=True
             )
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            self.model.to(self.device)
             print(f"Model successfully loaded from {model_repo}")
         except Exception as e:
             raise RuntimeError(
@@ -84,22 +88,19 @@
                 messages, tokenize=False, add_generation_prompt=True
             )
             # Tokenize input prompt
-            inputs = self.tokenizer(prompt_with_template, return_tensors="pt").to(self.device)
+            inputs = self.tokenizer(prompt_with_template, return_tensors="pt").to(self.llm.device)
             # Generate text
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=max_length,
-                temperature=0.7,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id,
-            )
+            outputs = self.llm.generate(
+                **inputs,
+                max_new_tokens=max_length,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id,
+            )
             # Decode the generated tokens
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             print("Response generated successfully!")
-            # Adjust response parsing to handle the output structure
-            if "assistant" in response:
-                return response.split("assistant")[-1].strip()
-            return response
+            return response.split('assistant')[2]
         except Exception as e:
             raise RuntimeError(f"Failed to generate response: {str(e)}")
 
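Two notes on the new loading path. First, the inline comment on device_map="auto" says "Automatically map to CPU", but "auto" actually places the model on whatever devices accelerate detects (GPU first, then CPU). Second, passing any device_map requires the accelerate package; without it, from_pretrained raises an error rather than falling back to CPU. A minimal, hedged sketch of the same load with an explicit fallback; MODEL_REPO, CACHE_DIR and the fallback logic are illustrative, not part of the commit:

import importlib.util

from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_REPO = "Qwen/Qwen2-1.5B-Instruct"
CACHE_DIR = "models"

# device_map="auto" needs the accelerate package; fall back to a plain (CPU) load
# so the app can still start when accelerate is not installed.
extra_kwargs = {"device_map": "auto"} if importlib.util.find_spec("accelerate") else {}

model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    cache_dir=CACHE_DIR,
    trust_remote_code=True,
    **extra_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_REPO,
    cache_dir=CACHE_DIR,
    trust_remote_code=True,
)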
106