muryshev committed
Commit 49da8ab · 1 Parent(s): 03ee07e
Files changed (2)
  1. app.py +8 -9
  2. llm_backend.py +4 -7
app.py CHANGED
@@ -16,12 +16,11 @@ llm = LlmBackend()
 _lock = threading.Lock()
 
 SYSTEM_PROMPT = os.environ.get('SYSTEM_PROMPT') or "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык."
-CONTEXT_SIZE = os.environ.get('CONTEXT_SIZE') or 500
+CONTEXT_SIZE = int(os.environ.get('CONTEXT_SIZE', '500'))
 HF_CACHE_DIR = os.environ.get('HF_CACHE_DIR') or '/home/user/app/.cache'
-USE_SYSTEM_PROMPT = os.environ.get('USE_SYSTEM_PROMPT') or False
-ENABLE_GPU = os.environ.get('ENABLE_GPU') or False
-GPU_LAYERS = os.environ.get('GPU_LAYERS') or 0
-N_GQA = os.environ.get('N_GQA') or None #must be set to 8 for 70b models
+USE_SYSTEM_PROMPT = os.environ.get('USE_SYSTEM_PROMPT', '').lower() == "true" or False
+ENABLE_GPU = os.environ.get('ENABLE_GPU', '').lower() == "true" or False
+GPU_LAYERS = int(os.environ.get('GPU_LAYERS', '0'))
 CHAT_FORMAT = os.environ.get('CHAT_FORMAT') or 'llama-2'
 REPO_NAME = os.environ.get('REPO_NAME') or 'IlyaGusev/saiga2_7b_gguf'
 MODEL_NAME = os.environ.get('MODEL_NAME') or 'model-q4_K.gguf'
@@ -154,7 +153,7 @@ def generate_response():
     return Response(generate_and_log_tokens(user_request='1', generator=generator), content_type='text/plain', status=200, direct_passthrough=True)
 
 def init_model():
-    llm.load_model(model_path=MODEL_PATH, context_size=CONTEXT_SIZE, enable_gpu=ENABLE_GPU, gpu_layer_number=GPU_LAYERS, n_gqa=N_GQA)
+    llm.load_model(model_path=MODEL_PATH, context_size=CONTEXT_SIZE, enable_gpu=ENABLE_GPU, gpu_layer_number=GPU_LAYERS)
 
 # Function to check if no requests were made in the last 5 minutes
 def check_last_request_time():
@@ -171,9 +170,9 @@ if __name__ == "__main__":
 
     init_model()
 
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(check_last_request_time, trigger='interval', minutes=1)
-    scheduler.start()
+    # scheduler = BackgroundScheduler()
+    # scheduler.add_job(check_last_request_time, trigger='interval', minutes=1)
+    # scheduler.start()
 
     app.run(host="0.0.0.0", port=7860, debug=True, threaded=True)
 
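For context, a minimal sketch of how the new environment-variable parsing in app.py behaves. The variable names come straight from the diff; the values set below are hypothetical, chosen only to illustrate the conversions. (The trailing "or False" on the boolean lines in the commit is redundant, since the comparison against "true" already yields a bool.)

import os

# Hypothetical values, for illustration only
os.environ['CONTEXT_SIZE'] = '4096'
os.environ['ENABLE_GPU'] = 'True'

# Same parsing expressions as in the commit (minus the redundant "or False")
CONTEXT_SIZE = int(os.environ.get('CONTEXT_SIZE', '500'))        # 4096 here; 500 when the variable is unset
ENABLE_GPU = os.environ.get('ENABLE_GPU', '').lower() == "true"  # True; any other value or unset gives False
GPU_LAYERS = int(os.environ.get('GPU_LAYERS', '0'))              # 0, since GPU_LAYERS is unset

print(CONTEXT_SIZE, ENABLE_GPU, GPU_LAYERS)  # 4096 True 0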
llm_backend.py CHANGED
@@ -34,14 +34,13 @@ class LlmBackend:
     def is_model_loaded(self):
         return self._model is not None
 
-    def load_model(self, model_path, context_size=2000, enable_gpu=True, gpu_layer_number=35, n_gqa=8, chat_format='llama-2'):
+    def load_model(self, model_path, context_size=2000, enable_gpu=True, gpu_layer_number=35, chat_format='llama-2'):
         log.info('load_model - started')
         self._model_params = {}
         self._model_params['model_path'] = model_path
         self._model_params['context_size'] = context_size
         self._model_params['enable_gpu'] = enable_gpu
         self._model_params['gpu_layer_number'] = gpu_layer_number
-        self._model_params['n_gqa'] = n_gqa
         self._model_params['chat_format'] = chat_format
 
         if self._model is not None:
@@ -57,9 +56,8 @@ class LlmBackend:
             #n_batch=100,
             logits_all=True,
             #n_threads=12,
-            verbose=False,
-            n_gpu_layers=gpu_layer_number,
-            n_gqa=n_gqa #must be set for 70b models
+            verbose=True,
+            n_gpu_layers=gpu_layer_number
         )
         log.info('load_model - finished')
         return self._model
@@ -72,8 +70,7 @@ class LlmBackend:
             #n_batch=100,
             logits_all=True,
             #n_threads=12,
-            verbose=False,
-            n_gqa=n_gqa #must be set for 70b models
+            verbose=True
         )
         log.info('load_model - finished')
         return self._model
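A quick usage sketch of the revised load_model signature, mirroring how init_model in app.py now calls it after the n_gqa removal. The path and values below are placeholders rather than values from the commit; dropping n_gqa is consistent with GGUF models carrying the grouped-query-attention configuration in their metadata, so it no longer needs to be passed explicitly.

from llm_backend import LlmBackend

llm = LlmBackend()

# Placeholder path and settings; in app.py these come from MODEL_PATH and the
# CONTEXT_SIZE / ENABLE_GPU / GPU_LAYERS environment variables.
model = llm.load_model(
    model_path='/home/user/app/.cache/model-q4_K.gguf',
    context_size=500,
    enable_gpu=False,
    gpu_layer_number=0,
    chat_format='llama-2',
)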