Commit 45501f4 (unverified) committed by zetavg · 2 Parent(s): 91cb7fd a1771a7

Merge branch 'main' into hf-ui-demo

LLaMA_LoRA.ipynb CHANGED
@@ -6,7 +6,6 @@
6
  "provenance": [],
7
  "private_outputs": true,
8
  "toc_visible": true,
9
- "authorship_tag": "ABX9TyMHMc4PwWLbRlhFol+WRzoT",
10
  "include_colab_link": true
11
  },
12
  "kernelspec": {
@@ -27,13 +26,13 @@
27
  "colab_type": "text"
28
  },
29
  "source": [
30
- "<a href=\"https://colab.research.google.com/github/zetavg/LLaMA-LoRA/blob/main/LLaMA_LoRA.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
31
  ]
32
  },
33
  {
34
  "cell_type": "markdown",
35
  "source": [
36
- "# 🦙🎛️ LLaMA-LoRA\n",
37
  "\n",
38
  "TL;DR: **Runtime > Run All** (`⌘/Ctrl+F9`). Takes about 5 minutes to start. You will be promped to authorize Google Drive access."
39
  ],
@@ -55,6 +54,26 @@
55
  "execution_count": null,
56
  "outputs": []
57
  },
58
  {
59
  "cell_type": "markdown",
60
  "source": [
@@ -72,9 +91,9 @@
72
  "# @title Git/Project { display-mode: \"form\", run: \"auto\" }\n",
73
  "# @markdown Project settings.\n",
74
  "\n",
75
- "# @markdown The URL of the LLaMA-LoRA project<br>&nbsp;&nbsp;(default: `https://github.com/zetavg/llama-lora.git`):\n",
76
- "llama_lora_project_url = \"https://github.com/zetavg/llama-lora.git\" # @param {type:\"string\"}\n",
77
- "# @markdown The branch to use for LLaMA-LoRA project:\n",
78
  "llama_lora_project_branch = \"main\" # @param {type:\"string\"}\n",
79
  "\n",
80
  "# # @markdown Forces the local directory to be updated by the remote branch:\n",
@@ -97,7 +116,7 @@
97
  "# @markdown You can customize the location of the stored data here.\n",
98
  "\n",
99
  "# @markdown The folder in Google Drive where Colab Notebook data are stored<br />&nbsp;&nbsp;**(WARNING: The content of this folder will be modified by this notebook)**:\n",
100
- "google_drive_folder = \"Colab Data/LLaMA LoRA\" # @param {type:\"string\"}\n",
101
  "# google_drive_colab_data_folder = \"Colab Notebooks/Notebook Data\"\n",
102
  "\n",
103
  "# Where Google Drive will be mounted in the Colab runtime.\n",
@@ -220,7 +239,7 @@
220
  "source": [
221
  "![ ! -d llama_lora ] && git clone -b {llama_lora_project_branch} --filter=tree:0 {llama_lora_project_url} llama_lora\n",
222
  "!cd llama_lora && git add --all && git stash && git fetch origin {llama_lora_project_branch} && git checkout {llama_lora_project_branch} && git reset origin/{llama_lora_project_branch} --hard\n",
223
- "![ ! -f llama-lora-requirements-installed ] && cd llama_lora && pip install -r requirements.txt && touch ../llama-lora-requirements-installed"
224
  ],
225
  "metadata": {
226
  "id": "JGYz2VDoAzC8"
@@ -262,7 +281,7 @@
262
  "\n",
263
  "# Set Configs\n",
264
  "from llama_lora.llama_lora.globals import Global\n",
265
- "Global.base_model = base_model\n",
266
  "data_dir_realpath = !realpath ./data\n",
267
  "Global.data_dir = data_dir_realpath[0]\n",
268
  "Global.load_8bit = True\n",
@@ -270,12 +289,11 @@
270
  "# Prepare Data Dir\n",
271
  "import os\n",
272
  "from llama_lora.llama_lora.utils.data import init_data_dir\n",
273
- "init_data_dir()\n",
274
  "\n",
275
  "# Load the Base Model\n",
276
- "from llama_lora.llama_lora.models import load_base_model\n",
277
- "load_base_model()\n",
278
- "print(f\"Base model loaded: '{Global.base_model}'.\")"
279
  ],
280
  "metadata": {
281
  "id": "Yf6g248ylteP"
 
6
  "provenance": [],
7
  "private_outputs": true,
8
  "toc_visible": true,
 
9
  "include_colab_link": true
10
  },
11
  "kernelspec": {
 
26
  "colab_type": "text"
27
  },
28
  "source": [
29
+ "<a href=\"https://colab.research.google.com/github/zetavg/LLaMA-LoRA-Tuner/blob/main/LLaMA_LoRA.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
30
  ]
31
  },
32
  {
33
  "cell_type": "markdown",
34
  "source": [
35
+ "# 🦙🎛️ LLaMA-LoRA Tuner\n",
36
  "\n",
37
  "TL;DR: **Runtime > Run All** (`⌘/Ctrl+F9`). Takes about 5 minutes to start. You will be promped to authorize Google Drive access."
38
  ],
 
54
  "execution_count": null,
55
  "outputs": []
56
  },
57
+ {
58
+ "cell_type": "code",
59
+ "source": [
60
+ "# @title A small workaround { display-mode: \"form\" }\n",
61
+ "# @markdown Don't panic if you see an error here. Just click the `RESTART RUNTIME` button in the output below, then Run All again.\n",
62
+ "# @markdown The error will disappear on the next run.\n",
63
+ "!pip install Pillow==9.3.0\n",
64
+ "import PIL\n",
65
+ "major, minor = map(float, PIL.__version__.split(\".\")[:2])\n",
66
+ "version_float = major + minor / 10**len(str(minor))\n",
67
+ "print(version_float)\n",
68
+ "if version_float < 9.003:\n",
69
+ " raise Exception(\"Restart the runtime by clicking the 'RESTART RUNTIME' button above (or Runtime > Restart Runtime).\")"
70
+ ],
71
+ "metadata": {
72
+ "id": "XcJ4WO3KhOX1"
73
+ },
74
+ "execution_count": null,
75
+ "outputs": []
76
+ },
77
  {
78
  "cell_type": "markdown",
79
  "source": [
 
91
  "# @title Git/Project { display-mode: \"form\", run: \"auto\" }\n",
92
  "# @markdown Project settings.\n",
93
  "\n",
94
+ "# @markdown The URL of the LLaMA-LoRA-Tuner project<br>&nbsp;&nbsp;(default: `https://github.com/zetavg/LLaMA-LoRA-Tuner.git`):\n",
95
+ "llama_lora_project_url = \"https://github.com/zetavg/LLaMA-LoRA-Tuner.git\" # @param {type:\"string\"}\n",
96
+ "# @markdown The branch to use for LLaMA-LoRA-Tuner project:\n",
97
  "llama_lora_project_branch = \"main\" # @param {type:\"string\"}\n",
98
  "\n",
99
  "# # @markdown Forces the local directory to be updated by the remote branch:\n",
 
116
  "# @markdown You can customize the location of the stored data here.\n",
117
  "\n",
118
  "# @markdown The folder in Google Drive where Colab Notebook data are stored<br />&nbsp;&nbsp;**(WARNING: The content of this folder will be modified by this notebook)**:\n",
119
+ "google_drive_folder = \"Colab Data/LLaMA-LoRA Tuner\" # @param {type:\"string\"}\n",
120
  "# google_drive_colab_data_folder = \"Colab Notebooks/Notebook Data\"\n",
121
  "\n",
122
  "# Where Google Drive will be mounted in the Colab runtime.\n",
 
239
  "source": [
240
  "![ ! -d llama_lora ] && git clone -b {llama_lora_project_branch} --filter=tree:0 {llama_lora_project_url} llama_lora\n",
241
  "!cd llama_lora && git add --all && git stash && git fetch origin {llama_lora_project_branch} && git checkout {llama_lora_project_branch} && git reset origin/{llama_lora_project_branch} --hard\n",
242
+ "![ ! -f llama-lora-requirements-installed ] && cd llama_lora && pip install -r requirements.lock.txt && touch ../llama-lora-requirements-installed"
243
  ],
244
  "metadata": {
245
  "id": "JGYz2VDoAzC8"
 
281
  "\n",
282
  "# Set Configs\n",
283
  "from llama_lora.llama_lora.globals import Global\n",
284
+ "Global.default_base_model_name = base_model\n",
285
  "data_dir_realpath = !realpath ./data\n",
286
  "Global.data_dir = data_dir_realpath[0]\n",
287
  "Global.load_8bit = True\n",
 
289
  "# Prepare Data Dir\n",
290
  "import os\n",
291
  "from llama_lora.llama_lora.utils.data import init_data_dir\n",
292
+ "init_data_dir()",
293
  "\n",
294
  "# Load the Base Model\n",
295
+ "from llama_lora.llama_lora.models import prepare_base_model\n",
296
+ "prepare_base_model()\n"
 
297
  ],
298
  "metadata": {
299
  "id": "Yf6g248ylteP"
README.md CHANGED
@@ -20,16 +20,19 @@ git push -f hf-ui-demo hf-ui-demo:main
20
 
21
  ---
22
 
23
- # 🦙🎛️ LLaMA-LoRA
24
 
25
- <a href="https://colab.research.google.com/github/zetavg/LLaMA-LoRA/blob/main/LLaMA_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
26
 
27
  Making evaluating and fine-tuning LLaMA models with low-rank adaptation (LoRA) easy.
28
 
29
 
30
  ## Features
31
 
32
- * [1-click up and running in Google Colab](https://colab.research.google.com/github/zetavg/LLaMA-LoRA/blob/main/LLaMA_LoRA.ipynb).
33
  * Loads and stores data in Google Drive.
34
  * Evaluate various LLaMA LoRA models stored in your folder or from Hugging Face.<br /><a href="https://youtu.be/A3kb4VkDWyY"><img width="640px" src="https://user-images.githubusercontent.com/3784687/230272844-09f7a35b-46bf-4101-b15d-4ddf243b8bef.gif" /></a>
35
  * Fine-tune LLaMA models with different prompt templates and training dataset format.<br /><a href="https://youtu.be/5Db9U8PsaUk"><img width="640px" src="https://user-images.githubusercontent.com/3784687/230277315-9a91d983-1690-4594-9d54-912eda8963ee.gif" /></a>
@@ -47,7 +50,7 @@ There are various ways to run this app:
47
 
48
  ### Run On Google Colab
49
 
50
- Open [this Colab Notebook](https://colab.research.google.com/github/zetavg/LLaMA-LoRA/blob/main/LLaMA_LoRA.ipynb) and select **Runtime > Run All** (`⌘/Ctrl+F9`).
51
 
52
  You will be prompted to authorize Google Drive access, as Google Drive will be used to store your data. See the "Config"/"Google Drive" section for settings and more info.
53
 
@@ -58,10 +61,10 @@ After approximately 5 minutes of running, you will see the public URL in the out
58
  After following the [installation guide of SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), create a `.yaml` to define a task for running the app:
59
 
60
  ```yaml
61
- # llama-lora-multitool.yaml
62
 
63
  resources:
64
- accelerators: A10:1 # 1x NVIDIA A10 GPU
65
  cloud: lambda # Optional; if left out, SkyPilot will automatically pick the cheapest cloud.
66
 
67
  file_mounts:
@@ -69,27 +72,27 @@ file_mounts:
69
  # (to store training datasets and trained models)
70
  # See https://skypilot.readthedocs.io/en/latest/reference/storage.html for details.
71
  /data:
72
- name: llama-lora-multitool-data # Make sure this name is unique or you own this bucket. If it does not exists, SkyPilot will try to create a bucket with this name.
73
- store: gcs # Could be either of [s3, gcs]
74
  mode: MOUNT
75
 
76
- # Clone the LLaMA-LoRA repo and install its dependencies.
77
  setup: |
78
- git clone https://github.com/zetavg/LLaMA-LoRA.git llama_lora
79
- cd llama_lora && pip install -r requirements.txt
80
  cd ..
81
  echo 'Dependencies installed.'
82
 
83
  # Start the app.
84
  run: |
85
  echo 'Starting...'
86
- python llama_lora/app.py --data_dir='/data' --base_model='decapoda-research/llama-7b-hf' --share
87
  ```
88
 
89
  Then launch a cluster to run the task:
90
 
91
  ```
92
- sky launch -c llama-lora-multitool llama-lora-multitool.yaml
93
  ```
94
 
95
  `-c ...` is an optional flag to specify a cluster name. If not specified, SkyPilot will automatically generate one.
@@ -106,13 +109,13 @@ When you are done, run `sky stop <cluster_name>` to stop the cluster. To termina
106
  <summary>Prepare environment with conda</summary>
107
 
108
  ```bash
109
- conda create -y -n llama-lora-multitool python=3.8
110
- conda activate llama-lora-multitool
111
  ```
112
  </details>
113
 
114
  ```bash
115
- pip install -r requirements.txt
116
  python app.py --data_dir='./data' --base_model='decapoda-research/llama-7b-hf' --share
117
  ```
118
 
 
20
 
21
  ---
22
 
 
23
 
24
+ # 🦙🎛️ LLaMA-LoRA Tuner
25
+
26
+ <a href="https://colab.research.google.com/github/zetavg/LLaMA-LoRA-Tuner/blob/main/LLaMA_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
27
 
28
  Making evaluating and fine-tuning LLaMA models with low-rank adaptation (LoRA) easy.
29
 
30
 
31
  ## Features
32
 
33
+ **[See a demo on Hugging Face](https://huggingface.co/spaces/zetavg/LLaMA-LoRA-UI-Demo)** *Only serves UI demonstration. To try training or text generation, [run on Colab](#run-on-google-colab).*
34
+
35
+ * **[1-click up and running in Google Colab](#run-on-google-colab)** with a standard GPU runtime.
36
  * Loads and stores data in Google Drive.
37
  * Evaluate various LLaMA LoRA models stored in your folder or from Hugging Face.<br /><a href="https://youtu.be/A3kb4VkDWyY"><img width="640px" src="https://user-images.githubusercontent.com/3784687/230272844-09f7a35b-46bf-4101-b15d-4ddf243b8bef.gif" /></a>
38
  * Fine-tune LLaMA models with different prompt templates and training dataset format.<br /><a href="https://youtu.be/5Db9U8PsaUk"><img width="640px" src="https://user-images.githubusercontent.com/3784687/230277315-9a91d983-1690-4594-9d54-912eda8963ee.gif" /></a>
 
50
 
51
  ### Run On Google Colab
52
 
53
+ Open [this Colab Notebook](https://colab.research.google.com/github/zetavg/LLaMA-LoRA-Tuner/blob/main/LLaMA_LoRA.ipynb) and select **Runtime > Run All** (`⌘/Ctrl+F9`).
54
 
55
  You will be prompted to authorize Google Drive access, as Google Drive will be used to store your data. See the "Config"/"Google Drive" section for settings and more info.
56
 
 
61
  After following the [installation guide of SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), create a `.yaml` to define a task for running the app:
62
 
63
  ```yaml
64
+ # llama-lora-tuner.yaml
65
 
66
  resources:
67
+ accelerators: A10:1 # 1x NVIDIA A10 GPU, about US$ 0.6 / hr on Lambda Cloud.
68
  cloud: lambda # Optional; if left out, SkyPilot will automatically pick the cheapest cloud.
69
 
70
  file_mounts:
 
72
  # (to store training datasets and trained models)
73
  # See https://skypilot.readthedocs.io/en/latest/reference/storage.html for details.
74
  /data:
75
+ name: llama-lora-tuner-data # Make sure this name is unique or you own this bucket. If it does not exist, SkyPilot will try to create a bucket with this name.
76
+ store: s3 # Could be either of [s3, gcs]
77
  mode: MOUNT
78
 
79
+ # Clone the LLaMA-LoRA Tuner repo and install its dependencies.
80
  setup: |
81
+ git clone https://github.com/zetavg/LLaMA-LoRA-Tuner.git llama_lora_tuner
82
+ cd llama_lora_tuner && pip install -r requirements.lock.txt
83
  cd ..
84
  echo 'Dependencies installed.'
85
 
86
  # Start the app.
87
  run: |
88
  echo 'Starting...'
89
+ python llama_lora_tuner/app.py --data_dir='/data' --base_model='decapoda-research/llama-7b-hf' --share
90
  ```
91
 
92
  Then launch a cluster to run the task:
93
 
94
  ```
95
+ sky launch -c llama-lora-tuner llama-lora-tuner.yaml
96
  ```
97
 
98
  `-c ...` is an optional flag to specify a cluster name. If not specified, SkyPilot will automatically generate one.
 
109
  <summary>Prepare environment with conda</summary>
110
 
111
  ```bash
112
+ conda create -y python=3.8 -n llama-lora-tuner
113
+ conda activate llama-lora-tuner
114
  ```
115
  </details>
116
 
117
  ```bash
118
+ pip install -r requirements.lock.txt
119
  python app.py --data_dir='./data' --base_model='decapoda-research/llama-7b-hf' --share
120
  ```
121
 
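For reference, the README's local-run command maps onto the `main()` entry point patched in the app.py diff below. A hedged sketch of driving it programmatically; the keyword names follow the visible `main()` signature and the README's flags, and this is an illustration rather than a documented API.

```python
# Sketch: programmatic equivalent of
#   python app.py --data_dir='./data' --base_model='decapoda-research/llama-7b-hf' --share
# Assumes app.py exposes main() directly (it is wrapped with python-fire for the CLI).
from app import main

main(
    base_model="decapoda-research/llama-7b-hf",
    data_dir="./data",
    share=True,
)
```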
app.py CHANGED
@@ -16,6 +16,7 @@ def main(
16
  # Allows to listen on all interfaces by providing '0.0.0.0'.
17
  server_name: str = "127.0.0.1",
18
  share: bool = False,
 
19
  ui_show_sys_info: bool = True,
20
  ui_dev_mode: bool = False,
21
  ):
@@ -29,7 +30,7 @@ def main(
29
  data_dir
30
  ), "Please specify a --data_dir, e.g. --data_dir='./data'"
31
 
32
- Global.base_model = base_model
33
  Global.data_dir = os.path.abspath(data_dir)
34
  Global.load_8bit = load_8bit
35
 
 
16
  # Allows to listen on all interfaces by providing '0.0.0.0'.
17
  server_name: str = "127.0.0.1",
18
  share: bool = False,
19
+ skip_loading_base_model: bool = False,
20
  ui_show_sys_info: bool = True,
21
  ui_dev_mode: bool = False,
22
  ):
 
30
  data_dir
31
  ), "Please specify a --data_dir, e.g. --data_dir='./data'"
32
 
33
+ Global.default_base_model_name = base_model
34
  Global.data_dir = os.path.abspath(data_dir)
35
  Global.load_8bit = load_8bit
36
 
llama_lora/globals.py CHANGED
@@ -6,18 +6,17 @@ from typing import Any, Dict, List, Optional, Tuple, Union
6
  from numba import cuda
7
  import nvidia_smi
8
 
 
9
  from .lib.finetune import train
10
 
11
 
12
  class Global:
13
  version = None
14
 
15
- base_model: str = ""
16
  data_dir: str = ""
17
  load_8bit: bool = False
18
 
19
- loaded_tokenizer: Any = None
20
- loaded_base_model: Any = None
21
 
22
  # Functions
23
  train_fn: Any = train
@@ -25,8 +24,15 @@ class Global:
25
  # Training Control
26
  should_stop_training = False
27
 
28
  # Model related
29
- model_has_been_used = False
30
 
31
  # GPU Info
32
  gpu_cc = None # GPU compute capability
@@ -35,7 +41,7 @@ class Global:
35
  gpu_total_memory = None
36
 
37
  # UI related
38
- ui_title: str = "LLaMA-LoRA"
39
  ui_emoji: str = "🦙🎛️"
40
  ui_subtitle: str = "Toolkit for evaluating and fine-tuning LLaMA models with low-rank adaptation (LoRA)."
41
  ui_show_sys_info: bool = True
 
6
  from numba import cuda
7
  import nvidia_smi
8
 
9
+ from .utils.lru_cache import LRUCache
10
  from .lib.finetune import train
11
 
12
 
13
  class Global:
14
  version = None
15
 
 
16
  data_dir: str = ""
17
  load_8bit: bool = False
18
 
19
+ default_base_model_name: str = ""
 
20
 
21
  # Functions
22
  train_fn: Any = train
 
24
  # Training Control
25
  should_stop_training = False
26
 
27
+ # Generation Control
28
+ should_stop_generating = False
29
+ generation_force_stopped_at = None
30
+
31
  # Model related
32
+ loaded_models = LRUCache(1)
33
+ loaded_tokenizers = LRUCache(1)
34
+ new_base_model_that_is_ready_to_be_used = None
35
+ name_of_new_base_model_that_is_ready_to_be_used = None
36
 
37
  # GPU Info
38
  gpu_cc = None # GPU compute capability
 
41
  gpu_total_memory = None
42
 
43
  # UI related
44
+ ui_title: str = "LLaMA-LoRA Tuner"
45
  ui_emoji: str = "🦙🎛️"
46
  ui_subtitle: str = "Toolkit for evaluating and fine-tuning LLaMA models with low-rank adaptation (LoRA)."
47
  ui_show_sys_info: bool = True
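The new `Global.loaded_models` / `Global.loaded_tokenizers` caches come from `llama_lora/utils/lru_cache.py`, which is not included in this diff. Below is a minimal sketch of a cache with the interface this commit uses (`get`, `set`, `clear`, and `prepare_to_set`, which `models.py` calls to evict before loading the next model); the implementation details are assumptions.

```python
# Sketch of the LRUCache interface used by Global.loaded_models above.
# The real llama_lora/utils/lru_cache.py is not shown in this diff, so the
# behavior here is an assumption.
from collections import OrderedDict


class LRUCache:
    def __init__(self, capacity: int = 1):
        self.capacity = capacity
        self.cache = OrderedDict()

    def get(self, key):
        if key not in self.cache:
            return None
        self.cache.move_to_end(key)  # mark as most recently used
        return self.cache[key]

    def set(self, key, value):
        if key in self.cache:
            self.cache.move_to_end(key)
        self.cache[key] = value
        while len(self.cache) > self.capacity:
            self.cache.popitem(last=False)  # evict least recently used

    def prepare_to_set(self):
        # Evict ahead of an upcoming set() so memory can be reclaimed
        # before the next model is loaded.
        while len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)

    def clear(self):
        self.cache.clear()
```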
llama_lora/lib/finetune.py CHANGED
@@ -2,6 +2,8 @@ import os
2
  import sys
3
  from typing import Any, List
4
 
 
 
5
  import fire
6
  import torch
7
  import transformers
@@ -47,6 +49,10 @@ def train(
47
  # logging
48
  callbacks: List[Any] = []
49
  ):
50
  device_map = "auto"
51
  world_size = int(os.environ.get("WORLD_SIZE", 1))
52
  ddp = world_size != 1
@@ -202,6 +208,12 @@ def train(
202
  ),
203
  callbacks=callbacks,
204
  )
205
  model.config.use_cache = False
206
 
207
  old_state_dict = model.state_dict
@@ -214,9 +226,16 @@ def train(
214
  if torch.__version__ >= "2" and sys.platform != "win32":
215
  model = torch.compile(model)
216
 
217
- result = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
218
 
219
  model.save_pretrained(output_dir)
220
  print(f"Model saved to {output_dir}.")
221
 
222
- return result
2
  import sys
3
  from typing import Any, List
4
 
5
+ import json
6
+
7
  import fire
8
  import torch
9
  import transformers
 
49
  # logging
50
  callbacks: List[Any] = []
51
  ):
52
+ if os.path.exists(output_dir):
53
+ if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
54
+ raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
55
+
56
  device_map = "auto"
57
  world_size = int(os.environ.get("WORLD_SIZE", 1))
58
  ddp = world_size != 1
 
208
  ),
209
  callbacks=callbacks,
210
  )
211
+
212
+ if not os.path.exists(output_dir):
213
+ os.makedirs(output_dir)
214
+ with open(os.path.join(output_dir, "trainer_args.json"), 'w') as trainer_args_json_file:
215
+ json.dump(trainer.args.to_dict(), trainer_args_json_file, indent=2)
216
+
217
  model.config.use_cache = False
218
 
219
  old_state_dict = model.state_dict
 
226
  if torch.__version__ >= "2" and sys.platform != "win32":
227
  model = torch.compile(model)
228
 
229
+ train_output = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
230
 
231
  model.save_pretrained(output_dir)
232
  print(f"Model saved to {output_dir}.")
233
 
234
+ with open(os.path.join(output_dir, "trainer_log_history.jsonl"), 'w') as trainer_log_history_jsonl_file:
235
+ trainer_log_history = "\n".join([json.dumps(line) for line in trainer.state.log_history])
236
+ trainer_log_history_jsonl_file.write(trainer_log_history)
237
+
238
+ with open(os.path.join(output_dir, "train_output.json"), 'w') as train_output_json_file:
239
+ json.dump(train_output, train_output_json_file, indent=2)
240
+
241
+ return train_output
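The patched `train()` now persists `trainer_args.json`, `trainer_log_history.jsonl`, and `train_output.json` next to the adapter weights. A small sketch of reading them back after a run; the model directory path is a hypothetical example.

```python
# Sketch: inspecting the artifacts written by the patched train() above.
import json
import os

output_dir = "./data/lora_models/my-lora-model"  # hypothetical path

with open(os.path.join(output_dir, "trainer_args.json")) as f:
    trainer_args = json.load(f)  # TrainingArguments.to_dict()

with open(os.path.join(output_dir, "trainer_log_history.jsonl")) as f:
    log_history = [json.loads(line) for line in f if line.strip()]

print(trainer_args.get("num_train_epochs"), len(log_history))
```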
llama_lora/models.py CHANGED
@@ -3,9 +3,8 @@ import sys
3
  import gc
4
 
5
  import torch
6
- import transformers
7
  from peft import PeftModel
8
- from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
9
 
10
  from .globals import Global
11
 
@@ -23,84 +22,140 @@ def get_device():
23
  pass
24
 
25
 
26
- device = get_device()
27
-
28
-
29
- def get_base_model():
30
- load_base_model()
31
- return Global.loaded_base_model
32
 
33
 
34
- def get_model_with_lora(lora_weights: str = "tloen/alpaca-lora-7b"):
35
- Global.model_has_been_used = True
36
 
37
  if device == "cuda":
38
- model = PeftModel.from_pretrained(
39
- get_base_model(),
40
- lora_weights,
41
  torch_dtype=torch.float16,
42
- device_map={'': 0}, # ? https://github.com/tloen/alpaca-lora/issues/21
43
  )
44
  elif device == "mps":
45
- model = PeftModel.from_pretrained(
46
- get_base_model(),
47
- lora_weights,
48
  device_map={"": device},
49
  torch_dtype=torch.float16,
50
  )
51
  else:
52
- model = PeftModel.from_pretrained(
53
- get_base_model(),
54
- lora_weights,
55
- device_map={"": device},
56
  )
57
 
58
- model.config.pad_token_id = get_tokenizer().pad_token_id = 0
59
  model.config.bos_token_id = 1
60
  model.config.eos_token_id = 2
61
 
62
- if not Global.load_8bit:
63
- model.half() # seems to fix bugs for some users.
64
-
65
- model.eval()
66
- if torch.__version__ >= "2" and sys.platform != "win32":
67
- model = torch.compile(model)
68
  return model
69
 
70
 
71
- def get_tokenizer():
72
- load_base_model()
73
- return Global.loaded_tokenizer
74
 
 
75
 
76
- def load_base_model():
77
  if Global.ui_dev_mode:
78
  return
79
 
80
- if Global.loaded_tokenizer is None:
81
- Global.loaded_tokenizer = LlamaTokenizer.from_pretrained(
82
- Global.base_model
83
- )
84
- if Global.loaded_base_model is None:
85
  if device == "cuda":
86
- Global.loaded_base_model = LlamaForCausalLM.from_pretrained(
87
- Global.base_model,
88
- load_in_8bit=Global.load_8bit,
89
  torch_dtype=torch.float16,
90
- # device_map="auto",
91
- device_map={'': 0}, # ? https://github.com/tloen/alpaca-lora/issues/21
92
  )
93
  elif device == "mps":
94
- Global.loaded_base_model = LlamaForCausalLM.from_pretrained(
95
- Global.base_model,
 
96
  device_map={"": device},
97
  torch_dtype=torch.float16,
98
  )
99
  else:
100
- Global.loaded_base_model = LlamaForCausalLM.from_pretrained(
101
- Global.base_model, device_map={"": device}, low_cpu_mem_usage=True
102
  )
103
 
104
 
105
  def clear_cache():
106
  gc.collect()
@@ -111,17 +166,6 @@ def clear_cache():
111
 
112
 
113
  def unload_models():
114
- del Global.loaded_base_model
115
- Global.loaded_base_model = None
116
-
117
- del Global.loaded_tokenizer
118
- Global.loaded_tokenizer = None
119
-
120
  clear_cache()
121
-
122
- Global.model_has_been_used = False
123
-
124
-
125
- def unload_models_if_already_used():
126
- if Global.model_has_been_used:
127
- unload_models()
 
3
  import gc
4
 
5
  import torch
6
+ from transformers import LlamaForCausalLM, LlamaTokenizer
7
  from peft import PeftModel
 
8
 
9
  from .globals import Global
10
 
 
22
  pass
23
 
24
 
25
+ def get_new_base_model(base_model_name):
26
+ if Global.ui_dev_mode:
27
+ return
 
29
+ if Global.new_base_model_that_is_ready_to_be_used:
30
+ if Global.name_of_new_base_model_that_is_ready_to_be_used == base_model_name:
31
+ model = Global.new_base_model_that_is_ready_to_be_used
32
+ Global.new_base_model_that_is_ready_to_be_used = None
33
+ Global.name_of_new_base_model_that_is_ready_to_be_used = None
34
+ return model
35
+ else:
36
+ Global.new_base_model_that_is_ready_to_be_used = None
37
+ Global.name_of_new_base_model_that_is_ready_to_be_used = None
38
+ clear_cache()
39
 
40
+ device = get_device()
 
41
 
42
  if device == "cuda":
43
+ model = LlamaForCausalLM.from_pretrained(
44
+ base_model_name,
45
+ load_in_8bit=Global.load_8bit,
46
  torch_dtype=torch.float16,
47
+ # device_map="auto",
48
+ # ? https://github.com/tloen/alpaca-lora/issues/21
49
+ device_map={'': 0},
50
  )
51
  elif device == "mps":
52
+ model = LlamaForCausalLM.from_pretrained(
53
+ base_model_name,
 
54
  device_map={"": device},
55
  torch_dtype=torch.float16,
56
  )
57
  else:
58
+ model = LlamaForCausalLM.from_pretrained(
59
+ base_model_name, device_map={"": device}, low_cpu_mem_usage=True
60
  )
61
 
62
+ model.config.pad_token_id = get_tokenizer(base_model_name).pad_token_id = 0
63
  model.config.bos_token_id = 1
64
  model.config.eos_token_id = 2
65
 
  return model
67
 
68
 
69
+ def get_tokenizer(base_model_name):
70
+ if Global.ui_dev_mode:
71
+ return
72
+
73
+ loaded_tokenizer = Global.loaded_tokenizers.get(base_model_name)
74
+ if loaded_tokenizer:
75
+ return loaded_tokenizer
76
+
77
+ tokenizer = LlamaTokenizer.from_pretrained(base_model_name)
78
+ Global.loaded_tokenizers.set(base_model_name, tokenizer)
79
 
80
+ return tokenizer
81
 
82
+
83
+ def get_model(
84
+ base_model_name,
85
+ peft_model_name=None):
86
  if Global.ui_dev_mode:
87
  return
88
 
89
+ if peft_model_name == "None":
90
+ peft_model_name = None
91
+
92
+ model_key = base_model_name
93
+ if peft_model_name:
94
+ model_key = f"{base_model_name}//{peft_model_name}"
95
+
96
+ loaded_model = Global.loaded_models.get(model_key)
97
+ if loaded_model:
98
+ return loaded_model
99
+
100
+ peft_model_name_or_path = peft_model_name
101
+
102
+ lora_models_directory_path = os.path.join(Global.data_dir, "lora_models")
103
+ possible_lora_model_path = os.path.join(
104
+ lora_models_directory_path, peft_model_name)
105
+ if os.path.isdir(possible_lora_model_path):
106
+ peft_model_name_or_path = possible_lora_model_path
107
+
108
+ Global.loaded_models.prepare_to_set()
109
+ clear_cache()
110
+
111
+ model = get_new_base_model(base_model_name)
112
+
113
+ if peft_model_name:
114
+ device = get_device()
115
+
116
  if device == "cuda":
117
+ model = PeftModel.from_pretrained(
118
+ model,
119
+ peft_model_name_or_path,
120
  torch_dtype=torch.float16,
121
+ # ? https://github.com/tloen/alpaca-lora/issues/21
122
+ device_map={'': 0},
123
  )
124
  elif device == "mps":
125
+ model = PeftModel.from_pretrained(
126
+ model,
127
+ peft_model_name_or_path,
128
  device_map={"": device},
129
  torch_dtype=torch.float16,
130
  )
131
  else:
132
+ model = PeftModel.from_pretrained(
133
+ model,
134
+ peft_model_name_or_path,
135
+ device_map={"": device},
136
  )
137
 
138
+ model.config.pad_token_id = get_tokenizer(base_model_name).pad_token_id = 0
139
+ model.config.bos_token_id = 1
140
+ model.config.eos_token_id = 2
141
+
142
+ if not Global.load_8bit:
143
+ model.half() # seems to fix bugs for some users.
144
+
145
+ model.eval()
146
+ if torch.__version__ >= "2" and sys.platform != "win32":
147
+ model = torch.compile(model)
148
+
149
+ Global.loaded_models.set(model_key, model)
150
+ clear_cache()
151
+
152
+ return model
153
+
154
+
155
+ def prepare_base_model(base_model_name=Global.default_base_model_name):
156
+ Global.new_base_model_that_is_ready_to_be_used = get_new_base_model(base_model_name)
157
+ Global.name_of_new_base_model_that_is_ready_to_be_used = base_model_name
158
+
159
 
160
  def clear_cache():
161
  gc.collect()
 
166
 
167
 
168
  def unload_models():
169
+ Global.loaded_models.clear()
170
+ Global.loaded_tokenizers.clear()
171
  clear_cache()
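A usage sketch for the reworked cache-aware API in `models.py` above. The model names are the defaults referenced elsewhere in this commit, and the import path is the one the Colab notebook uses after cloning the repo into `llama_lora/`; treat the assertion as an illustration of the caching behavior, not a test.

```python
# Sketch: how callers use the new API in llama_lora/models.py.
from llama_lora.llama_lora.models import get_model, get_tokenizer

base_model_name = "decapoda-research/llama-7b-hf"
lora_model_name = "tloen/alpaca-lora-7b"

tokenizer = get_tokenizer(base_model_name)           # cached in Global.loaded_tokenizers
model = get_model(base_model_name, lora_model_name)  # cached under "base//peft"

# Requesting the same pair again returns the cached objects; requesting a
# different adapter evicts the previous one, since both caches are LRUCache(1).
same_model = get_model(base_model_name, lora_model_name)
assert same_model is model
```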
llama_lora/ui/finetune_ui.py CHANGED
@@ -10,8 +10,8 @@ from transformers import TrainerCallback
10
 
11
  from ..globals import Global
12
  from ..models import (
13
- get_base_model, get_tokenizer,
14
- clear_cache, unload_models_if_already_used)
15
  from ..utils.data import (
16
  get_available_template_names,
17
  get_available_dataset_names,
@@ -258,22 +258,30 @@ def do_train(
258
  dataset_plain_text_data_separator,
259
  # Training Options
260
  max_seq_length,
 
261
  micro_batch_size,
262
  gradient_accumulation_steps,
263
  epochs,
264
  learning_rate,
 
265
  lora_r,
266
  lora_alpha,
267
  lora_dropout,
 
268
  model_name,
269
  progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
270
  ):
271
  try:
272
- clear_cache()
273
- # If model has been used in inference, we need to unload it first.
274
- # Otherwise, we'll get a 'Function MmBackward0 returned an invalid
275
- # gradient at index 1 - expected device meta but got cuda:0' error.
276
- unload_models_if_already_used()
277
 
278
  prompter = Prompter(template)
279
  variable_names = prompter.get_variable_names()
@@ -319,6 +327,7 @@ def do_train(
319
  data = process_json_dataset(data)
320
 
321
  data_count = len(data)
 
322
 
323
  train_data = [
324
  {
@@ -356,13 +365,16 @@ def do_train(
356
 
357
  Train options: {json.dumps({
358
  'max_seq_length': max_seq_length,
 
359
  'micro_batch_size': micro_batch_size,
360
  'gradient_accumulation_steps': gradient_accumulation_steps,
361
  'epochs': epochs,
362
  'learning_rate': learning_rate,
 
363
  'lora_r': lora_r,
364
  'lora_alpha': lora_alpha,
365
  'lora_dropout': lora_dropout,
 
366
  'model_name': model_name,
367
  }, indent=2)}
368
 
@@ -373,6 +385,9 @@ Train data (first 10):
373
  time.sleep(2)
374
  return message
375
 
  log_history = []
377
 
378
  class UiTrainerCallback(TrainerCallback):
@@ -409,21 +424,51 @@ Train data (first 10):
409
 
410
  Global.should_stop_training = False
411
 
412
- # Do this again right before training to make sure the model is not used in inference.
413
- unload_models_if_already_used()
414
- clear_cache()
415
-
416
- base_model = get_base_model()
417
- tokenizer = get_tokenizer()
418
 
419
  # Do not let other tqdm iterations interfere the progress reporting after training starts.
420
  # progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
421
 
422
- results = Global.train_fn(
423
  base_model, # base_model
424
  tokenizer, # tokenizer
425
- os.path.join(Global.data_dir, "lora_models",
426
- model_name), # output_dir
427
  train_data,
428
  # 128, # batch_size (is not used, use gradient_accumulation_steps instead)
429
  micro_batch_size, # micro_batch_size
@@ -431,12 +476,12 @@ Train data (first 10):
431
  epochs, # num_epochs
432
  learning_rate, # learning_rate
433
  max_seq_length, # cutoff_len
434
- 0, # val_set_size
435
  lora_r, # lora_r
436
  lora_alpha, # lora_alpha
437
  lora_dropout, # lora_dropout
438
- ["q_proj", "v_proj"], # lora_target_modules
439
- True, # train_on_inputs
440
  False, # group_by_length
441
  None, # resume_from_checkpoint
442
  training_callbacks # callbacks
@@ -445,12 +490,17 @@ Train data (first 10):
445
  logs_str = "\n".join([json.dumps(log)
446
  for log in log_history]) or "None"
447
 
448
- result_message = f"Training ended:\n{str(results)}\n\nLogs:\n{logs_str}"
449
  print(result_message)
450
  return result_message
451
 
452
  except Exception as e:
453
- raise gr.Error(e)
454
 
455
 
456
  def do_abort_training():
@@ -595,11 +645,20 @@ def finetune_ui():
595
  )
596
  )
597
 
598
- max_seq_length = gr.Slider(
599
- minimum=1, maximum=4096, value=512,
600
- label="Max Sequence Length",
601
- info="The maximum length of each sample text sequence. Sequences longer than this will be truncated."
602
- )
603
 
604
  with gr.Row():
605
  micro_batch_size_default_value = 1
@@ -625,7 +684,7 @@ def finetune_ui():
625
  )
626
 
627
  epochs = gr.Slider(
628
- minimum=1, maximum=100, step=1, value=3,
629
  label="Epochs",
630
  info="The number of times to iterate over the entire training dataset. A larger number of epochs may improve model performance but also increase the risk of overfitting.")
631
 
@@ -635,6 +694,12 @@ def finetune_ui():
635
  info="The initial learning rate for the optimizer. A higher learning rate may speed up convergence but also cause instability or divergence. A lower learning rate may require more steps to reach optimal performance but also avoid overshooting or oscillating around local minima."
636
  )
637
 
638
  with gr.Column():
639
  lora_r = gr.Slider(
640
  minimum=1, maximum=16, step=1, value=8,
@@ -654,6 +719,12 @@ def finetune_ui():
654
  info="The dropout probability for LoRA, which controls the fraction of LoRA parameters that are set to zero during training. A larger lora_dropout increases the regularization effect of LoRA but also increases the risk of underfitting."
655
  )
656
 
657
  with gr.Column():
658
  model_name = gr.Textbox(
659
  lines=1, label="LoRA Model Name", value=random_name,
@@ -675,25 +746,28 @@ def finetune_ui():
675
  elem_id="finetune_confirm_stop_btn"
676
  )
677
 
678
- training_status = gr.Text(
679
- "Training status will be shown here.",
680
- label="Training Status/Results",
681
  elem_id="finetune_training_status")
682
 
683
  train_progress = train_btn.click(
684
  fn=do_train,
685
  inputs=(dataset_inputs + [
686
  max_seq_length,
 
687
  micro_batch_size,
688
  gradient_accumulation_steps,
689
  epochs,
690
  learning_rate,
 
691
  lora_r,
692
  lora_alpha,
693
  lora_dropout,
 
694
  model_name
695
  ]),
696
- outputs=training_status
697
  )
698
 
699
  # controlled by JS, shows the confirm_abort_button
@@ -811,6 +885,12 @@ def finetune_ui():
811
  document.getElementById('finetune_confirm_stop_btn').style.display =
812
  'none';
813
  }, 5000);
814
  document.getElementById('finetune_stop_btn').style.display = 'none';
815
  document.getElementById('finetune_confirm_stop_btn').style.display =
816
  'block';
 
10
 
11
  from ..globals import Global
12
  from ..models import (
13
+ get_new_base_model, get_tokenizer,
14
+ clear_cache, unload_models)
15
  from ..utils.data import (
16
  get_available_template_names,
17
  get_available_dataset_names,
 
258
  dataset_plain_text_data_separator,
259
  # Training Options
260
  max_seq_length,
261
+ evaluate_data_percentage,
262
  micro_batch_size,
263
  gradient_accumulation_steps,
264
  epochs,
265
  learning_rate,
266
+ train_on_inputs,
267
  lora_r,
268
  lora_alpha,
269
  lora_dropout,
270
+ lora_target_modules,
271
  model_name,
272
  progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
273
  ):
274
  try:
275
+ base_model_name = Global.default_base_model_name
276
+ output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
277
+ if os.path.exists(output_dir):
278
+ if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
279
+ raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
280
+
281
+ if not should_training_progress_track_tqdm:
282
+ progress(0, desc="Preparing train data...")
283
+
284
+ unload_models() # Need RAM for training
285
 
286
  prompter = Prompter(template)
287
  variable_names = prompter.get_variable_names()
 
327
  data = process_json_dataset(data)
328
 
329
  data_count = len(data)
330
+ evaluate_data_count = math.ceil(data_count * evaluate_data_percentage)
331
 
332
  train_data = [
333
  {
 
365
 
366
  Train options: {json.dumps({
367
  'max_seq_length': max_seq_length,
368
+ 'val_set_size': evaluate_data_count,
369
  'micro_batch_size': micro_batch_size,
370
  'gradient_accumulation_steps': gradient_accumulation_steps,
371
  'epochs': epochs,
372
  'learning_rate': learning_rate,
373
+ 'train_on_inputs': train_on_inputs,
374
  'lora_r': lora_r,
375
  'lora_alpha': lora_alpha,
376
  'lora_dropout': lora_dropout,
377
+ 'lora_target_modules': lora_target_modules,
378
  'model_name': model_name,
379
  }, indent=2)}
380
 
 
385
  time.sleep(2)
386
  return message
387
 
388
+ if not should_training_progress_track_tqdm:
389
+ progress(0, desc="Preparing model for training...")
390
+
391
  log_history = []
392
 
393
  class UiTrainerCallback(TrainerCallback):
 
424
 
425
  Global.should_stop_training = False
426
 
427
+ base_model = get_new_base_model(base_model_name)
428
+ tokenizer = get_tokenizer(base_model_name)
 
 
 
 
429
 
430
  # Do not let other tqdm iterations interfere the progress reporting after training starts.
431
  # progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
432
 
433
+ if not os.path.exists(output_dir):
434
+ os.makedirs(output_dir)
435
+
436
+ with open(os.path.join(output_dir, "info.json"), 'w') as info_json_file:
437
+ dataset_name = "N/A (from text input)"
438
+ if load_dataset_from == "Data Dir":
439
+ dataset_name = dataset_from_data_dir
440
+
441
+ info = {
442
+ 'base_model': base_model_name,
443
+ 'prompt_template': template,
444
+ 'dataset_name': dataset_name,
445
+ 'dataset_rows': len(train_data),
446
+ 'timestamp': time.time(),
447
+
448
+ 'max_seq_length': max_seq_length,
449
+ 'train_on_inputs': train_on_inputs,
450
+
451
+ 'micro_batch_size': micro_batch_size,
452
+ 'gradient_accumulation_steps': gradient_accumulation_steps,
453
+ 'epochs': epochs,
454
+ 'learning_rate': learning_rate,
455
+
456
+ 'evaluate_data_percentage': evaluate_data_percentage,
457
+
458
+ 'lora_r': lora_r,
459
+ 'lora_alpha': lora_alpha,
460
+ 'lora_dropout': lora_dropout,
461
+ 'lora_target_modules': lora_target_modules,
462
+ }
463
+ json.dump(info, info_json_file, indent=2)
464
+
465
+ if not should_training_progress_track_tqdm:
466
+ progress(0, desc="Train starting...")
467
+
468
+ train_output = Global.train_fn(
469
  base_model, # base_model
470
  tokenizer, # tokenizer
471
+ output_dir, # output_dir
 
472
  train_data,
473
  # 128, # batch_size (is not used, use gradient_accumulation_steps instead)
474
  micro_batch_size, # micro_batch_size
 
476
  epochs, # num_epochs
477
  learning_rate, # learning_rate
478
  max_seq_length, # cutoff_len
479
+ evaluate_data_count, # val_set_size
480
  lora_r, # lora_r
481
  lora_alpha, # lora_alpha
482
  lora_dropout, # lora_dropout
483
+ lora_target_modules, # lora_target_modules
484
+ train_on_inputs, # train_on_inputs
485
  False, # group_by_length
486
  None, # resume_from_checkpoint
487
  training_callbacks # callbacks
 
490
  logs_str = "\n".join([json.dumps(log)
491
  for log in log_history]) or "None"
492
 
493
+ result_message = f"Training ended:\n{str(train_output)}\n\nLogs:\n{logs_str}"
494
  print(result_message)
495
+
496
+ del base_model
497
+ del tokenizer
498
+ clear_cache()
499
+
500
  return result_message
501
 
502
  except Exception as e:
503
+ raise gr.Error(f"{e} (To dismiss this error, click the 'Abort' button)")
504
 
505
 
506
  def do_abort_training():
 
645
  )
646
  )
647
 
648
+ with gr.Row():
649
+ max_seq_length = gr.Slider(
650
+ minimum=1, maximum=4096, value=512,
651
+ label="Max Sequence Length",
652
+ info="The maximum length of each sample text sequence. Sequences longer than this will be truncated.",
653
+ elem_id="finetune_max_seq_length"
654
+ )
655
+
656
+ train_on_inputs = gr.Checkbox(
657
+ label="Train on Inputs",
658
+ value=True,
659
+ info="If not enabled, inputs will be masked out in loss.",
660
+ elem_id="finetune_train_on_inputs"
661
+ )
662
 
663
  with gr.Row():
664
  micro_batch_size_default_value = 1
 
684
  )
685
 
686
  epochs = gr.Slider(
687
+ minimum=1, maximum=100, step=1, value=10,
688
  label="Epochs",
689
  info="The number of times to iterate over the entire training dataset. A larger number of epochs may improve model performance but also increase the risk of overfitting.")
690
 
 
694
  info="The initial learning rate for the optimizer. A higher learning rate may speed up convergence but also cause instability or divergence. A lower learning rate may require more steps to reach optimal performance but also avoid overshooting or oscillating around local minima."
695
  )
696
 
697
+ evaluate_data_percentage = gr.Slider(
698
+ minimum=0, maximum=0.5, step=0.001, value=0.03,
699
+ label="Evaluation Data Percentage",
700
+ info="The percentage of data to be used for evaluation. This percentage of data will not be used for training and will be used to assess the performance of the model during the process."
701
+ )
702
+
703
  with gr.Column():
704
  lora_r = gr.Slider(
705
  minimum=1, maximum=16, step=1, value=8,
 
719
  info="The dropout probability for LoRA, which controls the fraction of LoRA parameters that are set to zero during training. A larger lora_dropout increases the regularization effect of LoRA but also increases the risk of underfitting."
720
  )
721
 
722
+ lora_target_modules = gr.CheckboxGroup(
723
+ label="LoRA Target Modules",
724
+ choices=["q_proj", "k_proj", "v_proj", "o_proj"],
725
+ value=["q_proj", "v_proj"],
726
+ )
727
+
728
  with gr.Column():
729
  model_name = gr.Textbox(
730
  lines=1, label="LoRA Model Name", value=random_name,
 
746
  elem_id="finetune_confirm_stop_btn"
747
  )
748
 
749
+ train_output = gr.Text(
750
+ "Training results will be shown here.",
751
+ label="Train Output",
752
  elem_id="finetune_training_status")
753
 
754
  train_progress = train_btn.click(
755
  fn=do_train,
756
  inputs=(dataset_inputs + [
757
  max_seq_length,
758
+ evaluate_data_percentage,
759
  micro_batch_size,
760
  gradient_accumulation_steps,
761
  epochs,
762
  learning_rate,
763
+ train_on_inputs,
764
  lora_r,
765
  lora_alpha,
766
  lora_dropout,
767
+ lora_target_modules,
768
  model_name
769
  ]),
770
+ outputs=train_output
771
  )
772
 
773
  # controlled by JS, shows the confirm_abort_button
 
885
  document.getElementById('finetune_confirm_stop_btn').style.display =
886
  'none';
887
  }, 5000);
888
+ document.getElementById('finetune_confirm_stop_btn').style['pointer-events'] =
889
+ 'none';
890
+ setTimeout(function () {
891
+ document.getElementById('finetune_confirm_stop_btn').style['pointer-events'] =
892
+ 'inherit';
893
+ }, 300);
894
  document.getElementById('finetune_stop_btn').style.display = 'none';
895
  document.getElementById('finetune_confirm_stop_btn').style.display =
896
  'block';
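The new "Evaluation Data Percentage" slider feeds `val_set_size` through a single rounding step inside `do_train()`. A worked example with an illustrative dataset size:

```python
# Worked example of the evaluate_data_percentage -> val_set_size mapping
# added to do_train() above. The dataset size is illustrative.
import math

data_count = 1000                # rows in the loaded training dataset
evaluate_data_percentage = 0.03  # the slider's default value

# Passed to train() as val_set_size; these rows are held out for evaluation.
evaluate_data_count = math.ceil(data_count * evaluate_data_percentage)
print(f"{evaluate_data_count} of {data_count} rows held out for evaluation")
```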
llama_lora/ui/inference_ui.py CHANGED
@@ -7,17 +7,30 @@ import transformers
7
  from transformers import GenerationConfig
8
 
9
  from ..globals import Global
10
- from ..models import get_base_model, get_model_with_lora, get_tokenizer, get_device
11
  from ..utils.data import (
12
  get_available_template_names,
13
  get_available_lora_model_names,
14
- get_path_of_available_lora_model)
15
  from ..utils.prompter import Prompter
16
  from ..utils.callbacks import Iteratorize, Stream
17
 
18
  device = get_device()
19
 
20
  default_show_raw = True
21
 
22
 
23
  def do_inference(
@@ -35,20 +48,25 @@ def do_inference(
35
  show_raw=False,
36
  progress=gr.Progress(track_tqdm=True),
37
  ):
38
  try:
39
  variables = [variable_0, variable_1, variable_2, variable_3,
40
  variable_4, variable_5, variable_6, variable_7]
41
  prompter = Prompter(prompt_template)
42
  prompt = prompter.generate_prompt(variables)
43
 
44
- if lora_model_name is not None and "/" not in lora_model_name and lora_model_name != "None":
45
- path_of_available_lora_model = get_path_of_available_lora_model(
46
- lora_model_name)
47
- if path_of_available_lora_model:
48
- lora_model_name = path_of_available_lora_model
49
-
50
  if Global.ui_dev_mode:
51
- message = f"Hi, I’m currently in UI-development mode and do not have access to resources to process your request. However, this behavior is similar to what will actually happen, so you can try and see how it will work!\n\nBase model: {Global.base_model}\nLoRA model: {lora_model_name}\n\nThe following text is your prompt:\n\n{prompt}"
52
  print(message)
53
 
54
  if stream_output:
@@ -66,18 +84,24 @@ def do_inference(
66
  yield out
67
 
68
  for partial_sentence in word_generator(message):
69
- yield partial_sentence, json.dumps(list(range(len(partial_sentence.split()))), indent=2)
70
  time.sleep(0.05)
71
 
72
  return
73
  time.sleep(1)
74
- yield message, json.dumps(list(range(len(message.split()))), indent=2)
75
  return
76
 
77
- model = get_base_model()
78
- if not lora_model_name == "None" and lora_model_name is not None:
79
- model = get_model_with_lora(lora_model_name)
80
- tokenizer = get_tokenizer()
81
 
82
  inputs = tokenizer(prompt, return_tensors="pt")
83
  input_ids = inputs["input_ids"].to(device)
@@ -97,6 +121,19 @@ def do_inference(
97
  "max_new_tokens": max_new_tokens,
98
  }
99
 
100
  if stream_output:
101
  # Stream the reply 1 token at a time.
102
  # This is based on the trick of using 'stopping_criteria' to create an iterator,
@@ -128,29 +165,60 @@ def do_inference(
128
  raw_output = None
129
  if show_raw:
130
  raw_output = str(output)
131
- yield prompter.get_response(decoded_output), raw_output
132
  return # early return for stream_output
133
 
134
  # Without streaming
135
  with torch.no_grad():
136
- generation_output = model.generate(
137
- input_ids=input_ids,
138
- generation_config=generation_config,
139
- return_dict_in_generate=True,
140
- output_scores=True,
141
- max_new_tokens=max_new_tokens,
142
- )
143
  s = generation_output.sequences[0]
144
  output = tokenizer.decode(s)
145
  raw_output = None
146
  if show_raw:
147
  raw_output = str(s)
148
- yield prompter.get_response(output), raw_output
149
 
150
  except Exception as e:
151
  raise gr.Error(e)
152
 
153
 
 
154
  def reload_selections(current_lora_model, current_prompt_template):
155
  available_template_names = get_available_template_names()
156
  available_template_names_with_none = available_template_names + ["None"]
@@ -172,7 +240,7 @@ def reload_selections(current_lora_model, current_prompt_template):
172
  gr.Dropdown.update(choices=available_template_names_with_none, value=current_prompt_template))
173
 
174
 
175
- def handle_prompt_template_change(prompt_template):
176
  prompter = Prompter(prompt_template)
177
  var_names = prompter.get_variable_names()
178
  human_var_names = [' '.join(word.capitalize()
@@ -182,7 +250,36 @@ def handle_prompt_template_change(prompt_template):
182
  while len(gr_updates) < 8:
183
  gr_updates.append(gr.Textbox.update(
184
  label="Not Used", visible=False))
185
- return gr_updates
186
 
187
 
188
  def update_prompt_preview(prompt_template,
@@ -200,12 +297,15 @@ def inference_ui():
200
 
201
  with gr.Blocks() as inference_ui_blocks:
202
  with gr.Row():
203
- lora_model = gr.Dropdown(
204
- label="LoRA Model",
205
- elem_id="inference_lora_model",
206
- value="tloen/alpaca-lora-7b",
207
- allow_custom_value=True,
208
- )
209
  prompt_template = gr.Dropdown(
210
  label="Prompt Template",
211
  elem_id="inference_prompt_template",
@@ -278,7 +378,7 @@ def inference_ui():
278
  )
279
 
280
  num_beams = gr.Slider(
281
- minimum=1, maximum=4, value=1, step=1,
282
  label="Beams",
283
  elem_id="inference_beams"
284
  )
@@ -318,7 +418,7 @@ def inference_ui():
318
  with gr.Column(elem_id="inference_output_group_container"):
319
  with gr.Column(elem_id="inference_output_group"):
320
  inference_output = gr.Textbox(
321
- lines=12, label="Output", elem_id="inference_output")
322
  inference_output.style(show_copy_button=True)
323
  with gr.Accordion(
324
  "Raw Output",
@@ -346,11 +446,25 @@ def inference_ui():
346
  )
347
  things_that_might_timeout.append(reload_selections_event)
348
 
349
- prompt_template_change_event = prompt_template.change(fn=handle_prompt_template_change, inputs=[prompt_template], outputs=[
350
- variable_0, variable_1, variable_2, variable_3, variable_4, variable_5, variable_6, variable_7])
351
  things_that_might_timeout.append(prompt_template_change_event)
352
 
353
  generate_event = generate_btn.click(
354
  fn=do_inference,
355
  inputs=[
356
  lora_model,
@@ -369,8 +483,12 @@ def inference_ui():
369
  outputs=[inference_output, inference_raw_output],
370
  api_name="inference"
371
  )
372
- stop_btn.click(fn=None, inputs=None, outputs=None,
373
- cancels=[generate_event])
374
 
375
  update_prompt_preview_event = update_prompt_preview_btn.click(fn=update_prompt_preview, inputs=[prompt_template,
376
  variable_0, variable_1, variable_2, variable_3,
@@ -543,9 +661,15 @@ def inference_ui():
543
  return function (...args) {
544
  const context = this;
545
  clearTimeout(timeout);
546
- timeout = setTimeout(() => {
547
  func.apply(context, args);
548
- }, wait);
 
549
  };
550
  }
551
 
@@ -580,5 +704,27 @@ def inference_ui():
580
  });
581
  }
582
  }, 100);
583
  }
584
  """)
 
7
  from transformers import GenerationConfig
8
 
9
  from ..globals import Global
10
+ from ..models import get_model, get_tokenizer, get_device
11
  from ..utils.data import (
12
  get_available_template_names,
13
  get_available_lora_model_names,
14
+ get_info_of_available_lora_model)
15
  from ..utils.prompter import Prompter
16
  from ..utils.callbacks import Iteratorize, Stream
17
 
18
  device = get_device()
19
 
20
  default_show_raw = True
21
+ inference_output_lines = 12
22
+
23
+
24
+ def prepare_inference(lora_model_name, progress=gr.Progress(track_tqdm=True)):
25
+ base_model_name = Global.default_base_model_name
26
+
27
+ try:
28
+ get_tokenizer(base_model_name)
29
+ get_model(base_model_name, lora_model_name)
30
+ return ("", "")
31
+
32
+ except Exception as e:
33
+ raise gr.Error(e)
34
 
35
 
36
  def do_inference(
 
48
  show_raw=False,
49
  progress=gr.Progress(track_tqdm=True),
50
  ):
51
+ base_model_name = Global.default_base_model_name
52
+
53
  try:
54
+ if Global.generation_force_stopped_at is not None:
55
+ required_elapsed_time_after_forced_stop = 1
56
+ current_unix_time = time.time()
57
+ remaining_time = required_elapsed_time_after_forced_stop - \
58
+ (current_unix_time - Global.generation_force_stopped_at)
59
+ if remaining_time > 0:
60
+ time.sleep(remaining_time)
61
+ Global.generation_force_stopped_at = None
62
+
63
  variables = [variable_0, variable_1, variable_2, variable_3,
64
  variable_4, variable_5, variable_6, variable_7]
65
  prompter = Prompter(prompt_template)
66
  prompt = prompter.generate_prompt(variables)
67
 
 
68
  if Global.ui_dev_mode:
69
+ message = f"Hi, I’m currently in UI-development mode and do not have access to resources to process your request. However, this behavior is similar to what will actually happen, so you can try and see how it will work!\n\nBase model: {base_model_name}\nLoRA model: {lora_model_name}\n\nThe following is your prompt:\n\n{prompt}"
70
  print(message)
71
 
72
  if stream_output:
 
84
  yield out
85
 
86
  for partial_sentence in word_generator(message):
87
+ yield (
88
+ gr.Textbox.update(
89
+ value=partial_sentence, lines=inference_output_lines),
90
+ json.dumps(
91
+ list(range(len(partial_sentence.split()))), indent=2)
92
+ )
93
  time.sleep(0.05)
94
 
95
  return
96
  time.sleep(1)
97
+ yield (
98
+ gr.Textbox.update(value=message, lines=inference_output_lines),
99
+ json.dumps(list(range(len(message.split()))), indent=2)
100
+ )
101
  return
102
 
103
+ tokenizer = get_tokenizer(base_model_name)
104
+ model = get_model(base_model_name, lora_model_name)
 
 
106
  inputs = tokenizer(prompt, return_tensors="pt")
107
  input_ids = inputs["input_ids"].to(device)
 
121
  "max_new_tokens": max_new_tokens,
122
  }
123
 
124
+ def ui_generation_stopping_criteria(input_ids, score, **kwargs):
125
+ if Global.should_stop_generating:
126
+ return True
127
+ return False
128
+
129
+ Global.should_stop_generating = False
130
+ generate_params.setdefault(
131
+ "stopping_criteria", transformers.StoppingCriteriaList()
132
+ )
133
+ generate_params["stopping_criteria"].append(
134
+ ui_generation_stopping_criteria
135
+ )
136
+
137
  if stream_output:
138
  # Stream the reply 1 token at a time.
139
  # This is based on the trick of using 'stopping_criteria' to create an iterator,
 
165
  raw_output = None
166
  if show_raw:
167
  raw_output = str(output)
168
+ response = prompter.get_response(decoded_output)
169
+
170
+ if Global.should_stop_generating:
171
+ return
172
+
173
+ yield (
174
+ gr.Textbox.update(
175
+ value=response, lines=inference_output_lines),
176
+ raw_output)
177
+
178
+ if Global.should_stop_generating:
179
+ # If the user stops the generation, and then clicks the
180
+ # generation button again, they may mysteriously landed
181
+ # here, in the previous, should-be-stopped generation
182
+ # function call, with the new generation function not be
183
+ # called at all. To workaround this, we yield a message
184
+ # and setting lines=1, and if the front-end JS detects
185
+ # that lines has been set to 1 (rows="1" in HTML),
186
+ # it will automatically click the generate button again
187
+ # (gr.Textbox.update() does not support updating
188
+ # elem_classes or elem_id).
189
+ # [WORKAROUND-UI01]
190
+ yield (
191
+ gr.Textbox.update(
192
+ value="Please retry", lines=1),
193
+ None)
194
  return # early return for stream_output
195
 
196
  # Without streaming
197
  with torch.no_grad():
198
+ generation_output = model.generate(**generate_params)
199
  s = generation_output.sequences[0]
200
  output = tokenizer.decode(s)
201
  raw_output = None
202
  if show_raw:
203
  raw_output = str(s)
204
+
205
+ response = prompter.get_response(output)
206
+ if Global.should_stop_generating:
207
+ return
208
+
209
+ yield (
210
+ gr.Textbox.update(value=response, lines=inference_output_lines),
211
+ raw_output)
212
 
213
  except Exception as e:
214
  raise gr.Error(e)
215
 
216
 
217
+ def handle_stop_generate():
218
+ Global.generation_force_stopped_at = time.time()
219
+ Global.should_stop_generating = True
220
+
221
+
222
  def reload_selections(current_lora_model, current_prompt_template):
223
  available_template_names = get_available_template_names()
224
  available_template_names_with_none = available_template_names + ["None"]
 
240
  gr.Dropdown.update(choices=available_template_names_with_none, value=current_prompt_template))
241
 
242
 
243
+ def handle_prompt_template_change(prompt_template, lora_model):
244
  prompter = Prompter(prompt_template)
245
  var_names = prompter.get_variable_names()
246
  human_var_names = [' '.join(word.capitalize()
 
250
  while len(gr_updates) < 8:
251
  gr_updates.append(gr.Textbox.update(
252
  label="Not Used", visible=False))
253
+
254
+ model_prompt_template_message_update = gr.Markdown.update(
255
+ "", visible=False)
256
+ lora_mode_info = get_info_of_available_lora_model(lora_model)
257
+ if lora_mode_info and isinstance(lora_mode_info, dict):
258
+ model_prompt_template = lora_mode_info.get("prompt_template")
259
+ if model_prompt_template and model_prompt_template != prompt_template:
260
+ model_prompt_template_message_update = gr.Markdown.update(
261
+ f"This model was trained with prompt template `{model_prompt_template}`.", visible=True)
262
+
263
+ return [model_prompt_template_message_update] + gr_updates
264
+
265
+
266
+ def handle_lora_model_change(lora_model, prompt_template):
267
+ lora_mode_info = get_info_of_available_lora_model(lora_model)
268
+ if not lora_mode_info:
269
+ return gr.Markdown.update("", visible=False), prompt_template
270
+
271
+ if not isinstance(lora_mode_info, dict):
272
+ return gr.Markdown.update("", visible=False), prompt_template
273
+
274
+ model_prompt_template = lora_mode_info.get("prompt_template")
275
+ if not model_prompt_template:
276
+ return gr.Markdown.update("", visible=False), prompt_template
277
+
278
+ available_template_names = get_available_template_names()
279
+ if model_prompt_template in available_template_names:
280
+ return gr.Markdown.update("", visible=False), model_prompt_template
281
+
282
+ return gr.Markdown.update(f"Trained with prompt template `{model_prompt_template}`", visible=True), prompt_template
283
 
284
 
285
  def update_prompt_preview(prompt_template,
 
297
 
298
  with gr.Blocks() as inference_ui_blocks:
299
  with gr.Row():
300
+ with gr.Column(elem_id="inference_lora_model_group"):
301
+ model_prompt_template_message = gr.Markdown(
302
+ "", visible=False, elem_id="inference_lora_model_prompt_template_message")
303
+ lora_model = gr.Dropdown(
304
+ label="LoRA Model",
305
+ elem_id="inference_lora_model",
306
+ value="tloen/alpaca-lora-7b",
307
+ allow_custom_value=True,
308
+ )
309
  prompt_template = gr.Dropdown(
310
  label="Prompt Template",
311
  elem_id="inference_prompt_template",
 
378
  )
379
 
380
  num_beams = gr.Slider(
381
+ minimum=1, maximum=5, value=2, step=1,
382
  label="Beams",
383
  elem_id="inference_beams"
384
  )
 
418
  with gr.Column(elem_id="inference_output_group_container"):
419
  with gr.Column(elem_id="inference_output_group"):
420
  inference_output = gr.Textbox(
421
+ lines=inference_output_lines, label="Output", elem_id="inference_output")
422
  inference_output.style(show_copy_button=True)
423
  with gr.Accordion(
424
  "Raw Output",
 
446
  )
447
  things_that_might_timeout.append(reload_selections_event)
448
 
449
+ prompt_template_change_event = prompt_template.change(
450
+ fn=handle_prompt_template_change,
451
+ inputs=[prompt_template, lora_model],
452
+ outputs=[
453
+ model_prompt_template_message,
454
+ variable_0, variable_1, variable_2, variable_3, variable_4, variable_5, variable_6, variable_7])
455
  things_that_might_timeout.append(prompt_template_change_event)
456
 
457
+ lora_model_change_event = lora_model.change(
458
+ fn=handle_lora_model_change,
459
+ inputs=[lora_model, prompt_template],
460
+ outputs=[model_prompt_template_message, prompt_template])
461
+ things_that_might_timeout.append(lora_model_change_event)
462
+
463
  generate_event = generate_btn.click(
464
+ fn=prepare_inference,
465
+ inputs=[lora_model],
466
+ outputs=[inference_output, inference_raw_output],
467
+ ).then(
468
  fn=do_inference,
469
  inputs=[
470
  lora_model,
 
483
  outputs=[inference_output, inference_raw_output],
484
  api_name="inference"
485
  )
486
+ stop_btn.click(
487
+ fn=handle_stop_generate,
488
+ inputs=None,
489
+ outputs=None,
490
+ cancels=[generate_event]
491
+ )
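
The stop button wired up above combines two mechanisms: `handle_stop_generate` sets `Global.should_stop_generating` so the streaming generator can bail out on its own, and `cancels=[generate_event]` tells Gradio to abort the queued event. Below is a minimal, self-contained sketch of the same pattern; the `fake_generate` function and `should_stop` flag are stand-ins rather than the project's actual inference code, and it assumes a Gradio 3.x setup with queueing enabled.

```python
import time

import gradio as gr

should_stop = {"value": False}  # stand-in for Global.should_stop_generating


def fake_generate(prompt):
    # Hypothetical stand-in for do_inference: stream partial output.
    should_stop["value"] = False
    text = ""
    for word in (prompt or "hello world").split():
        if should_stop["value"]:
            return
        text += word + " "
        time.sleep(0.5)
        yield text


def handle_stop():
    # Mirrors handle_stop_generate: flip a flag the generator checks,
    # while cancels=[...] below also aborts the queued event itself.
    should_stop["value"] = True


with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    output = gr.Textbox(label="Output")
    generate_btn = gr.Button("Generate")
    stop_btn = gr.Button("Stop")

    generate_event = generate_btn.click(
        fn=fake_generate, inputs=[prompt], outputs=[output])
    stop_btn.click(fn=handle_stop, inputs=None, outputs=None,
                   cancels=[generate_event])

demo.queue().launch()
```
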
492
 
493
  update_prompt_preview_event = update_prompt_preview_btn.click(fn=update_prompt_preview, inputs=[prompt_template,
494
  variable_0, variable_1, variable_2, variable_3,
 
661
  return function (...args) {
662
  const context = this;
663
  clearTimeout(timeout);
664
+ const fn = () => {
665
+ if (document.querySelector('#inference_preview_prompt > .wrap:not(.hide)')) {
666
+ // Preview request is still loading; wait 10ms and try again.
667
+ timeout = setTimeout(fn, 10);
668
+ return;
669
+ }
670
  func.apply(context, args);
671
+ };
672
+ timeout = setTimeout(fn, wait);
673
  };
674
  }
675
 
 
704
  });
705
  }
706
  }, 100);
707
+
708
+ // [WORKAROUND-UI01]
709
+ setTimeout(function () {
710
+ const inference_output_textarea = document.querySelector(
711
+ '#inference_output textarea'
712
+ );
713
+ if (!inference_output_textarea) return;
714
+ const observer = new MutationObserver(function () {
715
+ if (inference_output_textarea.getAttribute('rows') === '1') {
716
+ setTimeout(function () {
717
+ const inference_generate_btn = document.getElementById(
718
+ 'inference_generate_btn'
719
+ );
720
+ if (inference_generate_btn) inference_generate_btn.click();
721
+ }, 10);
722
+ }
723
+ });
724
+ observer.observe(inference_output_textarea, {
725
+ attributes: true,
726
+ attributeFilter: ['rows'],
727
+ });
728
+ }, 100);
729
  }
730
  """)
llama_lora/ui/main_page.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
 
3
  from ..globals import Global
4
- from ..models import get_model_with_lora
5
 
6
  from .inference_ui import inference_ui
7
  from .finetune_ui import finetune_ui
@@ -30,8 +29,8 @@ def main_page():
30
  tokenizer_ui()
31
  info = []
32
  if Global.version:
33
- info.append(f"LLaMA-LoRA `{Global.version}`")
34
- info.append(f"Base model: `{Global.base_model}`")
35
  if Global.ui_show_sys_info:
36
  info.append(f"Data dir: `{Global.data_dir}`")
37
  gr.Markdown(f"""
@@ -134,6 +133,41 @@ def main_page_custom_css():
134
  /* text-transform: uppercase; */
135
  }
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  #inference_prompt_box > *:first-child {
138
  border-bottom-left-radius: 0;
139
  border-bottom-right-radius: 0;
@@ -193,6 +227,8 @@ def main_page_custom_css():
193
  #inference_raw_output > .wrap:first-child {
194
  /* allow users to select text while generation is still in progress */
195
  pointer-events: none;
 
 
196
  }
197
 
198
  /* position sticky */
@@ -266,12 +302,16 @@ def main_page_custom_css():
266
  }
267
 
268
  @media screen and (min-width: 640px) {
269
- #inference_lora_model, #finetune_template {
 
270
  border-top-right-radius: 0;
271
  border-bottom-right-radius: 0;
272
  border-right: 0;
273
  margin-right: -16px;
274
  }
 
 
 
275
 
276
  #inference_prompt_template {
277
  border-top-left-radius: 0;
@@ -301,7 +341,7 @@ def main_page_custom_css():
301
  height: 42px !important;
302
  min-width: 42px !important;
303
  width: 42px !important;
304
- z-index: 1;
305
  }
306
  }
307
 
@@ -388,6 +428,9 @@ def main_page_custom_css():
388
  white-space: pre-wrap;
389
  }
390
 
 
 
 
391
 
392
  @media screen and (max-width: 392px) {
393
  #inference_lora_model, #finetune_template {
 
1
  import gradio as gr
2
 
3
  from ..globals import Global
 
4
 
5
  from .inference_ui import inference_ui
6
  from .finetune_ui import finetune_ui
 
29
  tokenizer_ui()
30
  info = []
31
  if Global.version:
32
+ info.append(f"LLaMA-LoRA Tuner `{Global.version}`")
33
+ info.append(f"Base model: `{Global.default_base_model_name}`")
34
  if Global.ui_show_sys_info:
35
  info.append(f"Data dir: `{Global.data_dir}`")
36
  gr.Markdown(f"""
 
133
  /* text-transform: uppercase; */
134
  }
135
 
136
+ #inference_lora_model_group {
137
+ border-radius: var(--block-radius);
138
+ background: var(--block-background-fill);
139
+ }
140
+ #inference_lora_model_group #inference_lora_model {
141
+ background: transparent;
142
+ }
143
+ #inference_lora_model_prompt_template_message:not(.hidden) + #inference_lora_model {
144
+ padding-bottom: 28px;
145
+ }
146
+ #inference_lora_model_group > #inference_lora_model_prompt_template_message {
147
+ position: absolute;
148
+ bottom: 8px;
149
+ left: 20px;
150
+ z-index: 1;
151
+ font-size: 12px;
152
+ opacity: 0.7;
153
+ }
154
+ #inference_lora_model_group > #inference_lora_model_prompt_template_message p {
155
+ font-size: 12px;
156
+ }
157
+ #inference_lora_model_prompt_template_message > .wrap {
158
+ display: none;
159
+ }
160
+ #inference_lora_model > .wrap:first-child:not(.hide),
161
+ #inference_prompt_template > .wrap:first-child:not(.hide) {
162
+ opacity: 0.5;
163
+ }
164
+ #inference_lora_model_group, #inference_lora_model {
165
+ z-index: 60;
166
+ }
167
+ #inference_prompt_template {
168
+ z-index: 55;
169
+ }
170
+
171
  #inference_prompt_box > *:first-child {
172
  border-bottom-left-radius: 0;
173
  border-bottom-right-radius: 0;
 
227
  #inference_raw_output > .wrap:first-child {
228
  /* allow users to select text while generation is still in progress */
229
  pointer-events: none;
230
+
231
+ padding: 12px !important;
232
  }
233
 
234
  /* position sticky */
 
302
  }
303
 
304
  @media screen and (min-width: 640px) {
305
+ #inference_lora_model, #inference_lora_model_group,
306
+ #finetune_template {
307
  border-top-right-radius: 0;
308
  border-bottom-right-radius: 0;
309
  border-right: 0;
310
  margin-right: -16px;
311
  }
312
+ #inference_lora_model_group #inference_lora_model {
313
+ box-shadow: var(--block-shadow);
314
+ }
315
 
316
  #inference_prompt_template {
317
  border-top-left-radius: 0;
 
341
  height: 42px !important;
342
  min-width: 42px !important;
343
  width: 42px !important;
344
+ z-index: 61;
345
  }
346
  }
347
 
 
428
  white-space: pre-wrap;
429
  }
430
 
431
+ #finetune_max_seq_length {
432
+ flex: 2;
433
+ }
434
 
435
  @media screen and (max-width: 392px) {
436
  #inference_lora_model, #finetune_template {
llama_lora/ui/tokenizer_ui.py CHANGED
@@ -7,11 +7,12 @@ from ..models import get_tokenizer
7
 
8
 
9
  def handle_decode(encoded_tokens_json):
 
10
  try:
11
  encoded_tokens = json.loads(encoded_tokens_json)
12
  if Global.ui_dev_mode:
13
  return f"Not actually decoding tokens in UI dev mode.", gr.Markdown.update("", visible=False)
14
- tokenizer = get_tokenizer()
15
  decoded_tokens = tokenizer.decode(encoded_tokens)
16
  return decoded_tokens, gr.Markdown.update("", visible=False)
17
  except Exception as e:
@@ -19,10 +20,11 @@ def handle_decode(encoded_tokens_json):
19
 
20
 
21
  def handle_encode(decoded_tokens):
 
22
  try:
23
  if Global.ui_dev_mode:
24
  return f"[\"Not actually encoding tokens in UI dev mode.\"]", gr.Markdown.update("", visible=False)
25
- tokenizer = get_tokenizer()
26
  result = tokenizer(decoded_tokens)
27
  encoded_tokens_json = json.dumps(result['input_ids'], indent=2)
28
  return encoded_tokens_json, gr.Markdown.update("", visible=False)
 
7
 
8
 
9
  def handle_decode(encoded_tokens_json):
10
+ base_model_name = Global.default_base_model_name
11
  try:
12
  encoded_tokens = json.loads(encoded_tokens_json)
13
  if Global.ui_dev_mode:
14
  return f"Not actually decoding tokens in UI dev mode.", gr.Markdown.update("", visible=False)
15
+ tokenizer = get_tokenizer(base_model_name)
16
  decoded_tokens = tokenizer.decode(encoded_tokens)
17
  return decoded_tokens, gr.Markdown.update("", visible=False)
18
  except Exception as e:
 
20
 
21
 
22
  def handle_encode(decoded_tokens):
23
+ base_model_name = Global.default_base_model_name
24
  try:
25
  if Global.ui_dev_mode:
26
  return f"[\"Not actually encoding tokens in UI dev mode.\"]", gr.Markdown.update("", visible=False)
27
+ tokenizer = get_tokenizer(base_model_name)
28
  result = tokenizer(decoded_tokens)
29
  encoded_tokens_json = json.dumps(result['input_ids'], indent=2)
30
  return encoded_tokens_json, gr.Markdown.update("", visible=False)
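
`handle_decode` and `handle_encode` now resolve the tokenizer from `Global.default_base_model_name` instead of a single global tokenizer. The round trip the Tokenizer tab exposes boils down to the following sketch, using `gpt2` as a stand-in model name; any Hugging Face tokenizer behaves the same way.

```python
import json

from transformers import AutoTokenizer

# Stand-in model name; the app resolves this from Global.default_base_model_name.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# What handle_encode produces for the UI: a JSON array of token ids.
encoded_tokens_json = json.dumps(tokenizer("Hello, LoRA!")["input_ids"], indent=2)

# What handle_decode does with that JSON: parse it and decode back to text.
decoded_tokens = tokenizer.decode(json.loads(encoded_tokens_json))

print(encoded_tokens_json)
print(decoded_tokens)
```
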
llama_lora/utils/data.py CHANGED
@@ -52,6 +52,22 @@ def get_path_of_available_lora_model(name):
52
  return None
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def get_dataset_content(name):
56
  file_name = os.path.join(Global.data_dir, "datasets", name)
57
  if not os.path.exists(file_name):
 
52
  return None
53
 
54
 
55
+ def get_info_of_available_lora_model(name):
56
+ try:
57
+ if "/" in name:
58
+ return None
59
+ path_of_available_lora_model = get_path_of_available_lora_model(
60
+ name)
61
+ if not path_of_available_lora_model:
62
+ return None
63
+
64
+ with open(os.path.join(path_of_available_lora_model, "info.json"), "r") as json_file:
65
+ return json.load(json_file)
66
+
67
+ except Exception as e:
68
+ return None
69
+
70
+
71
  def get_dataset_content(name):
72
  file_name = os.path.join(Global.data_dir, "datasets", name)
73
  if not os.path.exists(file_name):
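
The new `get_info_of_available_lora_model` helper returns `None` for Hugging Face hub names (anything containing `/`) and otherwise reads `info.json` from the model's local directory; the inference UI above only looks at its `prompt_template` key. A rough sketch of that lookup follows, with illustrative file contents rather than a guaranteed schema.

```python
import json
import os


def read_lora_model_info(model_dir):
    # Same idea as get_info_of_available_lora_model: a missing or
    # unreadable info.json simply means "no metadata available".
    info_path = os.path.join(model_dir, "info.json")
    if not os.path.exists(info_path):
        return None
    try:
        with open(info_path, "r") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return None


# Illustrative contents of a trained model's info.json; only
# "prompt_template" is read by handle_prompt_template_change and
# handle_lora_model_change.
example_info = {
    "prompt_template": "user_and_ai",
}
```
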
llama_lora/utils/lru_cache.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
3
+
4
+ class LRUCache:
5
+ def __init__(self, capacity=5):
6
+ self.cache = OrderedDict()
7
+ self.capacity = capacity
8
+
9
+ def get(self, key):
10
+ if key in self.cache:
11
+ # Move the accessed item to the end of the OrderedDict
12
+ self.cache.move_to_end(key)
13
+ return self.cache[key]
14
+ return None
15
+
16
+ def set(self, key, value):
17
+ if key in self.cache:
18
+ # If the key already exists, update its value
19
+ self.cache[key] = value
20
+ else:
21
+ # If the cache has reached its capacity, remove the least recently used item
22
+ if len(self.cache) >= self.capacity:
23
+ self.cache.popitem(last=False)
24
+ self.cache[key] = value
25
+
26
+ def clear(self):
27
+ self.cache.clear()
28
+
29
+ def prepare_to_set(self):
30
+ if len(self.cache) >= self.capacity:
31
+ self.cache.popitem(last=False)
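
`LRUCache` is a small `OrderedDict`-based cache with `get`/`set`/`clear` plus a `prepare_to_set` helper that evicts ahead of an insertion. A quick usage sketch (the import path is written as if the package is importable as `llama_lora`; adjust to your setup):

```python
from llama_lora.utils.lru_cache import LRUCache

cache = LRUCache(capacity=2)
cache.set("a", 1)
cache.set("b", 2)

cache.get("a")      # touching "a" makes it the most recently used entry
cache.set("c", 3)   # capacity exceeded: "b" (least recently used) is evicted

assert cache.get("b") is None
assert cache.get("a") == 1 and cache.get("c") == 3
```

One plausible use, given the `get_tokenizer(base_model_name)` change above, is keeping a handful of loaded tokenizers or models around keyed by base model name.
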
requirements.lock.txt ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.18.0
2
+ aiofiles==23.1.0
3
+ aiohttp==3.8.4
4
+ aiosignal==1.3.1
5
+ altair==4.2.2
6
+ anyio==3.6.2
7
+ appdirs==1.4.4
8
+ asttokens==2.2.1
9
+ async-timeout==4.0.2
10
+ attrs==22.2.0
11
+ backcall==0.2.0
12
+ bitsandbytes==0.37.2
13
+ black==23.3.0
14
+ charset-normalizer==3.1.0
15
+ click==8.1.3
16
+ contourpy==1.0.7
17
+ cycler==0.11.0
18
+ datasets==2.11.0
19
+ decorator==5.1.1
20
+ dill==0.3.6
21
+ entrypoints==0.4
22
+ exceptiongroup==1.1.1
23
+ executing==1.2.0
24
+ fastapi==0.95.0
25
+ ffmpy==0.3.0
26
+ filelock==3.11.0
27
+ fire==0.5.0
28
+ fonttools==4.39.3
29
+ frozenlist==1.3.3
30
+ fsspec==2023.3.0
31
+ gradio==3.24.1
32
+ gradio_client==0.0.8
33
+ h11==0.14.0
34
+ httpcore==0.16.3
35
+ httpx==0.23.3
36
+ huggingface-hub==0.13.4
37
+ idna==3.4
38
+ importlib-metadata==6.2.0
39
+ importlib-resources==5.12.0
40
+ iniconfig==2.0.0
41
+ ipython==8.12.0
42
+ jedi==0.18.2
43
+ Jinja2==3.1.2
44
+ jsonschema==4.17.3
45
+ kiwisolver==1.4.4
46
+ linkify-it-py==2.0.0
47
+ llvmlite==0.39.1
48
+ loralib==0.1.1
49
+ markdown-it-py==2.2.0
50
+ MarkupSafe==2.1.2
51
+ matplotlib==3.7.1
52
+ matplotlib-inline==0.1.6
53
+ mdit-py-plugins==0.3.3
54
+ mdurl==0.1.2
55
+ mpmath==1.3.0
56
+ multidict==6.0.4
57
+ multiprocess==0.70.14
58
+ mypy-extensions==1.0.0
59
+ networkx==3.1
60
+ numba==0.56.4
61
+ numpy==1.23.5
62
+ nvidia-ml-py3==7.352.0
63
+ orjson==3.8.9
64
+ packaging==23.0
65
+ pandas==2.0.0
66
+ parso==0.8.3
67
+ pathspec==0.11.1
68
+ peft @ git+https://github.com/huggingface/peft.git@382b178911edff38c1ff619bbac2ba556bd2276b
69
+ pexpect==4.8.0
70
+ pickleshare==0.7.5
71
+ Pillow==9.3.0
72
+ pkgutil_resolve_name==1.3.10
73
+ platformdirs==3.2.0
74
+ pluggy==1.0.0
75
+ prompt-toolkit==3.0.38
76
+ psutil==5.9.4
77
+ ptyprocess==0.7.0
78
+ pure-eval==0.2.2
79
+ pyarrow==11.0.0
80
+ pydantic==1.10.7
81
+ pydub==0.25.1
82
+ Pygments==2.14.0
83
+ pyparsing==3.0.9
84
+ pyrsistent==0.19.3
85
+ pytest==7.2.2
86
+ python-dateutil==2.8.2
87
+ python-multipart==0.0.6
88
+ pytz==2023.3
89
+ PyYAML==6.0
90
+ Random-Word==1.0.11
91
+ regex==2023.3.23
92
+ requests==2.28.2
93
+ responses==0.18.0
94
+ rfc3986==1.5.0
95
+ semantic-version==2.10.0
96
+ sentencepiece==0.1.97
97
+ six==1.16.0
98
+ sniffio==1.3.0
99
+ stack-data==0.6.2
100
+ starlette==0.26.1
101
+ sympy==1.11.1
102
+ termcolor==2.2.0
103
+ tokenize-rt==5.0.0
104
+ tokenizers==0.13.3
105
+ tomli==2.0.1
106
+ toolz==0.12.0
107
+ torch==2.0.0
108
+ tqdm==4.65.0
109
+ traitlets==5.9.0
110
+ transformers @ git+https://github.com/huggingface/transformers.git@3f96e0b4e483c4c7d4ec9dcdc24b0b0cdf31ea5c
111
+ typing_extensions==4.5.0
112
+ tzdata==2023.3
113
+ uc-micro-py==1.0.1
114
+ urllib3==1.26.15
115
+ uvicorn==0.21.1
116
+ wcwidth==0.2.6
117
+ websockets==11.0.1
118
+ xxhash==3.2.0
119
+ yarl==1.8.2
120
+ zipp==3.15.0
templates/user_and_ai.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "description": "Unhelpful AI assistant.",
3
+ "variables": ["instruction"],
4
+ "prompt": "### User:\n{instruction}\n\n### AI:\n",
5
+ "default": "prompt",
6
+ "response_split": "### AI:"
7
+ }
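
The new `user_and_ai` template has a single `instruction` variable and splits the model's reply on `### AI:`. Filling it in and extracting a response amounts to plain string work, sketched below with `str.replace`; the project's own `Prompter` class may do this differently.

```python
import json

with open("templates/user_and_ai.json") as f:
    template = json.load(f)

instruction = "Name three uses of LoRA fine-tuning."  # example input
prompt = template["prompt"].replace("{instruction}", instruction)
# prompt is now:
# ### User:
# Name three uses of LoRA fine-tuning.
#
# ### AI:

# Pretend model output (prompt echoed back plus a completion):
model_output = prompt + " 1. ... 2. ... 3. ..."
response = model_output.split(template["response_split"])[-1].strip()
print(response)  # "1. ... 2. ... 3. ..."
```
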