diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..7204fb8b58f4a86370c09163cb1f5b6db391ec3a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +data/multi_step_merged_arc_v4.json filter=lfs diff=lfs merge=lfs -text +data/multi_step_verifiers_training.json filter=lfs diff=lfs merge=lfs -text +data/multi_step_verifiers_training.txt filter=lfs diff=lfs merge=lfs -text +data/re_arc_v3.json filter=lfs diff=lfs merge=lfs -text +data/re_arc_v3.txt filter=lfs diff=lfs merge=lfs -text +data/re_arc_v4.json filter=lfs diff=lfs merge=lfs -text +data/re_arc_v4.txt filter=lfs diff=lfs merge=lfs -text +assets/logo.png filter=lfs diff=lfs merge=lfs -text +assets/wechat.jpg filter=lfs diff=lfs merge=lfs -text +assets/wechat_npu.jpg filter=lfs diff=lfs merge=lfs -text +data/mllm_demo_data/1.jpg filter=lfs diff=lfs merge=lfs -text +data/mllm_demo_data/1.mp4 filter=lfs diff=lfs merge=lfs -text +data/mllm_demo_data/2.avi filter=lfs diff=lfs merge=lfs -text +data/mllm_demo_data/2.jpg filter=lfs diff=lfs merge=lfs -text +data/mllm_demo_data/3.jpg filter=lfs diff=lfs merge=lfs -text +data/mllm_demo_data/3.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/assets/benchmark.svg b/assets/benchmark.svg new file mode 100644 index 0000000000000000000000000000000000000000..60f0aa4d39fb48df9b36dd4ebcb5b294e2f4ecce --- /dev/null +++ b/assets/benchmark.svg @@ -0,0 +1,1216 @@ + + + + + + + + 2023-11-18T11:28:03.028228 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.orgdiff --git a/assets/logo.png b/assets/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..077a9681cfe5246109ebaf958acff89c39a11d39 --- /dev/null +++ b/assets/logo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:21883234c575c86b5dd7cad711ff2c4428deefc60022465e7eba7d01dad07579 +size 56849 diff --git a/assets/wechat.jpg b/assets/wechat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3f4240cd8a8d52e5aeaf677b925988a18b7a8f10 --- /dev/null +++ b/assets/wechat.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35bb80541ad2254475d008806a0bb805e4143817662f43548b20c807f7f03651 +size 169511 diff --git a/assets/wechat_npu.jpg b/assets/wechat_npu.jpg new file mode 100644 index 0000000000000000000000000000000000000000..94cb052a68c38d8eb83fced2c9f4d02e04ba17cb --- /dev/null +++ b/assets/wechat_npu.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6837a9019981c4f34dfc8a8e05a5d3c755bc9ca940d3d800e9e5c1a0c210b7a3 +size 169236 diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1786804fa21b28dc85a3ae46bd2272717862c811 --- /dev/null +++ b/data/README.md @@ -0,0 +1,419 @@ +The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it. + +Currently we support datasets in **alpaca** and **sharegpt** format. + +```json +"dataset_name": { + "hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore script_url and file_name)", + "ms_hub_url": "the name of the dataset repository on the Model Scope hub. (if specified, ignore script_url and file_name)", + "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name)", + "file_name": "the name of the dataset folder or dataset file in this directory. (required if above are not specified)", + "formatting": "the format of the dataset. 
(optional, default: alpaca, can be chosen from {alpaca, sharegpt})", + "ranking": "whether the dataset is a preference dataset or not. (default: False)", + "subset": "the name of the subset. (optional, default: None)", + "split": "the name of dataset split to be used. (optional, default: train)", + "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)", + "num_samples": "the number of samples in the dataset to be used. (optional, default: None)", + "columns (optional)": { + "prompt": "the column name in the dataset containing the prompts. (default: instruction)", + "query": "the column name in the dataset containing the queries. (default: input)", + "response": "the column name in the dataset containing the responses. (default: output)", + "history": "the column name in the dataset containing the histories. (default: None)", + "messages": "the column name in the dataset containing the messages. (default: conversations)", + "system": "the column name in the dataset containing the system prompts. (default: None)", + "tools": "the column name in the dataset containing the tool description. (default: None)", + "images": "the column name in the dataset containing the image inputs. (default: None)", + "videos": "the column name in the dataset containing the video inputs. (default: None)", + "chosen": "the column name in the dataset containing the chosen answers. (default: None)", + "rejected": "the column name in the dataset containing the rejected answers. (default: None)", + "kto_tag": "the column name in the dataset containing the kto tags. (default: None)" + }, + "tags (optional, used for the sharegpt format)": { + "role_tag": "the key in the message represents the identity. (default: from)", + "content_tag": "the key in the message represents the content. (default: value)", + "user_tag": "the value of the role_tag represents the user. 
(default: human)", + "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)", + "observation_tag": "the value of the role_tag represents the tool results. (default: observation)", + "function_tag": "the value of the role_tag represents the function call. (default: function_call)", + "system_tag": "the value of the role_tag represents the system prompt. (default: system, can override system column)" + } +} +``` + +## Alpaca Format + +### Supervised Fine-Tuning Dataset + +* [Example dataset](alpaca_en_demo.json) + +In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the human prompt, then the human prompt would be `instruction\ninput`. The `output` column represents the model response. + +The `system` column will be used as the system prompt if specified. + +The `history` column is a list consisting of string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning. + +```json +[ + { + "instruction": "human instruction (required)", + "input": "human input (optional)", + "output": "model response (required)", + "system": "system prompt (optional)", + "history": [ + ["human instruction in the first round (optional)", "model response in the first round (optional)"], + ["human instruction in the second round (optional)", "model response in the second round (optional)"] + ] + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "columns": { + "prompt": "instruction", + "query": "input", + "response": "output", + "system": "system", + "history": "history" + } +} +``` + +### Pre-training Dataset + +- [Example dataset](c4_demo.json) + +In pre-training, only the `text` column will be used for model learning. 
+ +```json +[ + {"text": "document"}, + {"text": "document"} +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "columns": { + "prompt": "text" + } +} +``` + +### Preference Dataset + +Preference datasets are used for reward modeling, DPO training, ORPO and SimPO training. + +It requires a better response in `chosen` column and a worse response in `rejected` column. + +```json +[ + { + "instruction": "human instruction (required)", + "input": "human input (optional)", + "chosen": "chosen answer (required)", + "rejected": "rejected answer (required)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "ranking": true, + "columns": { + "prompt": "instruction", + "query": "input", + "chosen": "chosen", + "rejected": "rejected" + } +} +``` + +### KTO Dataset + +An additional column `kto_tag` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Image Dataset + +An additional column `images` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Video Dataset + +An additional column `videos` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +## Sharegpt Format + +### Supervised Fine-Tuning Dataset + +- [Example dataset](glaive_toolcall_en_demo.json) + +Compared to the alpaca format, the sharegpt format allows datasets to have **more roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column. + +Note that the human and observation should appear in odd positions, while gpt and function should appear in even positions. 
+ +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "human instruction" + }, + { + "from": "function_call", + "value": "tool arguments" + }, + { + "from": "observation", + "value": "tool result" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "system": "system prompt (optional)", + "tools": "tool description (optional)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "system": "system", + "tools": "tools" + } +} +``` + +### Pre-training Dataset + +Not yet supported, please use the [alpaca](#alpaca-format) format. + +### Preference Dataset + +- [Example dataset](dpo_en_demo.json) + +Preference datasets in sharegpt format also require a better message in `chosen` column and a worse message in `rejected` column. + +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "human instruction" + }, + { + "from": "gpt", + "value": "model response" + }, + { + "from": "human", + "value": "human instruction" + } + ], + "chosen": { + "from": "gpt", + "value": "chosen answer (required)" + }, + "rejected": { + "from": "gpt", + "value": "rejected answer (required)" + } + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "ranking": true, + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } +} +``` + +### KTO Dataset + +- [Example dataset](kto_en_demo.json) + +KTO datasets require an extra `kto_tag` column containing the boolean human feedback. 
+ +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "human instruction" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "kto_tag": "human feedback [true/false] (required)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "kto_tag": "kto_tag" + } +} +``` + +### Multimodal Image Dataset + +- [Example dataset](mllm_demo.json) + +Multimodal image datasets require an `images` column containing the paths to the input images. + +The number of images should be identical to the `<image>` tokens in the conversations. + +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "human instruction" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "images": [ + "image path (required)" + ] + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "images": "images" + } +} +``` + +### Multimodal Video Dataset + +- [Example dataset](mllm_video_demo.json) + +Multimodal video datasets require a `videos` column containing the paths to the input videos. + +The number of videos should be identical to the `