{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a1e514e1-c921-4fdb-a877-fef7a22d73cd",
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import re\n",
"\n",
"import pandas as pd\n",
"\n",
"# ───────────────────────── Парсер ──────────────────────────\n",
"def parse_smart_log(path_or_str, top_n=15):\n",
"    \"\"\"Parse a hyper-parameter search log into a ranked DataFrame.\n",
"\n",
"    Each 'Шаг N: k1 + k2 = (v1, v2)' line opens a step. The first MEAN value\n",
"    after 'Результаты (DEV):' / 'Результаты (TEST):' becomes the dev / test\n",
"    score, and the first 'GAP = x' line after the TEST header becomes the gap.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    path_or_str : str or os.PathLike\n",
"        The log text itself (a str containing a newline or the word 'Шаг')\n",
"        or a path to a UTF-8 encoded log file.\n",
"    top_n : int or None, default 15\n",
"        How many top rows to keep; None keeps every row.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per step that has both a dev and a test score, with the\n",
"        columns step, one per hyper-parameter, dev, test and gap, sorted by\n",
"        test in descending order and re-indexed from 0. Empty when no\n",
"        complete step is found.\n",
"\n",
"    Raises\n",
"    ------\n",
"    OSError\n",
"        If the argument is treated as a path and the file cannot be opened.\n",
"    \"\"\"\n",
"    # A str with a newline or the step keyword is the log itself; anything\n",
"    # else (including pathlib.Path objects) is opened as a file.\n",
"    is_text = isinstance(path_or_str, str) and (\n",
"        '\\n' in path_or_str or 'Шаг' in path_or_str\n",
"    )\n",
"    if is_text:\n",
"        lines = path_or_str.splitlines()\n",
"    else:\n",
"        with open(path_or_str, encoding='utf-8') as f:\n",
"            lines = f.readlines()\n",
"\n",
"    step_re = re.compile(\n",
"        r\"Шаг\\s+(\\d+):\\s*([^=]+?)=\\s*\\((.*?)\\)\"\n",
"    )\n",
"    mean_re = re.compile(r\"MEAN\\s*=\\s*([0-9.]+)\")\n",
"    gap_re = re.compile(r\"GAP\\s*=\\s*([+-]?[0-9.]+)\")\n",
"\n",
"    def to_value(text):\n",
"        # literal_eval turns '0.001' into a float and '8' into an int without\n",
"        # executing code taken from the log; bare words stay plain strings.\n",
"        try:\n",
"            return ast.literal_eval(text)\n",
"        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):\n",
"            return text\n",
"\n",
"    def first_after(pattern, start):\n",
"        # First number matched by `pattern` below line `start`. The scan stops\n",
"        # at the next step header so that a step with a missing metric never\n",
"        # borrows the value that belongs to a later step.\n",
"        for j in range(start + 1, len(lines)):\n",
"            if step_re.search(lines[j]):\n",
"                return None\n",
"            hit = pattern.search(lines[j])\n",
"            if hit:\n",
"                return float(hit.group(1))\n",
"        return None\n",
"\n",
"    rows = []\n",
"\n",
"    def flush(row):\n",
"        # Keep a step only when both scores were found. Membership is tested\n",
"        # rather than truthiness so that a legitimate score of 0.0 is kept.\n",
"        if 'dev' in row and 'test' in row:\n",
"            # Fallback when the log has no GAP line: dev - test, which is the\n",
"            # sign the logged GAP values appear to follow.\n",
"            row.setdefault('gap', round(row['dev'] - row['test'], 4))\n",
"            rows.append(row)\n",
"\n",
"    current = {}\n",
"    for i, raw in enumerate(lines):\n",
"        line = raw.rstrip(\"\\n\")\n",
"\n",
"        # 1. A step header closes the previous step and opens a new one.\n",
"        m = step_re.search(line)\n",
"        if m:\n",
"            flush(current)\n",
"            current = {'step': int(m.group(1))}\n",
"\n",
"            keys = [k.strip() for k in m.group(2).split('+')]\n",
"            # Split the value tuple on commas, keeping quoted strings intact.\n",
"            raw_vals = re.findall(r\"'[^']*'|[^,]+\", m.group(3))\n",
"            vals = [v.strip().strip(\"'\") for v in raw_vals]\n",
"            for k, v in zip(keys, vals):\n",
"                current[k] = to_value(v)\n",
"\n",
"        # 2. DEV section: take the first MEAN that follows the header.\n",
"        if \"Результаты (DEV):\" in line:\n",
"            dev = first_after(mean_re, i)\n",
"            if dev is not None:\n",
"                current['dev'] = dev\n",
"\n",
"        # 3. TEST section: first MEAN and, when present, first GAP.\n",
"        if \"Результаты (TEST):\" in line:\n",
"            test = first_after(mean_re, i)\n",
"            if test is not None:\n",
"                current['test'] = test\n",
"            gap = first_after(gap_re, i)\n",
"            if gap is not None:\n",
"                current['gap'] = gap\n",
"\n",
"    # The last step has no following header to trigger the flush.\n",
"    flush(current)\n",
"\n",
"    df = pd.DataFrame(rows)\n",
"    if not df.empty:\n",
"        df = df.sort_values('test', ascending=False)\n",
"        if top_n is not None:\n",
"            df = df.head(top_n)\n",
"        df = df.reset_index(drop=True)\n",
"    return df\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d40405fe-1159-4d73-94ff-1084124840a1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 37 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5904 | \n",
" 0.5756 | \n",
" 0.0148 | \n",
"
\n",
" \n",
" 1 | \n",
" 38 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5904 | \n",
" 0.5756 | \n",
" 0.0148 | \n",
"
\n",
" \n",
" 2 | \n",
" 47 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5877 | \n",
" 0.5771 | \n",
" 0.0106 | \n",
"
\n",
" \n",
" 3 | \n",
" 40 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5856 | \n",
" 0.5751 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 4 | \n",
" 64 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5830 | \n",
" 0.5705 | \n",
" 0.0126 | \n",
"
\n",
" \n",
" 5 | \n",
" 45 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5814 | \n",
" 0.5825 | \n",
" -0.0011 | \n",
"
\n",
" \n",
" 6 | \n",
" 46 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5814 | \n",
" 0.5825 | \n",
" -0.0011 | \n",
"
\n",
" \n",
" 7 | \n",
" 3 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5810 | \n",
" 0.5788 | \n",
" 0.0022 | \n",
"
\n",
" \n",
" 8 | \n",
" 4 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5801 | \n",
" 0.5770 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 9 | \n",
" 48 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5798 | \n",
" 0.5731 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 10 | \n",
" 21 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5797 | \n",
" 0.5686 | \n",
" 0.0111 | \n",
"
\n",
" \n",
" 11 | \n",
" 22 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5797 | \n",
" 0.5686 | \n",
" 0.0111 | \n",
"
\n",
" \n",
" 12 | \n",
" 62 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5795 | \n",
" 0.5726 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 13 | \n",
" 61 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5795 | \n",
" 0.5726 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 14 | \n",
" 53 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5785 | \n",
" 0.5678 | \n",
" 0.0107 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dev test gap\n",
"0 37 0.0001 8 4 adam plateau 0.05 0.5904 0.5756 0.0148\n",
"1 38 0.0001 8 4 adam plateau 0.10 0.5904 0.5756 0.0148\n",
"2 47 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.5877 0.5771 0.0106\n",
"3 40 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.5856 0.5751 0.0105\n",
"4 64 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.5830 0.5705 0.0126\n",
"5 45 0.0001 8 5 adam plateau 0.05 0.5814 0.5825 -0.0011\n",
"6 46 0.0001 8 5 adam plateau 0.10 0.5814 0.5825 -0.0011\n",
"7 3 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.05 0.5810 0.5788 0.0022\n",
"8 4 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.10 0.5801 0.5770 0.0031\n",
"9 48 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.5798 0.5731 0.0066\n",
"10 21 0.0010 16 4 adam plateau 0.05 0.5797 0.5686 0.0111\n",
"11 22 0.0010 16 4 adam plateau 0.10 0.5797 0.5686 0.0111\n",
"12 62 0.0001 16 5 adam plateau 0.10 0.5795 0.5726 0.0069\n",
"13 61 0.0001 16 5 adam plateau 0.05 0.5795 0.5726 0.0069\n",
"14 53 0.0001 16 4 adam plateau 0.05 0.5785 0.5678 0.0107"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the run log, keeping the top 25 rows by test score.\n",
"# NOTE(review): absolute local path; consider a configurable data directory.\n",
"df = parse_smart_log(\n",
"    \"C:/Users/Alexandr/Desktop/sampling/10.txt\",\n",
"    top_n=25,\n",
")\n",
"\n",
"from IPython.display import display\n",
"\n",
"# Show every column and allow 160 characters per line before wrapping.\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"# Only the first 15 of the parsed rows are displayed.\n",
"display(df.head(15))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "248d2c65-2222-44b5-a83e-20e1c2048ba4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 56 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5891 | \n",
" 0.5777 | \n",
" 0.0114 | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5853 | \n",
" 0.5823 | \n",
" 0.0030 | \n",
"
\n",
" \n",
" 2 | \n",
" 53 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5845 | \n",
" 0.5774 | \n",
" 0.0072 | \n",
"
\n",
" \n",
" 3 | \n",
" 54 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5845 | \n",
" 0.5774 | \n",
" 0.0072 | \n",
"
\n",
" \n",
" 4 | \n",
" 30 | \n",
" 0.0010 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5839 | \n",
" 0.5694 | \n",
" 0.0145 | \n",
"
\n",
" \n",
" 5 | \n",
" 29 | \n",
" 0.0010 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5839 | \n",
" 0.5694 | \n",
" 0.0145 | \n",
"
\n",
" \n",
" 6 | \n",
" 62 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5838 | \n",
" 0.5775 | \n",
" 0.0063 | \n",
"
\n",
" \n",
" 7 | \n",
" 61 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5838 | \n",
" 0.5775 | \n",
" 0.0063 | \n",
"
\n",
" \n",
" 8 | \n",
" 38 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5834 | \n",
" 0.5836 | \n",
" -0.0002 | \n",
"
\n",
" \n",
" 9 | \n",
" 37 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5834 | \n",
" 0.5836 | \n",
" -0.0002 | \n",
"
\n",
" \n",
" 10 | \n",
" 40 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5822 | \n",
" 0.5663 | \n",
" 0.0159 | \n",
"
\n",
" \n",
" 11 | \n",
" 19 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5822 | \n",
" 0.5803 | \n",
" 0.0019 | \n",
"
\n",
" \n",
" 12 | \n",
" 21 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5805 | \n",
" 0.5740 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 13 | \n",
" 22 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5805 | \n",
" 0.5740 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 14 | \n",
" 20 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5803 | \n",
" 0.5706 | \n",
" 0.0097 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dev test gap\n",
"0 56 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.5891 0.5777 0.0114\n",
"1 3 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.05 0.5853 0.5823 0.0030\n",
"2 53 0.0001 16 4 adam plateau 0.05 0.5845 0.5774 0.0072\n",
"3 54 0.0001 16 4 adam plateau 0.10 0.5845 0.5774 0.0072\n",
"4 30 0.0010 16 5 adam plateau 0.10 0.5839 0.5694 0.0145\n",
"5 29 0.0010 16 5 adam plateau 0.05 0.5839 0.5694 0.0145\n",
"6 62 0.0001 16 5 adam plateau 0.10 0.5838 0.5775 0.0063\n",
"7 61 0.0001 16 5 adam plateau 0.05 0.5838 0.5775 0.0063\n",
"8 38 0.0001 8 4 adam plateau 0.10 0.5834 0.5836 -0.0002\n",
"9 37 0.0001 8 4 adam plateau 0.05 0.5834 0.5836 -0.0002\n",
"10 40 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.5822 0.5663 0.0159\n",
"11 19 0.0010 16 4 sgd huggingface_cosine_with_restarts 0.05 0.5822 0.5803 0.0019\n",
"12 21 0.0010 16 4 adam plateau 0.05 0.5805 0.5740 0.0065\n",
"13 22 0.0010 16 4 adam plateau 0.10 0.5805 0.5740 0.0065\n",
"14 20 0.0010 16 4 sgd huggingface_cosine_with_restarts 0.10 0.5803 0.5706 0.0097"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the run log, keeping the top 25 rows by test score.\n",
"# NOTE(review): absolute local path; consider a configurable data directory.\n",
"df = parse_smart_log(\n",
"    \"C:/Users/Alexandr/Desktop/sampling/20.txt\",\n",
"    top_n=25,\n",
")\n",
"\n",
"from IPython.display import display\n",
"\n",
"# Show every column and allow 160 characters per line before wrapping.\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"# Only the first 15 of the parsed rows are displayed.\n",
"display(df.head(15))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7e06dfea-d6cc-479b-8113-3b0140840db8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 64 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5812 | \n",
" 0.5712 | \n",
" 0.0100 | \n",
"
\n",
" \n",
" 1 | \n",
" 47 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5811 | \n",
" 0.5710 | \n",
" 0.0101 | \n",
"
\n",
" \n",
" 2 | \n",
" 48 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5800 | \n",
" 0.5722 | \n",
" 0.0078 | \n",
"
\n",
" \n",
" 3 | \n",
" 20 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5791 | \n",
" 0.5724 | \n",
" 0.0067 | \n",
"
\n",
" \n",
" 4 | \n",
" 27 | \n",
" 0.0010 | \n",
" 16 | \n",
" 5 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5790 | \n",
" 0.5746 | \n",
" 0.0044 | \n",
"
\n",
" \n",
" 5 | \n",
" 19 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5783 | \n",
" 0.5700 | \n",
" 0.0083 | \n",
"
\n",
" \n",
" 6 | \n",
" 4 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5774 | \n",
" 0.5654 | \n",
" 0.0120 | \n",
"
\n",
" \n",
" 7 | \n",
" 54 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5773 | \n",
" 0.5697 | \n",
" 0.0075 | \n",
"
\n",
" \n",
" 8 | \n",
" 53 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5773 | \n",
" 0.5697 | \n",
" 0.0075 | \n",
"
\n",
" \n",
" 9 | \n",
" 38 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5761 | \n",
" 0.5737 | \n",
" 0.0024 | \n",
"
\n",
" \n",
" 10 | \n",
" 37 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5761 | \n",
" 0.5737 | \n",
" 0.0024 | \n",
"
\n",
" \n",
" 11 | \n",
" 3 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5759 | \n",
" 0.5736 | \n",
" 0.0023 | \n",
"
\n",
" \n",
" 12 | \n",
" 63 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5757 | \n",
" 0.5649 | \n",
" 0.0108 | \n",
"
\n",
" \n",
" 13 | \n",
" 29 | \n",
" 0.0010 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5754 | \n",
" 0.5665 | \n",
" 0.0090 | \n",
"
\n",
" \n",
" 14 | \n",
" 30 | \n",
" 0.0010 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5754 | \n",
" 0.5665 | \n",
" 0.0090 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dev test gap\n",
"0 64 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.5812 0.5712 0.0100\n",
"1 47 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.5811 0.5710 0.0101\n",
"2 48 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.5800 0.5722 0.0078\n",
"3 20 0.0010 16 4 sgd huggingface_cosine_with_restarts 0.10 0.5791 0.5724 0.0067\n",
"4 27 0.0010 16 5 sgd huggingface_cosine_with_restarts 0.05 0.5790 0.5746 0.0044\n",
"5 19 0.0010 16 4 sgd huggingface_cosine_with_restarts 0.05 0.5783 0.5700 0.0083\n",
"6 4 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.10 0.5774 0.5654 0.0120\n",
"7 54 0.0001 16 4 adam plateau 0.10 0.5773 0.5697 0.0075\n",
"8 53 0.0001 16 4 adam plateau 0.05 0.5773 0.5697 0.0075\n",
"9 38 0.0001 8 4 adam plateau 0.10 0.5761 0.5737 0.0024\n",
"10 37 0.0001 8 4 adam plateau 0.05 0.5761 0.5737 0.0024\n",
"11 3 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.05 0.5759 0.5736 0.0023\n",
"12 63 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.5757 0.5649 0.0108\n",
"13 29 0.0010 16 5 adam plateau 0.05 0.5754 0.5665 0.0090\n",
"14 30 0.0010 16 5 adam plateau 0.10 0.5754 0.5665 0.0090"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the run log, keeping the top 25 rows by test score.\n",
"# NOTE(review): absolute local path; consider a configurable data directory.\n",
"df = parse_smart_log(\n",
"    \"C:/Users/Alexandr/Desktop/sampling/30.txt\",\n",
"    top_n=25,\n",
")\n",
"\n",
"from IPython.display import display\n",
"\n",
"# Show every column and allow 160 characters per line before wrapping.\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"# Only the first 15 of the parsed rows are displayed.\n",
"display(df.head(15))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c3f7929b-5279-4490-84e6-f0e4309f769d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 37 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5827 | \n",
" 0.5734 | \n",
" 0.0093 | \n",
"
\n",
" \n",
" 1 | \n",
" 38 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5827 | \n",
" 0.5734 | \n",
" 0.0093 | \n",
"
\n",
" \n",
" 2 | \n",
" 56 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5821 | \n",
" 0.5589 | \n",
" 0.0232 | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5813 | \n",
" 0.5801 | \n",
" 0.0012 | \n",
"
\n",
" \n",
" 4 | \n",
" 39 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5810 | \n",
" 0.5716 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 5 | \n",
" 17 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5806 | \n",
" 0.5757 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 6 | \n",
" 18 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5806 | \n",
" 0.5757 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 7 | \n",
" 63 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5798 | \n",
" 0.5767 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 8 | \n",
" 28 | \n",
" 0.0010 | \n",
" 16 | \n",
" 5 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5797 | \n",
" 0.5710 | \n",
" 0.0087 | \n",
"
\n",
" \n",
" 9 | \n",
" 19 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5795 | \n",
" 0.5731 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 10 | \n",
" 4 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5791 | \n",
" 0.5681 | \n",
" 0.0110 | \n",
"
\n",
" \n",
" 11 | \n",
" 20 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5790 | \n",
" 0.5682 | \n",
" 0.0109 | \n",
"
\n",
" \n",
" 12 | \n",
" 1 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5778 | \n",
" 0.5674 | \n",
" 0.0104 | \n",
"
\n",
" \n",
" 13 | \n",
" 2 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5778 | \n",
" 0.5674 | \n",
" 0.0104 | \n",
"
\n",
" \n",
" 14 | \n",
" 64 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5755 | \n",
" 0.5670 | \n",
" 0.0085 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dev test gap\n",
"0 37 0.0001 8 4 adam plateau 0.05 0.5827 0.5734 0.0093\n",
"1 38 0.0001 8 4 adam plateau 0.10 0.5827 0.5734 0.0093\n",
"2 56 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.5821 0.5589 0.0232\n",
"3 3 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.05 0.5813 0.5801 0.0012\n",
"4 39 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.5810 0.5716 0.0094\n",
"5 17 0.0010 16 4 sgd plateau 0.05 0.5806 0.5757 0.0049\n",
"6 18 0.0010 16 4 sgd plateau 0.10 0.5806 0.5757 0.0049\n",
"7 63 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.5798 0.5767 0.0031\n",
"8 28 0.0010 16 5 sgd huggingface_cosine_with_restarts 0.10 0.5797 0.5710 0.0087\n",
"9 19 0.0010 16 4 sgd huggingface_cosine_with_restarts 0.05 0.5795 0.5731 0.0065\n",
"10 4 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.10 0.5791 0.5681 0.0110\n",
"11 20 0.0010 16 4 sgd huggingface_cosine_with_restarts 0.10 0.5790 0.5682 0.0109\n",
"12 1 0.0010 8 4 sgd plateau 0.05 0.5778 0.5674 0.0104\n",
"13 2 0.0010 8 4 sgd plateau 0.10 0.5778 0.5674 0.0104\n",
"14 64 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.5755 0.5670 0.0085"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the run log, keeping the top 25 rows by test score.\n",
"# NOTE(review): absolute local path; consider a configurable data directory.\n",
"df = parse_smart_log(\n",
"    \"C:/Users/Alexandr/Desktop/sampling/40.txt\",\n",
"    top_n=25,\n",
")\n",
"\n",
"from IPython.display import display\n",
"\n",
"# Show every column and allow 160 characters per line before wrapping.\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"# Only the first 15 of the parsed rows are displayed.\n",
"display(df.head(15))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "70da89eb-18e6-4795-8b83-1116fe1fa968",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 56 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5919 | \n",
" 0.5773 | \n",
" 0.0146 | \n",
"
\n",
" \n",
" 1 | \n",
" 40 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5911 | \n",
" 0.5771 | \n",
" 0.0140 | \n",
"
\n",
" \n",
" 2 | \n",
" 54 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5879 | \n",
" 0.5774 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 3 | \n",
" 53 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5879 | \n",
" 0.5774 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 4 | \n",
" 37 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5873 | \n",
" 0.5722 | \n",
" 0.0152 | \n",
"
\n",
" \n",
" 5 | \n",
" 38 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5873 | \n",
" 0.5722 | \n",
" 0.0152 | \n",
"
\n",
" \n",
" 6 | \n",
" 18 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5861 | \n",
" 0.5761 | \n",
" 0.0100 | \n",
"
\n",
" \n",
" 7 | \n",
" 17 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5861 | \n",
" 0.5761 | \n",
" 0.0100 | \n",
"
\n",
" \n",
" 8 | \n",
" 19 | \n",
" 0.0010 | \n",
" 16 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5838 | \n",
" 0.5826 | \n",
" 0.0012 | \n",
"
\n",
" \n",
" 9 | \n",
" 62 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5837 | \n",
" 0.5732 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 10 | \n",
" 61 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5837 | \n",
" 0.5732 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 11 | \n",
" 55 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5836 | \n",
" 0.5692 | \n",
" 0.0144 | \n",
"
\n",
" \n",
" 12 | \n",
" 39 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5834 | \n",
" 0.5705 | \n",
" 0.0129 | \n",
"
\n",
" \n",
" 13 | \n",
" 4 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5826 | \n",
" 0.5795 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 14 | \n",
" 3 | \n",
" 0.0010 | \n",
" 8 | \n",
" 4 | \n",
" sgd | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5825 | \n",
" 0.5793 | \n",
" 0.0032 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dev test gap\n",
"0 56 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.5919 0.5773 0.0146\n",
"1 40 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.5911 0.5771 0.0140\n",
"2 54 0.0001 16 4 adam plateau 0.10 0.5879 0.5774 0.0105\n",
"3 53 0.0001 16 4 adam plateau 0.05 0.5879 0.5774 0.0105\n",
"4 37 0.0001 8 4 adam plateau 0.05 0.5873 0.5722 0.0152\n",
"5 38 0.0001 8 4 adam plateau 0.10 0.5873 0.5722 0.0152\n",
"6 18 0.0010 16 4 sgd plateau 0.10 0.5861 0.5761 0.0100\n",
"7 17 0.0010 16 4 sgd plateau 0.05 0.5861 0.5761 0.0100\n",
"8 19 0.0010 16 4 sgd huggingface_cosine_with_restarts 0.05 0.5838 0.5826 0.0012\n",
"9 62 0.0001 16 5 adam plateau 0.10 0.5837 0.5732 0.0105\n",
"10 61 0.0001 16 5 adam plateau 0.05 0.5837 0.5732 0.0105\n",
"11 55 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.5836 0.5692 0.0144\n",
"12 39 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.5834 0.5705 0.0129\n",
"13 4 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.10 0.5826 0.5795 0.0031\n",
"14 3 0.0010 8 4 sgd huggingface_cosine_with_restarts 0.05 0.5825 0.5793 0.0032"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the run log, keeping the top 25 rows by test score.\n",
"# NOTE(review): absolute local path; consider a configurable data directory.\n",
"df = parse_smart_log(\n",
"    \"C:/Users/Alexandr/Desktop/sampling/50.txt\",\n",
"    top_n=25,\n",
")\n",
"\n",
"from IPython.display import display\n",
"\n",
"# Show every column and allow 160 characters per line before wrapping.\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"# Only the first 15 of the parsed rows are displayed.\n",
"display(df.head(15))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6614a37d-344e-46a5-b5ba-e49d4010027a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" weight_decay | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 27 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5850 | \n",
" 0.5768 | \n",
" 0.0082 | \n",
"
\n",
" \n",
" 1 | \n",
" 33 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5850 | \n",
" 0.5768 | \n",
" 0.0082 | \n",
"
\n",
" \n",
" 2 | \n",
" 32 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5827 | \n",
" 0.5732 | \n",
" 0.0095 | \n",
"
\n",
" \n",
" 3 | \n",
" 26 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5827 | \n",
" 0.5732 | \n",
" 0.0095 | \n",
"
\n",
" \n",
" 4 | \n",
" 49 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5827 | \n",
" 0.5709 | \n",
" 0.0119 | \n",
"
\n",
" \n",
" 5 | \n",
" 55 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5827 | \n",
" 0.5709 | \n",
" 0.0119 | \n",
"
\n",
" \n",
" 6 | \n",
" 36 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.1 | \n",
" 0.20 | \n",
" 0.5820 | \n",
" 0.5717 | \n",
" 0.0104 | \n",
"
\n",
" \n",
" 7 | \n",
" 30 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.1 | \n",
" 0.20 | \n",
" 0.5820 | \n",
" 0.5717 | \n",
" 0.0104 | \n",
"
\n",
" \n",
" 8 | \n",
" 29 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.1 | \n",
" 0.15 | \n",
" 0.5815 | \n",
" 0.5719 | \n",
" 0.0096 | \n",
"
\n",
" \n",
" 9 | \n",
" 35 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.1 | \n",
" 0.15 | \n",
" 0.5815 | \n",
" 0.5719 | \n",
" 0.0096 | \n",
"
\n",
" \n",
" 10 | \n",
" 51 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5810 | \n",
" 0.5694 | \n",
" 0.0116 | \n",
"
\n",
" \n",
" 11 | \n",
" 57 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5810 | \n",
" 0.5694 | \n",
" 0.0116 | \n",
"
\n",
" \n",
" 12 | \n",
" 68 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5806 | \n",
" 0.5668 | \n",
" 0.0138 | \n",
"
\n",
" \n",
" 13 | \n",
" 3 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5797 | \n",
" 0.5709 | \n",
" 0.0088 | \n",
"
\n",
" \n",
" 14 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5797 | \n",
" 0.5709 | \n",
" 0.0088 | \n",
"
\n",
" \n",
" 15 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5791 | \n",
" 0.5669 | \n",
" 0.0123 | \n",
"
\n",
" \n",
" 16 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5791 | \n",
" 0.5669 | \n",
" 0.0123 | \n",
"
\n",
" \n",
" 17 | \n",
" 61 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5790 | \n",
" 0.5664 | \n",
" 0.0126 | \n",
"
\n",
" \n",
" 18 | \n",
" 31 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5788 | \n",
" 0.5695 | \n",
" 0.0093 | \n",
"
\n",
" \n",
" 19 | \n",
" 25 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5788 | \n",
" 0.5695 | \n",
" 0.0093 | \n",
"
\n",
" \n",
" 20 | \n",
" 77 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.1 | \n",
" 0.15 | \n",
" 0.5785 | \n",
" 0.5733 | \n",
" 0.0051 | \n",
"
\n",
" \n",
" 21 | \n",
" 83 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.1 | \n",
" 0.15 | \n",
" 0.5785 | \n",
" 0.5733 | \n",
" 0.0051 | \n",
"
\n",
" \n",
" 22 | \n",
" 50 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5779 | \n",
" 0.5670 | \n",
" 0.0109 | \n",
"
\n",
" \n",
" 23 | \n",
" 56 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5779 | \n",
" 0.5670 | \n",
" 0.0109 | \n",
"
\n",
" \n",
" 24 | \n",
" 62 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5777 | \n",
" 0.5622 | \n",
" 0.0155 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio weight_decay dropout dev test \\\n",
"0 27 0.0001 8 5 adam plateau 0.05 0.0 0.20 0.5850 0.5768 \n",
"1 33 0.0001 8 5 adam plateau 0.10 0.0 0.20 0.5850 0.5768 \n",
"2 32 0.0001 8 5 adam plateau 0.10 0.0 0.15 0.5827 0.5732 \n",
"3 26 0.0001 8 5 adam plateau 0.05 0.0 0.15 0.5827 0.5732 \n",
"4 49 0.0001 16 4 adam plateau 0.05 0.0 0.10 0.5827 0.5709 \n",
"5 55 0.0001 16 4 adam plateau 0.10 0.0 0.10 0.5827 0.5709 \n",
"6 36 0.0001 8 5 adam plateau 0.10 0.1 0.20 0.5820 0.5717 \n",
"7 30 0.0001 8 5 adam plateau 0.05 0.1 0.20 0.5820 0.5717 \n",
"8 29 0.0001 8 5 adam plateau 0.05 0.1 0.15 0.5815 0.5719 \n",
"9 35 0.0001 8 5 adam plateau 0.10 0.1 0.15 0.5815 0.5719 \n",
"10 51 0.0001 16 4 adam plateau 0.05 0.0 0.20 0.5810 0.5694 \n",
"11 57 0.0001 16 4 adam plateau 0.10 0.0 0.20 0.5810 0.5694 \n",
"12 68 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.15 0.5806 0.5668 \n",
"13 3 0.0001 8 4 adam plateau 0.05 0.0 0.20 0.5797 0.5709 \n",
"14 9 0.0001 8 4 adam plateau 0.10 0.0 0.20 0.5797 0.5709 \n",
"15 8 0.0001 8 4 adam plateau 0.10 0.0 0.15 0.5791 0.5669 \n",
"16 2 0.0001 8 4 adam plateau 0.05 0.0 0.15 0.5791 0.5669 \n",
"17 61 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5790 0.5664 \n",
"18 31 0.0001 8 5 adam plateau 0.10 0.0 0.10 0.5788 0.5695 \n",
"19 25 0.0001 8 5 adam plateau 0.05 0.0 0.10 0.5788 0.5695 \n",
"20 77 0.0001 16 5 adam plateau 0.05 0.1 0.15 0.5785 0.5733 \n",
"21 83 0.0001 16 5 adam plateau 0.10 0.1 0.15 0.5785 0.5733 \n",
"22 50 0.0001 16 4 adam plateau 0.05 0.0 0.15 0.5779 0.5670 \n",
"23 56 0.0001 16 4 adam plateau 0.10 0.0 0.15 0.5779 0.5670 \n",
"24 62 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.15 0.5777 0.5622 \n",
"\n",
" gap \n",
"0 0.0082 \n",
"1 0.0082 \n",
"2 0.0095 \n",
"3 0.0095 \n",
"4 0.0119 \n",
"5 0.0119 \n",
"6 0.0104 \n",
"7 0.0104 \n",
"8 0.0096 \n",
"9 0.0096 \n",
"10 0.0116 \n",
"11 0.0116 \n",
"12 0.0138 \n",
"13 0.0088 \n",
"14 0.0088 \n",
"15 0.0123 \n",
"16 0.0123 \n",
"17 0.0126 \n",
"18 0.0093 \n",
"19 0.0093 \n",
"20 0.0051 \n",
"21 0.0051 \n",
"22 0.0109 \n",
"23 0.0109 \n",
"24 0.0155 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display\n",
"\n",
"# Let pandas render every column and use a 160-character wide layout.\n",
"pd.set_option(\"display.width\", 160)\n",
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"# Parse the 60.txt run log into a DataFrame (one row per step with its\n",
"# hyperparameters and dev/test/gap metrics) and show the first 25 rows.\n",
"# NOTE(review): top_n=25 presumably caps the result at 25 rows - confirm in parse_smart_log.\n",
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/60.txt\", top_n=25)\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7f6b722c-c134-45ec-9cf0-b4b4f8eb0c3f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" weight_decay | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 43 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5935 | \n",
" 0.5771 | \n",
" 0.0164 | \n",
"
\n",
" \n",
" 1 | \n",
" 91 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5902 | \n",
" 0.5680 | \n",
" 0.0221 | \n",
"
\n",
" \n",
" 2 | \n",
" 73 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5900 | \n",
" 0.5675 | \n",
" 0.0224 | \n",
"
\n",
" \n",
" 3 | \n",
" 79 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5900 | \n",
" 0.5675 | \n",
" 0.0224 | \n",
"
\n",
" \n",
" 4 | \n",
" 81 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5858 | \n",
" 0.5720 | \n",
" 0.0138 | \n",
"
\n",
" \n",
" 5 | \n",
" 75 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5858 | \n",
" 0.5720 | \n",
" 0.0138 | \n",
"
\n",
" \n",
" 6 | \n",
" 80 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5856 | \n",
" 0.5714 | \n",
" 0.0141 | \n",
"
\n",
" \n",
" 7 | \n",
" 74 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5856 | \n",
" 0.5714 | \n",
" 0.0141 | \n",
"
\n",
" \n",
" 8 | \n",
" 85 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5848 | \n",
" 0.5637 | \n",
" 0.0211 | \n",
"
\n",
" \n",
" 9 | \n",
" 25 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5844 | \n",
" 0.5690 | \n",
" 0.0154 | \n",
"
\n",
" \n",
" 10 | \n",
" 31 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5844 | \n",
" 0.5690 | \n",
" 0.0154 | \n",
"
\n",
" \n",
" 11 | \n",
" 87 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5837 | \n",
" 0.5640 | \n",
" 0.0197 | \n",
"
\n",
" \n",
" 12 | \n",
" 93 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5834 | \n",
" 0.5653 | \n",
" 0.0180 | \n",
"
\n",
" \n",
" 13 | \n",
" 86 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5832 | \n",
" 0.5644 | \n",
" 0.0189 | \n",
"
\n",
" \n",
" 14 | \n",
" 92 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5819 | \n",
" 0.5640 | \n",
" 0.0180 | \n",
"
\n",
" \n",
" 15 | \n",
" 39 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5814 | \n",
" 0.5729 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 16 | \n",
" 45 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5812 | \n",
" 0.5751 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 17 | \n",
" 37 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5807 | \n",
" 0.5717 | \n",
" 0.0090 | \n",
"
\n",
" \n",
" 18 | \n",
" 32 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5802 | \n",
" 0.5652 | \n",
" 0.0150 | \n",
"
\n",
" \n",
" 19 | \n",
" 26 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5802 | \n",
" 0.5652 | \n",
" 0.0150 | \n",
"
\n",
" \n",
" 20 | \n",
" 27 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5793 | \n",
" 0.5633 | \n",
" 0.0160 | \n",
"
\n",
" \n",
" 21 | \n",
" 33 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5793 | \n",
" 0.5633 | \n",
" 0.0160 | \n",
"
\n",
" \n",
" 22 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5783 | \n",
" 0.5646 | \n",
" 0.0137 | \n",
"
\n",
" \n",
" 23 | \n",
" 3 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5783 | \n",
" 0.5646 | \n",
" 0.0137 | \n",
"
\n",
" \n",
" 24 | \n",
" 49 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5783 | \n",
" 0.5619 | \n",
" 0.0164 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio weight_decay dropout dev test \\\n",
"0 43 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.10 0.5935 0.5771 \n",
"1 91 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.10 0.5902 0.5680 \n",
"2 73 0.0001 16 5 adam plateau 0.05 0.0 0.10 0.5900 0.5675 \n",
"3 79 0.0001 16 5 adam plateau 0.10 0.0 0.10 0.5900 0.5675 \n",
"4 81 0.0001 16 5 adam plateau 0.10 0.0 0.20 0.5858 0.5720 \n",
"5 75 0.0001 16 5 adam plateau 0.05 0.0 0.20 0.5858 0.5720 \n",
"6 80 0.0001 16 5 adam plateau 0.10 0.0 0.15 0.5856 0.5714 \n",
"7 74 0.0001 16 5 adam plateau 0.05 0.0 0.15 0.5856 0.5714 \n",
"8 85 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5848 0.5637 \n",
"9 25 0.0001 8 5 adam plateau 0.05 0.0 0.10 0.5844 0.5690 \n",
"10 31 0.0001 8 5 adam plateau 0.10 0.0 0.10 0.5844 0.5690 \n",
"11 87 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5837 0.5640 \n",
"12 93 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5834 0.5653 \n",
"13 86 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.15 0.5832 0.5644 \n",
"14 92 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.15 0.5819 0.5640 \n",
"15 39 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5814 0.5729 \n",
"16 45 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5812 0.5751 \n",
"17 37 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5807 0.5717 \n",
"18 32 0.0001 8 5 adam plateau 0.10 0.0 0.15 0.5802 0.5652 \n",
"19 26 0.0001 8 5 adam plateau 0.05 0.0 0.15 0.5802 0.5652 \n",
"20 27 0.0001 8 5 adam plateau 0.05 0.0 0.20 0.5793 0.5633 \n",
"21 33 0.0001 8 5 adam plateau 0.10 0.0 0.20 0.5793 0.5633 \n",
"22 9 0.0001 8 4 adam plateau 0.10 0.0 0.20 0.5783 0.5646 \n",
"23 3 0.0001 8 4 adam plateau 0.05 0.0 0.20 0.5783 0.5646 \n",
"24 49 0.0001 16 4 adam plateau 0.05 0.0 0.10 0.5783 0.5619 \n",
"\n",
" gap \n",
"0 0.0164 \n",
"1 0.0221 \n",
"2 0.0224 \n",
"3 0.0224 \n",
"4 0.0138 \n",
"5 0.0138 \n",
"6 0.0141 \n",
"7 0.0141 \n",
"8 0.0211 \n",
"9 0.0154 \n",
"10 0.0154 \n",
"11 0.0197 \n",
"12 0.0180 \n",
"13 0.0189 \n",
"14 0.0180 \n",
"15 0.0085 \n",
"16 0.0060 \n",
"17 0.0090 \n",
"18 0.0150 \n",
"19 0.0150 \n",
"20 0.0160 \n",
"21 0.0160 \n",
"22 0.0137 \n",
"23 0.0137 \n",
"24 0.0164 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display\n",
"\n",
"# Let pandas render every column and use a 160-character wide layout.\n",
"pd.set_option(\"display.width\", 160)\n",
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"# Parse the 70.txt run log into a DataFrame (one row per step with its\n",
"# hyperparameters and dev/test/gap metrics) and show the first 25 rows.\n",
"# NOTE(review): top_n=25 presumably caps the result at 25 rows - confirm in parse_smart_log.\n",
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/70.txt\", top_n=25)\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1d4db04e-ee0f-4c2c-b0d2-45edbf2128dd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" weight_decay | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5735 | \n",
" 0.5675 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 1 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5735 | \n",
" 0.5675 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5734 | \n",
" 0.5641 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5734 | \n",
" 0.5641 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 4 | \n",
" 75 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5723 | \n",
" 0.5708 | \n",
" 0.0015 | \n",
"
\n",
" \n",
" 5 | \n",
" 81 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5723 | \n",
" 0.5708 | \n",
" 0.0015 | \n",
"
\n",
" \n",
" 6 | \n",
" 20 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5715 | \n",
" 0.5687 | \n",
" 0.0028 | \n",
"
\n",
" \n",
" 7 | \n",
" 7 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5712 | \n",
" 0.5706 | \n",
" 0.0006 | \n",
"
\n",
" \n",
" 8 | \n",
" 1 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5712 | \n",
" 0.5706 | \n",
" 0.0006 | \n",
"
\n",
" \n",
" 9 | \n",
" 74 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5711 | \n",
" 0.5679 | \n",
" 0.0032 | \n",
"
\n",
" \n",
" 10 | \n",
" 80 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5711 | \n",
" 0.5679 | \n",
" 0.0032 | \n",
"
\n",
" \n",
" 11 | \n",
" 63 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5704 | \n",
" 0.5612 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 12 | \n",
" 33 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5703 | \n",
" 0.5659 | \n",
" 0.0044 | \n",
"
\n",
" \n",
" 13 | \n",
" 27 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5703 | \n",
" 0.5659 | \n",
" 0.0044 | \n",
"
\n",
" \n",
" 14 | \n",
" 45 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5699 | \n",
" 0.5607 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 15 | \n",
" 39 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5697 | \n",
" 0.5687 | \n",
" 0.0010 | \n",
"
\n",
" \n",
" 16 | \n",
" 21 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5692 | \n",
" 0.5540 | \n",
" 0.0152 | \n",
"
\n",
" \n",
" 17 | \n",
" 50 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5687 | \n",
" 0.5638 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 18 | \n",
" 56 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5687 | \n",
" 0.5638 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 19 | \n",
" 85 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5686 | \n",
" 0.5670 | \n",
" 0.0016 | \n",
"
\n",
" \n",
" 20 | \n",
" 15 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5685 | \n",
" 0.5629 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 21 | \n",
" 68 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5676 | \n",
" 0.5617 | \n",
" 0.0059 | \n",
"
\n",
" \n",
" 22 | \n",
" 92 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5674 | \n",
" 0.5638 | \n",
" 0.0036 | \n",
"
\n",
" \n",
" 23 | \n",
" 14 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5673 | \n",
" 0.5661 | \n",
" 0.0011 | \n",
"
\n",
" \n",
" 24 | \n",
" 51 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5671 | \n",
" 0.5633 | \n",
" 0.0038 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio weight_decay dropout dev test \\\n",
"0 2 0.0001 8 4 adam plateau 0.05 0.0 0.15 0.5735 0.5675 \n",
"1 8 0.0001 8 4 adam plateau 0.10 0.0 0.15 0.5735 0.5675 \n",
"2 3 0.0001 8 4 adam plateau 0.05 0.0 0.20 0.5734 0.5641 \n",
"3 9 0.0001 8 4 adam plateau 0.10 0.0 0.20 0.5734 0.5641 \n",
"4 75 0.0001 16 5 adam plateau 0.05 0.0 0.20 0.5723 0.5708 \n",
"5 81 0.0001 16 5 adam plateau 0.10 0.0 0.20 0.5723 0.5708 \n",
"6 20 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.15 0.5715 0.5687 \n",
"7 7 0.0001 8 4 adam plateau 0.10 0.0 0.10 0.5712 0.5706 \n",
"8 1 0.0001 8 4 adam plateau 0.05 0.0 0.10 0.5712 0.5706 \n",
"9 74 0.0001 16 5 adam plateau 0.05 0.0 0.15 0.5711 0.5679 \n",
"10 80 0.0001 16 5 adam plateau 0.10 0.0 0.15 0.5711 0.5679 \n",
"11 63 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5704 0.5612 \n",
"12 33 0.0001 8 5 adam plateau 0.10 0.0 0.20 0.5703 0.5659 \n",
"13 27 0.0001 8 5 adam plateau 0.05 0.0 0.20 0.5703 0.5659 \n",
"14 45 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5699 0.5607 \n",
"15 39 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5697 0.5687 \n",
"16 21 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5692 0.5540 \n",
"17 50 0.0001 16 4 adam plateau 0.05 0.0 0.15 0.5687 0.5638 \n",
"18 56 0.0001 16 4 adam plateau 0.10 0.0 0.15 0.5687 0.5638 \n",
"19 85 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5686 0.5670 \n",
"20 15 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5685 0.5629 \n",
"21 68 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.15 0.5676 0.5617 \n",
"22 92 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.15 0.5674 0.5638 \n",
"23 14 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.15 0.5673 0.5661 \n",
"24 51 0.0001 16 4 adam plateau 0.05 0.0 0.20 0.5671 0.5633 \n",
"\n",
" gap \n",
"0 0.0060 \n",
"1 0.0060 \n",
"2 0.0094 \n",
"3 0.0094 \n",
"4 0.0015 \n",
"5 0.0015 \n",
"6 0.0028 \n",
"7 0.0006 \n",
"8 0.0006 \n",
"9 0.0032 \n",
"10 0.0032 \n",
"11 0.0092 \n",
"12 0.0044 \n",
"13 0.0044 \n",
"14 0.0092 \n",
"15 0.0010 \n",
"16 0.0152 \n",
"17 0.0048 \n",
"18 0.0048 \n",
"19 0.0016 \n",
"20 0.0056 \n",
"21 0.0059 \n",
"22 0.0036 \n",
"23 0.0011 \n",
"24 0.0038 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display\n",
"\n",
"# Let pandas render every column and use a 160-character wide layout.\n",
"pd.set_option(\"display.width\", 160)\n",
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"# Parse the 80.txt run log into a DataFrame (one row per step with its\n",
"# hyperparameters and dev/test/gap metrics) and show the first 25 rows.\n",
"# NOTE(review): top_n=25 presumably caps the result at 25 rows - confirm in parse_smart_log.\n",
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/80.txt\", top_n=25)\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ff64fbf5-cea1-4ee1-b7dc-d415ed7de9e1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" weight_decay | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 20 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5881 | \n",
" 0.5713 | \n",
" 0.0168 | \n",
"
\n",
" \n",
" 1 | \n",
" 57 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5846 | \n",
" 0.5672 | \n",
" 0.0174 | \n",
"
\n",
" \n",
" 2 | \n",
" 51 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5846 | \n",
" 0.5672 | \n",
" 0.0174 | \n",
"
\n",
" \n",
" 3 | \n",
" 14 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5833 | \n",
" 0.5661 | \n",
" 0.0172 | \n",
"
\n",
" \n",
" 4 | \n",
" 21 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5826 | \n",
" 0.5699 | \n",
" 0.0127 | \n",
"
\n",
" \n",
" 5 | \n",
" 15 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5819 | \n",
" 0.5640 | \n",
" 0.0179 | \n",
"
\n",
" \n",
" 6 | \n",
" 13 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5806 | \n",
" 0.5587 | \n",
" 0.0219 | \n",
"
\n",
" \n",
" 7 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5800 | \n",
" 0.5686 | \n",
" 0.0113 | \n",
"
\n",
" \n",
" 8 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5800 | \n",
" 0.5686 | \n",
" 0.0113 | \n",
"
\n",
" \n",
" 9 | \n",
" 7 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5795 | \n",
" 0.5693 | \n",
" 0.0103 | \n",
"
\n",
" \n",
" 10 | \n",
" 1 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5795 | \n",
" 0.5693 | \n",
" 0.0103 | \n",
"
\n",
" \n",
" 11 | \n",
" 50 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5787 | \n",
" 0.5662 | \n",
" 0.0125 | \n",
"
\n",
" \n",
" 12 | \n",
" 56 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5787 | \n",
" 0.5662 | \n",
" 0.0125 | \n",
"
\n",
" \n",
" 13 | \n",
" 45 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5775 | \n",
" 0.5637 | \n",
" 0.0138 | \n",
"
\n",
" \n",
" 14 | \n",
" 55 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5766 | \n",
" 0.5717 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 15 | \n",
" 49 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5766 | \n",
" 0.5717 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 16 | \n",
" 61 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5760 | \n",
" 0.5620 | \n",
" 0.0140 | \n",
"
\n",
" \n",
" 17 | \n",
" 81 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5749 | \n",
" 0.5684 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 18 | \n",
" 75 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5749 | \n",
" 0.5684 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 19 | \n",
" 31 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5745 | \n",
" 0.5659 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 20 | \n",
" 25 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5745 | \n",
" 0.5659 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 21 | \n",
" 37 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5745 | \n",
" 0.5712 | \n",
" 0.0034 | \n",
"
\n",
" \n",
" 22 | \n",
" 44 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5740 | \n",
" 0.5744 | \n",
" -0.0004 | \n",
"
\n",
" \n",
" 23 | \n",
" 19 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5736 | \n",
" 0.5606 | \n",
" 0.0130 | \n",
"
\n",
" \n",
" 24 | \n",
" 63 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5734 | \n",
" 0.5624 | \n",
" 0.0110 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio weight_decay dropout dev test \\\n",
"0 20 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.15 0.5881 0.5713 \n",
"1 57 0.0001 16 4 adam plateau 0.10 0.0 0.20 0.5846 0.5672 \n",
"2 51 0.0001 16 4 adam plateau 0.05 0.0 0.20 0.5846 0.5672 \n",
"3 14 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.15 0.5833 0.5661 \n",
"4 21 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5826 0.5699 \n",
"5 15 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5819 0.5640 \n",
"6 13 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5806 0.5587 \n",
"7 2 0.0001 8 4 adam plateau 0.05 0.0 0.15 0.5800 0.5686 \n",
"8 8 0.0001 8 4 adam plateau 0.10 0.0 0.15 0.5800 0.5686 \n",
"9 7 0.0001 8 4 adam plateau 0.10 0.0 0.10 0.5795 0.5693 \n",
"10 1 0.0001 8 4 adam plateau 0.05 0.0 0.10 0.5795 0.5693 \n",
"11 50 0.0001 16 4 adam plateau 0.05 0.0 0.15 0.5787 0.5662 \n",
"12 56 0.0001 16 4 adam plateau 0.10 0.0 0.15 0.5787 0.5662 \n",
"13 45 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5775 0.5637 \n",
"14 55 0.0001 16 4 adam plateau 0.10 0.0 0.10 0.5766 0.5717 \n",
"15 49 0.0001 16 4 adam plateau 0.05 0.0 0.10 0.5766 0.5717 \n",
"16 61 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5760 0.5620 \n",
"17 81 0.0001 16 5 adam plateau 0.10 0.0 0.20 0.5749 0.5684 \n",
"18 75 0.0001 16 5 adam plateau 0.05 0.0 0.20 0.5749 0.5684 \n",
"19 31 0.0001 8 5 adam plateau 0.10 0.0 0.10 0.5745 0.5659 \n",
"20 25 0.0001 8 5 adam plateau 0.05 0.0 0.10 0.5745 0.5659 \n",
"21 37 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5745 0.5712 \n",
"22 44 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.15 0.5740 0.5744 \n",
"23 19 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.10 0.5736 0.5606 \n",
"24 63 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5734 0.5624 \n",
"\n",
" gap \n",
"0 0.0168 \n",
"1 0.0174 \n",
"2 0.0174 \n",
"3 0.0172 \n",
"4 0.0127 \n",
"5 0.0179 \n",
"6 0.0219 \n",
"7 0.0113 \n",
"8 0.0113 \n",
"9 0.0103 \n",
"10 0.0103 \n",
"11 0.0125 \n",
"12 0.0125 \n",
"13 0.0138 \n",
"14 0.0049 \n",
"15 0.0049 \n",
"16 0.0140 \n",
"17 0.0066 \n",
"18 0.0066 \n",
"19 0.0086 \n",
"20 0.0086 \n",
"21 0.0034 \n",
"22 -0.0004 \n",
"23 0.0130 \n",
"24 0.0110 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display\n",
"\n",
"# Let pandas render every column and use a 160-character wide layout.\n",
"pd.set_option(\"display.width\", 160)\n",
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"# Parse the 90.txt run log into a DataFrame (one row per step with its\n",
"# hyperparameters and dev/test/gap metrics) and show the first 25 rows.\n",
"# NOTE(review): top_n=25 presumably caps the result at 25 rows - confirm in parse_smart_log.\n",
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/90.txt\", top_n=25)\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "00b62770-f38f-405c-9c5c-630d4afd7d26",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" weight_decay | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 49 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5860 | \n",
" 0.5702 | \n",
" 0.0158 | \n",
"
\n",
" \n",
" 1 | \n",
" 55 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5860 | \n",
" 0.5702 | \n",
" 0.0158 | \n",
"
\n",
" \n",
" 2 | \n",
" 56 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5827 | \n",
" 0.5771 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 3 | \n",
" 50 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5827 | \n",
" 0.5771 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 4 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5816 | \n",
" 0.5781 | \n",
" 0.0034 | \n",
"
\n",
" \n",
" 5 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5816 | \n",
" 0.5781 | \n",
" 0.0034 | \n",
"
\n",
" \n",
" 6 | \n",
" 51 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5805 | \n",
" 0.5720 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 7 | \n",
" 57 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5805 | \n",
" 0.5720 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 8 | \n",
" 7 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5793 | \n",
" 0.5715 | \n",
" 0.0079 | \n",
"
\n",
" \n",
" 9 | \n",
" 1 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5793 | \n",
" 0.5715 | \n",
" 0.0079 | \n",
"
\n",
" \n",
" 10 | \n",
" 79 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5789 | \n",
" 0.5692 | \n",
" 0.0097 | \n",
"
\n",
" \n",
" 11 | \n",
" 73 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5789 | \n",
" 0.5692 | \n",
" 0.0097 | \n",
"
\n",
" \n",
" 12 | \n",
" 3 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5782 | \n",
" 0.5734 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 13 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5782 | \n",
" 0.5734 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 14 | \n",
" 45 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5780 | \n",
" 0.5777 | \n",
" 0.0003 | \n",
"
\n",
" \n",
" 15 | \n",
" 39 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5778 | \n",
" 0.5751 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 16 | \n",
" 63 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5767 | \n",
" 0.5718 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 17 | \n",
" 62 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5766 | \n",
" 0.5708 | \n",
" 0.0059 | \n",
"
\n",
" \n",
" 18 | \n",
" 74 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5756 | \n",
" 0.5675 | \n",
" 0.0080 | \n",
"
\n",
" \n",
" 19 | \n",
" 80 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5756 | \n",
" 0.5675 | \n",
" 0.0080 | \n",
"
\n",
" \n",
" 20 | \n",
" 87 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5754 | \n",
" 0.5685 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 21 | \n",
" 93 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.20 | \n",
" 0.5752 | \n",
" 0.5744 | \n",
" 0.0009 | \n",
"
\n",
" \n",
" 22 | \n",
" 67 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5740 | \n",
" 0.5694 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 23 | \n",
" 85 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.10 | \n",
" 0.5737 | \n",
" 0.5707 | \n",
" 0.0030 | \n",
"
\n",
" \n",
" 24 | \n",
" 14 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.0 | \n",
" 0.15 | \n",
" 0.5736 | \n",
" 0.5565 | \n",
" 0.0171 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio weight_decay dropout dev test \\\n",
"0 49 0.0001 16 4 adam plateau 0.05 0.0 0.10 0.5860 0.5702 \n",
"1 55 0.0001 16 4 adam plateau 0.10 0.0 0.10 0.5860 0.5702 \n",
"2 56 0.0001 16 4 adam plateau 0.10 0.0 0.15 0.5827 0.5771 \n",
"3 50 0.0001 16 4 adam plateau 0.05 0.0 0.15 0.5827 0.5771 \n",
"4 2 0.0001 8 4 adam plateau 0.05 0.0 0.15 0.5816 0.5781 \n",
"5 8 0.0001 8 4 adam plateau 0.10 0.0 0.15 0.5816 0.5781 \n",
"6 51 0.0001 16 4 adam plateau 0.05 0.0 0.20 0.5805 0.5720 \n",
"7 57 0.0001 16 4 adam plateau 0.10 0.0 0.20 0.5805 0.5720 \n",
"8 7 0.0001 8 4 adam plateau 0.10 0.0 0.10 0.5793 0.5715 \n",
"9 1 0.0001 8 4 adam plateau 0.05 0.0 0.10 0.5793 0.5715 \n",
"10 79 0.0001 16 5 adam plateau 0.10 0.0 0.10 0.5789 0.5692 \n",
"11 73 0.0001 16 5 adam plateau 0.05 0.0 0.10 0.5789 0.5692 \n",
"12 3 0.0001 8 4 adam plateau 0.05 0.0 0.20 0.5782 0.5734 \n",
"13 9 0.0001 8 4 adam plateau 0.10 0.0 0.20 0.5782 0.5734 \n",
"14 45 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5780 0.5777 \n",
"15 39 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5778 0.5751 \n",
"16 63 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5767 0.5718 \n",
"17 62 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.15 0.5766 0.5708 \n",
"18 74 0.0001 16 5 adam plateau 0.05 0.0 0.15 0.5756 0.5675 \n",
"19 80 0.0001 16 5 adam plateau 0.10 0.0 0.15 0.5756 0.5675 \n",
"20 87 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.20 0.5754 0.5685 \n",
"21 93 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.0 0.20 0.5752 0.5744 \n",
"22 67 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.0 0.10 0.5740 0.5694 \n",
"23 85 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.0 0.10 0.5737 0.5707 \n",
"24 14 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.0 0.15 0.5736 0.5565 \n",
"\n",
" gap \n",
"0 0.0158 \n",
"1 0.0158 \n",
"2 0.0056 \n",
"3 0.0056 \n",
"4 0.0034 \n",
"5 0.0034 \n",
"6 0.0085 \n",
"7 0.0085 \n",
"8 0.0079 \n",
"9 0.0079 \n",
"10 0.0097 \n",
"11 0.0097 \n",
"12 0.0048 \n",
"13 0.0048 \n",
"14 0.0003 \n",
"15 0.0026 \n",
"16 0.0048 \n",
"17 0.0059 \n",
"18 0.0080 \n",
"19 0.0080 \n",
"20 0.0069 \n",
"21 0.0009 \n",
"22 0.0045 \n",
"23 0.0030 \n",
"24 0.0171 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/100.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb9186a9-f0bc-406d-b2c3-724d7b5f9d43",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f9fbc1c-ab41-4dca-bbad-6a0eadd28f7f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "75806125-04ac-4e18-968e-4632b92d1d16",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ac6d74a-3625-4c46-b1a1-7d70a7ef5446",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "99e64f6f-eeb4-4f1f-9bd0-b2cb230dc1da",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"id": "467b04df-a408-4808-941d-5b0f2ebbf217",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5710 | \n",
" 0.5575 | \n",
" 0.0135 | \n",
"
\n",
" \n",
" 1 | \n",
" 14 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5707 | \n",
" 0.5670 | \n",
" 0.0037 | \n",
"
\n",
" \n",
" 2 | \n",
" 13 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5698 | \n",
" 0.5650 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5691 | \n",
" 0.5565 | \n",
" 0.0126 | \n",
"
\n",
" \n",
" 4 | \n",
" 11 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5691 | \n",
" 0.5565 | \n",
" 0.0126 | \n",
"
\n",
" \n",
" 5 | \n",
" 27 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5690 | \n",
" 0.5678 | \n",
" 0.0011 | \n",
"
\n",
" \n",
" 6 | \n",
" 25 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5690 | \n",
" 0.5678 | \n",
" 0.0011 | \n",
"
\n",
" \n",
" 7 | \n",
" 7 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5671 | \n",
" 0.5574 | \n",
" 0.0097 | \n",
"
\n",
" \n",
" 8 | \n",
" 32 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5664 | \n",
" 0.5634 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 9 | \n",
" 24 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5663 | \n",
" 0.5629 | \n",
" 0.0034 | \n",
"
\n",
" \n",
" 10 | \n",
" 29 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5662 | \n",
" 0.5590 | \n",
" 0.0072 | \n",
"
\n",
" \n",
" 11 | \n",
" 31 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5661 | \n",
" 0.5605 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 12 | \n",
" 28 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5656 | \n",
" 0.5668 | \n",
" -0.0012 | \n",
"
\n",
" \n",
" 13 | \n",
" 26 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5656 | \n",
" 0.5668 | \n",
" -0.0012 | \n",
"
\n",
" \n",
" 14 | \n",
" 16 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5646 | \n",
" 0.5552 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 15 | \n",
" 15 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5640 | \n",
" 0.5596 | \n",
" 0.0044 | \n",
"
\n",
" \n",
" 16 | \n",
" 30 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5638 | \n",
" 0.5588 | \n",
" 0.0050 | \n",
"
\n",
" \n",
" 17 | \n",
" 12 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5637 | \n",
" 0.5534 | \n",
" 0.0102 | \n",
"
\n",
" \n",
" 18 | \n",
" 10 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5637 | \n",
" 0.5534 | \n",
" 0.0102 | \n",
"
\n",
" \n",
" 19 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5632 | \n",
" 0.5561 | \n",
" 0.0072 | \n",
"
\n",
" \n",
" 20 | \n",
" 21 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5621 | \n",
" 0.5522 | \n",
" 0.0099 | \n",
"
\n",
" \n",
" 21 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5615 | \n",
" 0.5515 | \n",
" 0.0100 | \n",
"
\n",
" \n",
" 22 | \n",
" 4 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5615 | \n",
" 0.5515 | \n",
" 0.0100 | \n",
"
\n",
" \n",
" 23 | \n",
" 6 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5613 | \n",
" 0.5513 | \n",
" 0.0100 | \n",
"
\n",
" \n",
" 24 | \n",
" 3 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5606 | \n",
" 0.5525 | \n",
" 0.0081 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dropout dev test gap\n",
"0 5 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5710 0.5575 0.0135\n",
"1 14 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5707 0.5670 0.0037\n",
"2 13 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.10 0.5698 0.5650 0.0048\n",
"3 9 0.0001 8 5 adam plateau 0.05 0.10 0.5691 0.5565 0.0126\n",
"4 11 0.0001 8 5 adam plateau 0.10 0.10 0.5691 0.5565 0.0126\n",
"5 27 0.0001 16 5 adam plateau 0.10 0.10 0.5690 0.5678 0.0011\n",
"6 25 0.0001 16 5 adam plateau 0.05 0.10 0.5690 0.5678 0.0011\n",
"7 7 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.10 0.5671 0.5574 0.0097\n",
"8 32 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5664 0.5634 0.0031\n",
"9 24 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.15 0.5663 0.5629 0.0034\n",
"10 29 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.10 0.5662 0.5590 0.0072\n",
"11 31 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.10 0.5661 0.5605 0.0056\n",
"12 28 0.0001 16 5 adam plateau 0.10 0.15 0.5656 0.5668 -0.0012\n",
"13 26 0.0001 16 5 adam plateau 0.05 0.15 0.5656 0.5668 -0.0012\n",
"14 16 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5646 0.5552 0.0094\n",
"15 15 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.10 0.5640 0.5596 0.0044\n",
"16 30 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5638 0.5588 0.0050\n",
"17 12 0.0001 8 5 adam plateau 0.10 0.15 0.5637 0.5534 0.0102\n",
"18 10 0.0001 8 5 adam plateau 0.05 0.15 0.5637 0.5534 0.0102\n",
"19 8 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.15 0.5632 0.5561 0.0072\n",
"20 21 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5621 0.5522 0.0099\n",
"21 2 0.0001 8 4 adam plateau 0.05 0.15 0.5615 0.5515 0.0100\n",
"22 4 0.0001 8 4 adam plateau 0.10 0.15 0.5615 0.5515 0.0100\n",
"23 6 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.15 0.5613 0.5513 0.0100\n",
"24 3 0.0001 8 4 adam plateau 0.10 0.10 0.5606 0.5525 0.0081"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/bi/10.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "efba3e34-f64b-4962-9c8b-ef70294badb1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5804 | \n",
" 0.5689 | \n",
" 0.0116 | \n",
"
\n",
" \n",
" 1 | \n",
" 21 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5791 | \n",
" 0.5709 | \n",
" 0.0082 | \n",
"
\n",
" \n",
" 2 | \n",
" 22 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5739 | \n",
" 0.5651 | \n",
" 0.0088 | \n",
"
\n",
" \n",
" 3 | \n",
" 18 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5737 | \n",
" 0.5679 | \n",
" 0.0058 | \n",
"
\n",
" \n",
" 4 | \n",
" 20 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5737 | \n",
" 0.5679 | \n",
" 0.0058 | \n",
"
\n",
" \n",
" 5 | \n",
" 15 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5731 | \n",
" 0.5638 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 6 | \n",
" 13 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5731 | \n",
" 0.5673 | \n",
" 0.0058 | \n",
"
\n",
" \n",
" 7 | \n",
" 5 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5730 | \n",
" 0.5689 | \n",
" 0.0041 | \n",
"
\n",
" \n",
" 8 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5724 | \n",
" 0.5631 | \n",
" 0.0093 | \n",
"
\n",
" \n",
" 9 | \n",
" 30 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5719 | \n",
" 0.5702 | \n",
" 0.0017 | \n",
"
\n",
" \n",
" 10 | \n",
" 4 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5717 | \n",
" 0.5662 | \n",
" 0.0055 | \n",
"
\n",
" \n",
" 11 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5717 | \n",
" 0.5662 | \n",
" 0.0055 | \n",
"
\n",
" \n",
" 12 | \n",
" 17 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5712 | \n",
" 0.5680 | \n",
" 0.0032 | \n",
"
\n",
" \n",
" 13 | \n",
" 26 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5712 | \n",
" 0.5626 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 14 | \n",
" 19 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5712 | \n",
" 0.5680 | \n",
" 0.0032 | \n",
"
\n",
" \n",
" 15 | \n",
" 28 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5712 | \n",
" 0.5626 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 16 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5708 | \n",
" 0.5533 | \n",
" 0.0175 | \n",
"
\n",
" \n",
" 17 | \n",
" 11 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5708 | \n",
" 0.5533 | \n",
" 0.0175 | \n",
"
\n",
" \n",
" 18 | \n",
" 23 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5706 | \n",
" 0.5706 | \n",
" -0.0001 | \n",
"
\n",
" \n",
" 19 | \n",
" 16 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5704 | \n",
" 0.5649 | \n",
" 0.0055 | \n",
"
\n",
" \n",
" 20 | \n",
" 6 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5697 | \n",
" 0.5657 | \n",
" 0.0041 | \n",
"
\n",
" \n",
" 21 | \n",
" 29 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5696 | \n",
" 0.5661 | \n",
" 0.0035 | \n",
"
\n",
" \n",
" 22 | \n",
" 25 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5695 | \n",
" 0.5604 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 23 | \n",
" 27 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5695 | \n",
" 0.5604 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 24 | \n",
" 10 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5694 | \n",
" 0.5525 | \n",
" 0.0170 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dropout dev test gap\n",
"0 7 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.10 0.5804 0.5689 0.0116\n",
"1 21 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5791 0.5709 0.0082\n",
"2 22 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.15 0.5739 0.5651 0.0088\n",
"3 18 0.0001 16 4 adam plateau 0.05 0.15 0.5737 0.5679 0.0058\n",
"4 20 0.0001 16 4 adam plateau 0.10 0.15 0.5737 0.5679 0.0058\n",
"5 15 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.10 0.5731 0.5638 0.0092\n",
"6 13 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.10 0.5731 0.5673 0.0058\n",
"7 5 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5730 0.5689 0.0041\n",
"8 8 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.15 0.5724 0.5631 0.0093\n",
"9 30 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5719 0.5702 0.0017\n",
"10 4 0.0001 8 4 adam plateau 0.10 0.15 0.5717 0.5662 0.0055\n",
"11 2 0.0001 8 4 adam plateau 0.05 0.15 0.5717 0.5662 0.0055\n",
"12 17 0.0001 16 4 adam plateau 0.05 0.10 0.5712 0.5680 0.0032\n",
"13 26 0.0001 16 5 adam plateau 0.05 0.15 0.5712 0.5626 0.0086\n",
"14 19 0.0001 16 4 adam plateau 0.10 0.10 0.5712 0.5680 0.0032\n",
"15 28 0.0001 16 5 adam plateau 0.10 0.15 0.5712 0.5626 0.0086\n",
"16 9 0.0001 8 5 adam plateau 0.05 0.10 0.5708 0.5533 0.0175\n",
"17 11 0.0001 8 5 adam plateau 0.10 0.10 0.5708 0.5533 0.0175\n",
"18 23 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.10 0.5706 0.5706 -0.0001\n",
"19 16 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5704 0.5649 0.0055\n",
"20 6 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.15 0.5697 0.5657 0.0041\n",
"21 29 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.10 0.5696 0.5661 0.0035\n",
"22 25 0.0001 16 5 adam plateau 0.05 0.10 0.5695 0.5604 0.0092\n",
"23 27 0.0001 16 5 adam plateau 0.10 0.10 0.5695 0.5604 0.0092\n",
"24 10 0.0001 8 5 adam plateau 0.05 0.15 0.5694 0.5525 0.0170"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/bi/20.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "5c48bc72-cac8-4119-96ad-a1f09cfed996",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 20 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5696 | \n",
" 0.5631 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 1 | \n",
" 18 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5696 | \n",
" 0.5631 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 2 | \n",
" 19 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5654 | \n",
" 0.5636 | \n",
" 0.0017 | \n",
"
\n",
" \n",
" 3 | \n",
" 17 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5654 | \n",
" 0.5636 | \n",
" 0.0017 | \n",
"
\n",
" \n",
" 4 | \n",
" 26 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5619 | \n",
" 0.5594 | \n",
" 0.0025 | \n",
"
\n",
" \n",
" 5 | \n",
" 28 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5619 | \n",
" 0.5594 | \n",
" 0.0025 | \n",
"
\n",
" \n",
" 6 | \n",
" 11 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5617 | \n",
" 0.5549 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 7 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5617 | \n",
" 0.5549 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 8 | \n",
" 15 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5609 | \n",
" 0.5547 | \n",
" 0.0062 | \n",
"
\n",
" \n",
" 9 | \n",
" 27 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5607 | \n",
" 0.5584 | \n",
" 0.0023 | \n",
"
\n",
" \n",
" 10 | \n",
" 25 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5607 | \n",
" 0.5584 | \n",
" 0.0023 | \n",
"
\n",
" \n",
" 11 | \n",
" 30 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5601 | \n",
" 0.5603 | \n",
" -0.0001 | \n",
"
\n",
" \n",
" 12 | \n",
" 12 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5596 | \n",
" 0.5551 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 13 | \n",
" 10 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5596 | \n",
" 0.5551 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 14 | \n",
" 14 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5593 | \n",
" 0.5567 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 15 | \n",
" 1 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5589 | \n",
" 0.5553 | \n",
" 0.0036 | \n",
"
\n",
" \n",
" 16 | \n",
" 3 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5589 | \n",
" 0.5553 | \n",
" 0.0036 | \n",
"
\n",
" \n",
" 17 | \n",
" 16 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5588 | \n",
" 0.5547 | \n",
" 0.0041 | \n",
"
\n",
" \n",
" 18 | \n",
" 5 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5588 | \n",
" 0.5551 | \n",
" 0.0037 | \n",
"
\n",
" \n",
" 19 | \n",
" 13 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5586 | \n",
" 0.5520 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 20 | \n",
" 21 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5583 | \n",
" 0.5599 | \n",
" -0.0017 | \n",
"
\n",
" \n",
" 21 | \n",
" 32 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5578 | \n",
" 0.5591 | \n",
" -0.0012 | \n",
"
\n",
" \n",
" 22 | \n",
" 6 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5576 | \n",
" 0.5588 | \n",
" -0.0012 | \n",
"
\n",
" \n",
" 23 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5574 | \n",
" 0.5522 | \n",
" 0.0052 | \n",
"
\n",
" \n",
" 24 | \n",
" 31 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5566 | \n",
" 0.5523 | \n",
" 0.0043 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dropout dev test gap\n",
"0 20 0.0001 16 4 adam plateau 0.10 0.15 0.5696 0.5631 0.0065\n",
"1 18 0.0001 16 4 adam plateau 0.05 0.15 0.5696 0.5631 0.0065\n",
"2 19 0.0001 16 4 adam plateau 0.10 0.10 0.5654 0.5636 0.0017\n",
"3 17 0.0001 16 4 adam plateau 0.05 0.10 0.5654 0.5636 0.0017\n",
"4 26 0.0001 16 5 adam plateau 0.05 0.15 0.5619 0.5594 0.0025\n",
"5 28 0.0001 16 5 adam plateau 0.10 0.15 0.5619 0.5594 0.0025\n",
"6 11 0.0001 8 5 adam plateau 0.10 0.10 0.5617 0.5549 0.0069\n",
"7 9 0.0001 8 5 adam plateau 0.05 0.10 0.5617 0.5549 0.0069\n",
"8 15 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.10 0.5609 0.5547 0.0062\n",
"9 27 0.0001 16 5 adam plateau 0.10 0.10 0.5607 0.5584 0.0023\n",
"10 25 0.0001 16 5 adam plateau 0.05 0.10 0.5607 0.5584 0.0023\n",
"11 30 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5601 0.5603 -0.0001\n",
"12 12 0.0001 8 5 adam plateau 0.10 0.15 0.5596 0.5551 0.0045\n",
"13 10 0.0001 8 5 adam plateau 0.05 0.15 0.5596 0.5551 0.0045\n",
"14 14 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5593 0.5567 0.0026\n",
"15 1 0.0001 8 4 adam plateau 0.05 0.10 0.5589 0.5553 0.0036\n",
"16 3 0.0001 8 4 adam plateau 0.10 0.10 0.5589 0.5553 0.0036\n",
"17 16 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5588 0.5547 0.0041\n",
"18 5 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5588 0.5551 0.0037\n",
"19 13 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.10 0.5586 0.5520 0.0066\n",
"20 21 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5583 0.5599 -0.0017\n",
"21 32 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5578 0.5591 -0.0012\n",
"22 6 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.15 0.5576 0.5588 -0.0012\n",
"23 8 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.15 0.5574 0.5522 0.0052\n",
"24 31 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.10 0.5566 0.5523 0.0043"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/bi/30.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a93f7c97-3e3c-4e6f-8bcb-454480aae5f8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5822 | \n",
" 0.5671 | \n",
" 0.0151 | \n",
"
\n",
" \n",
" 1 | \n",
" 17 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5822 | \n",
" 0.5671 | \n",
" 0.0151 | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5729 | \n",
" 0.5643 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5729 | \n",
" 0.5643 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 4 | \n",
" 20 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5727 | \n",
" 0.5601 | \n",
" 0.0126 | \n",
"
\n",
" \n",
" 5 | \n",
" 18 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5727 | \n",
" 0.5601 | \n",
" 0.0126 | \n",
"
\n",
" \n",
" 6 | \n",
" 6 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5698 | \n",
" 0.5626 | \n",
" 0.0072 | \n",
"
\n",
" \n",
" 7 | \n",
" 8 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5697 | \n",
" 0.5623 | \n",
" 0.0074 | \n",
"
\n",
" \n",
" 8 | \n",
" 11 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5691 | \n",
" 0.5622 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 9 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5691 | \n",
" 0.5622 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 10 | \n",
" 22 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5681 | \n",
" 0.5641 | \n",
" 0.0039 | \n",
"
\n",
" \n",
" 11 | \n",
" 24 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5676 | \n",
" 0.5649 | \n",
" 0.0027 | \n",
"
\n",
" \n",
" 12 | \n",
" 26 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5675 | \n",
" 0.5629 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 13 | \n",
" 28 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5675 | \n",
" 0.5629 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 14 | \n",
" 3 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5675 | \n",
" 0.5613 | \n",
" 0.0062 | \n",
"
\n",
" \n",
" 15 | \n",
" 1 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5675 | \n",
" 0.5613 | \n",
" 0.0062 | \n",
"
\n",
" \n",
" 16 | \n",
" 5 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5663 | \n",
" 0.5575 | \n",
" 0.0088 | \n",
"
\n",
" \n",
" 17 | \n",
" 25 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5655 | \n",
" 0.5618 | \n",
" 0.0037 | \n",
"
\n",
" \n",
" 18 | \n",
" 27 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5655 | \n",
" 0.5618 | \n",
" 0.0037 | \n",
"
\n",
" \n",
" 19 | \n",
" 7 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5644 | \n",
" 0.5625 | \n",
" 0.0019 | \n",
"
\n",
" \n",
" 20 | \n",
" 23 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5641 | \n",
" 0.5656 | \n",
" -0.0015 | \n",
"
\n",
" \n",
" 21 | \n",
" 32 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5634 | \n",
" 0.5565 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 22 | \n",
" 21 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5632 | \n",
" 0.5638 | \n",
" -0.0006 | \n",
"
\n",
" \n",
" 23 | \n",
" 30 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5627 | \n",
" 0.5607 | \n",
" 0.0020 | \n",
"
\n",
" \n",
" 24 | \n",
" 12 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5622 | \n",
" 0.5585 | \n",
" 0.0038 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dropout dev test gap\n",
"0 19 0.0001 16 4 adam plateau 0.10 0.10 0.5822 0.5671 0.0151\n",
"1 17 0.0001 16 4 adam plateau 0.05 0.10 0.5822 0.5671 0.0151\n",
"2 2 0.0001 8 4 adam plateau 0.05 0.15 0.5729 0.5643 0.0086\n",
"3 4 0.0001 8 4 adam plateau 0.10 0.15 0.5729 0.5643 0.0086\n",
"4 20 0.0001 16 4 adam plateau 0.10 0.15 0.5727 0.5601 0.0126\n",
"5 18 0.0001 16 4 adam plateau 0.05 0.15 0.5727 0.5601 0.0126\n",
"6 6 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.15 0.5698 0.5626 0.0072\n",
"7 8 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.15 0.5697 0.5623 0.0074\n",
"8 11 0.0001 8 5 adam plateau 0.10 0.10 0.5691 0.5622 0.0069\n",
"9 9 0.0001 8 5 adam plateau 0.05 0.10 0.5691 0.5622 0.0069\n",
"10 22 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.15 0.5681 0.5641 0.0039\n",
"11 24 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.15 0.5676 0.5649 0.0027\n",
"12 26 0.0001 16 5 adam plateau 0.05 0.15 0.5675 0.5629 0.0045\n",
"13 28 0.0001 16 5 adam plateau 0.10 0.15 0.5675 0.5629 0.0045\n",
"14 3 0.0001 8 4 adam plateau 0.10 0.10 0.5675 0.5613 0.0062\n",
"15 1 0.0001 8 4 adam plateau 0.05 0.10 0.5675 0.5613 0.0062\n",
"16 5 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5663 0.5575 0.0088\n",
"17 25 0.0001 16 5 adam plateau 0.05 0.10 0.5655 0.5618 0.0037\n",
"18 27 0.0001 16 5 adam plateau 0.10 0.10 0.5655 0.5618 0.0037\n",
"19 7 0.0001 8 4 adam huggingface_cosine_with_restarts 0.10 0.10 0.5644 0.5625 0.0019\n",
"20 23 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.10 0.5641 0.5656 -0.0015\n",
"21 32 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5634 0.5565 0.0069\n",
"22 21 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5632 0.5638 -0.0006\n",
"23 30 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5627 0.5607 0.0020\n",
"24 12 0.0001 8 5 adam plateau 0.10 0.15 0.5622 0.5585 0.0038"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/bi/40.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "a488eb61-89a5-4c79-82a6-3980a5621fdd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" optimizer | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dropout | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5723 | \n",
" 0.5676 | \n",
" 0.0047 | \n",
"
\n",
" \n",
" 1 | \n",
" 17 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5723 | \n",
" 0.5676 | \n",
" 0.0047 | \n",
"
\n",
" \n",
" 2 | \n",
" 20 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5721 | \n",
" 0.5691 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 3 | \n",
" 18 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5721 | \n",
" 0.5691 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 4 | \n",
" 30 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5707 | \n",
" 0.5654 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 5 | \n",
" 21 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5701 | \n",
" 0.5630 | \n",
" 0.0071 | \n",
"
\n",
" \n",
" 6 | \n",
" 29 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5700 | \n",
" 0.5695 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 7 | \n",
" 24 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5695 | \n",
" 0.5629 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 8 | \n",
" 27 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5678 | \n",
" 0.5712 | \n",
" -0.0034 | \n",
"
\n",
" \n",
" 9 | \n",
" 25 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5678 | \n",
" 0.5712 | \n",
" -0.0034 | \n",
"
\n",
" \n",
" 10 | \n",
" 13 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5664 | \n",
" 0.5571 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 11 | \n",
" 2 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5663 | \n",
" 0.5630 | \n",
" 0.0033 | \n",
"
\n",
" \n",
" 12 | \n",
" 4 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5663 | \n",
" 0.5630 | \n",
" 0.0033 | \n",
"
\n",
" \n",
" 13 | \n",
" 16 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5656 | \n",
" 0.5630 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 14 | \n",
" 11 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5650 | \n",
" 0.5584 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 15 | \n",
" 9 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5650 | \n",
" 0.5584 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 16 | \n",
" 23 | \n",
" 0.0001 | \n",
" 16 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.5648 | \n",
" 0.5595 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 17 | \n",
" 5 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.10 | \n",
" 0.5647 | \n",
" 0.5586 | \n",
" 0.0061 | \n",
"
\n",
" \n",
" 18 | \n",
" 26 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5643 | \n",
" 0.5575 | \n",
" 0.0067 | \n",
"
\n",
" \n",
" 19 | \n",
" 28 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5643 | \n",
" 0.5575 | \n",
" 0.0067 | \n",
"
\n",
" \n",
" 20 | \n",
" 6 | \n",
" 0.0001 | \n",
" 8 | \n",
" 4 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5638 | \n",
" 0.5608 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 21 | \n",
" 14 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5633 | \n",
" 0.5624 | \n",
" 0.0009 | \n",
"
\n",
" \n",
" 22 | \n",
" 10 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.05 | \n",
" 0.15 | \n",
" 0.5632 | \n",
" 0.5603 | \n",
" 0.0028 | \n",
"
\n",
" \n",
" 23 | \n",
" 12 | \n",
" 0.0001 | \n",
" 8 | \n",
" 5 | \n",
" adam | \n",
" plateau | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5632 | \n",
" 0.5603 | \n",
" 0.0028 | \n",
"
\n",
" \n",
" 24 | \n",
" 32 | \n",
" 0.0001 | \n",
" 16 | \n",
" 5 | \n",
" adam | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.15 | \n",
" 0.5632 | \n",
" 0.5644 | \n",
" -0.0012 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number optimizer scheduler_type warmup_ratio dropout dev test gap\n",
"0 19 0.0001 16 4 adam plateau 0.10 0.10 0.5723 0.5676 0.0047\n",
"1 17 0.0001 16 4 adam plateau 0.05 0.10 0.5723 0.5676 0.0047\n",
"2 20 0.0001 16 4 adam plateau 0.10 0.15 0.5721 0.5691 0.0031\n",
"3 18 0.0001 16 4 adam plateau 0.05 0.15 0.5721 0.5691 0.0031\n",
"4 30 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5707 0.5654 0.0053\n",
"5 21 0.0001 16 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5701 0.5630 0.0071\n",
"6 29 0.0001 16 5 adam huggingface_cosine_with_restarts 0.05 0.10 0.5700 0.5695 0.0005\n",
"7 24 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.15 0.5695 0.5629 0.0066\n",
"8 27 0.0001 16 5 adam plateau 0.10 0.10 0.5678 0.5712 -0.0034\n",
"9 25 0.0001 16 5 adam plateau 0.05 0.10 0.5678 0.5712 -0.0034\n",
"10 13 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.10 0.5664 0.5571 0.0092\n",
"11 2 0.0001 8 4 adam plateau 0.05 0.15 0.5663 0.5630 0.0033\n",
"12 4 0.0001 8 4 adam plateau 0.10 0.15 0.5663 0.5630 0.0033\n",
"13 16 0.0001 8 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5656 0.5630 0.0026\n",
"14 11 0.0001 8 5 adam plateau 0.10 0.10 0.5650 0.5584 0.0066\n",
"15 9 0.0001 8 5 adam plateau 0.05 0.10 0.5650 0.5584 0.0066\n",
"16 23 0.0001 16 4 adam huggingface_cosine_with_restarts 0.10 0.10 0.5648 0.5595 0.0053\n",
"17 5 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.10 0.5647 0.5586 0.0061\n",
"18 26 0.0001 16 5 adam plateau 0.05 0.15 0.5643 0.5575 0.0067\n",
"19 28 0.0001 16 5 adam plateau 0.10 0.15 0.5643 0.5575 0.0067\n",
"20 6 0.0001 8 4 adam huggingface_cosine_with_restarts 0.05 0.15 0.5638 0.5608 0.0031\n",
"21 14 0.0001 8 5 adam huggingface_cosine_with_restarts 0.05 0.15 0.5633 0.5624 0.0009\n",
"22 10 0.0001 8 5 adam plateau 0.05 0.15 0.5632 0.5603 0.0028\n",
"23 12 0.0001 8 5 adam plateau 0.10 0.15 0.5632 0.5603 0.0028\n",
"24 32 0.0001 16 5 adam huggingface_cosine_with_restarts 0.10 0.15 0.5632 0.5644 -0.0012"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/bi/50.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "1e7603d5-f30b-4304-b6b6-a59daf1f5356",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" optimizer | \n",
" lr | \n",
" weight_decay | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" hidden_dim | \n",
" out_features | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 21 | \n",
" adam | \n",
" 0.00010 | \n",
" 0.00 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5862 | \n",
" 0.5820 | \n",
" 0.0043 | \n",
"
\n",
" \n",
" 1 | \n",
" 262 | \n",
" lion | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" 0.5847 | \n",
" 0.5728 | \n",
" 0.0120 | \n",
"
\n",
" \n",
" 2 | \n",
" 226 | \n",
" lion | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" 0.5847 | \n",
" 0.5730 | \n",
" 0.0117 | \n",
"
\n",
" \n",
" 3 | \n",
" 128 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 16 | \n",
" 4 | \n",
" 512 | \n",
" 512 | \n",
" 0.5845 | \n",
" 0.5731 | \n",
" 0.0114 | \n",
"
\n",
" \n",
" 4 | \n",
" 92 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 16 | \n",
" 4 | \n",
" 512 | \n",
" 512 | \n",
" 0.5840 | \n",
" 0.5768 | \n",
" 0.0072 | \n",
"
\n",
" \n",
" 5 | \n",
" 104 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 32 | \n",
" 4 | \n",
" 512 | \n",
" 512 | \n",
" 0.5834 | \n",
" 0.5743 | \n",
" 0.0091 | \n",
"
\n",
" \n",
" 6 | \n",
" 5 | \n",
" adam | \n",
" 0.00010 | \n",
" 0.00 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5834 | \n",
" 0.5740 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 7 | \n",
" 127 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 16 | \n",
" 4 | \n",
" 512 | \n",
" 256 | \n",
" 0.5833 | \n",
" 0.5759 | \n",
" 0.0074 | \n",
"
\n",
" \n",
" 8 | \n",
" 80 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 8 | \n",
" 4 | \n",
" 512 | \n",
" 512 | \n",
" 0.5826 | \n",
" 0.5739 | \n",
" 0.0087 | \n",
"
\n",
" \n",
" 9 | \n",
" 139 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 32 | \n",
" 4 | \n",
" 512 | \n",
" 256 | \n",
" 0.5820 | \n",
" 0.5736 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 10 | \n",
" 103 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 32 | \n",
" 4 | \n",
" 512 | \n",
" 256 | \n",
" 0.5820 | \n",
" 0.5765 | \n",
" 0.0055 | \n",
"
\n",
" \n",
" 11 | \n",
" 34 | \n",
" adam | \n",
" 0.00010 | \n",
" 0.00 | \n",
" 32 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" 0.5815 | \n",
" 0.5788 | \n",
" 0.0027 | \n",
"
\n",
" \n",
" 12 | \n",
" 108 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 32 | \n",
" 5 | \n",
" 512 | \n",
" 512 | \n",
" 0.5814 | \n",
" 0.5796 | \n",
" 0.0018 | \n",
"
\n",
" \n",
" 13 | \n",
" 245 | \n",
" lion | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 32 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5813 | \n",
" 0.5758 | \n",
" 0.0055 | \n",
"
\n",
" \n",
" 14 | \n",
" 233 | \n",
" lion | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5812 | \n",
" 0.5723 | \n",
" 0.0088 | \n",
"
\n",
" \n",
" 15 | \n",
" 33 | \n",
" adam | \n",
" 0.00010 | \n",
" 0.00 | \n",
" 32 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5811 | \n",
" 0.5792 | \n",
" 0.0020 | \n",
"
\n",
" \n",
" 16 | \n",
" 281 | \n",
" lion | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 32 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5808 | \n",
" 0.5759 | \n",
" 0.0050 | \n",
"
\n",
" \n",
" 17 | \n",
" 230 | \n",
" lion | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" 0.5806 | \n",
" 0.5714 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 18 | \n",
" 49 | \n",
" adam | \n",
" 0.00010 | \n",
" 0.01 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5805 | \n",
" 0.5729 | \n",
" 0.0076 | \n",
"
\n",
" \n",
" 19 | \n",
" 266 | \n",
" lion | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" 0.5803 | \n",
" 0.5718 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 20 | \n",
" 116 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 8 | \n",
" 4 | \n",
" 512 | \n",
" 512 | \n",
" 0.5802 | \n",
" 0.5716 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 21 | \n",
" 9 | \n",
" adam | \n",
" 0.00010 | \n",
" 0.00 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5801 | \n",
" 0.5762 | \n",
" 0.0039 | \n",
"
\n",
" \n",
" 22 | \n",
" 115 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.01 | \n",
" 8 | \n",
" 4 | \n",
" 512 | \n",
" 256 | \n",
" 0.5799 | \n",
" 0.5719 | \n",
" 0.0080 | \n",
"
\n",
" \n",
" 23 | \n",
" 22 | \n",
" adam | \n",
" 0.00010 | \n",
" 0.00 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" 0.5798 | \n",
" 0.5714 | \n",
" 0.0084 | \n",
"
\n",
" \n",
" 24 | \n",
" 79 | \n",
" adam | \n",
" 0.00001 | \n",
" 0.00 | \n",
" 8 | \n",
" 4 | \n",
" 512 | \n",
" 256 | \n",
" 0.5797 | \n",
" 0.5759 | \n",
" 0.0038 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step optimizer lr weight_decay num_transformer_heads tr_layer_number hidden_dim out_features dev test gap\n",
"0 21 adam 0.00010 0.00 16 5 256 256 0.5862 0.5820 0.0043\n",
"1 262 lion 0.00001 0.01 8 5 256 512 0.5847 0.5728 0.0120\n",
"2 226 lion 0.00001 0.00 8 5 256 512 0.5847 0.5730 0.0117\n",
"3 128 adam 0.00001 0.01 16 4 512 512 0.5845 0.5731 0.0114\n",
"4 92 adam 0.00001 0.00 16 4 512 512 0.5840 0.5768 0.0072\n",
"5 104 adam 0.00001 0.00 32 4 512 512 0.5834 0.5743 0.0091\n",
"6 5 adam 0.00010 0.00 8 4 256 256 0.5834 0.5740 0.0094\n",
"7 127 adam 0.00001 0.01 16 4 512 256 0.5833 0.5759 0.0074\n",
"8 80 adam 0.00001 0.00 8 4 512 512 0.5826 0.5739 0.0087\n",
"9 139 adam 0.00001 0.01 32 4 512 256 0.5820 0.5736 0.0085\n",
"10 103 adam 0.00001 0.00 32 4 512 256 0.5820 0.5765 0.0055\n",
"11 34 adam 0.00010 0.00 32 5 256 512 0.5815 0.5788 0.0027\n",
"12 108 adam 0.00001 0.00 32 5 512 512 0.5814 0.5796 0.0018\n",
"13 245 lion 0.00001 0.00 32 4 256 256 0.5813 0.5758 0.0055\n",
"14 233 lion 0.00001 0.00 16 4 256 256 0.5812 0.5723 0.0088\n",
"15 33 adam 0.00010 0.00 32 5 256 256 0.5811 0.5792 0.0020\n",
"16 281 lion 0.00001 0.01 32 4 256 256 0.5808 0.5759 0.0050\n",
"17 230 lion 0.00001 0.00 16 3 256 512 0.5806 0.5714 0.0092\n",
"18 49 adam 0.00010 0.01 16 3 256 256 0.5805 0.5729 0.0076\n",
"19 266 lion 0.00001 0.01 16 3 256 512 0.5803 0.5718 0.0085\n",
"20 116 adam 0.00001 0.01 8 4 512 512 0.5802 0.5716 0.0085\n",
"21 9 adam 0.00010 0.00 8 5 256 256 0.5801 0.5762 0.0039\n",
"22 115 adam 0.00001 0.01 8 4 512 256 0.5799 0.5719 0.0080\n",
"23 22 adam 0.00010 0.00 16 5 256 512 0.5798 0.5714 0.0084\n",
"24 79 adam 0.00001 0.00 8 4 512 256 0.5797 0.5759 0.0038"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/5862_адам лучший.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "22d74f6b-4d7f-4a1a-a98b-6754c286712d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" momentum | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" hidden_dim | \n",
" out_features | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5 | \n",
" 0.0010 | \n",
" 0.5 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5862 | \n",
" 0.5841 | \n",
" 0.0021 | \n",
"
\n",
" \n",
" 1 | \n",
" 8 | \n",
" 0.0010 | \n",
" 0.9 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5857 | \n",
" 0.5682 | \n",
" 0.0175 | \n",
"
\n",
" \n",
" 2 | \n",
" 23 | \n",
" 0.0001 | \n",
" 0.9 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5807 | \n",
" 0.5764 | \n",
" 0.0043 | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
" 0.0010 | \n",
" 0.5 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5801 | \n",
" 0.5707 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 4 | \n",
" 11 | \n",
" 0.0010 | \n",
" 0.9 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5791 | \n",
" 0.5732 | \n",
" 0.0059 | \n",
"
\n",
" \n",
" 5 | \n",
" 20 | \n",
" 0.0001 | \n",
" 0.9 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5750 | \n",
" 0.5738 | \n",
" 0.0013 | \n",
"
\n",
" \n",
" 6 | \n",
" 24 | \n",
" 0.0001 | \n",
" 0.9 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5699 | \n",
" 0.5715 | \n",
" -0.0016 | \n",
"
\n",
" \n",
" 7 | \n",
" 3 | \n",
" 0.0010 | \n",
" 0.5 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5696 | \n",
" 0.5705 | \n",
" -0.0009 | \n",
"
\n",
" \n",
" 8 | \n",
" 10 | \n",
" 0.0010 | \n",
" 0.9 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5688 | \n",
" 0.5594 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 9 | \n",
" 12 | \n",
" 0.0010 | \n",
" 0.9 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5672 | \n",
" 0.5726 | \n",
" -0.0054 | \n",
"
\n",
" \n",
" 10 | \n",
" 9 | \n",
" 0.0010 | \n",
" 0.9 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5657 | \n",
" 0.5705 | \n",
" -0.0049 | \n",
"
\n",
" \n",
" 11 | \n",
" 21 | \n",
" 0.0001 | \n",
" 0.9 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5652 | \n",
" 0.5696 | \n",
" -0.0045 | \n",
"
\n",
" \n",
" 12 | \n",
" 6 | \n",
" 0.0010 | \n",
" 0.5 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5643 | \n",
" 0.5718 | \n",
" -0.0075 | \n",
"
\n",
" \n",
" 13 | \n",
" 7 | \n",
" 0.0010 | \n",
" 0.9 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5612 | \n",
" 0.5586 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 14 | \n",
" 4 | \n",
" 0.0010 | \n",
" 0.5 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5580 | \n",
" 0.5624 | \n",
" -0.0044 | \n",
"
\n",
" \n",
" 15 | \n",
" 14 | \n",
" 0.0001 | \n",
" 0.5 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5569 | \n",
" 0.5644 | \n",
" -0.0075 | \n",
"
\n",
" \n",
" 16 | \n",
" 17 | \n",
" 0.0001 | \n",
" 0.5 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" 0.5567 | \n",
" 0.5531 | \n",
" 0.0036 | \n",
"
\n",
" \n",
" 17 | \n",
" 15 | \n",
" 0.0001 | \n",
" 0.5 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5552 | \n",
" 0.5499 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 18 | \n",
" 1 | \n",
" 0.0010 | \n",
" 0.5 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5496 | \n",
" 0.5565 | \n",
" -0.0069 | \n",
"
\n",
" \n",
" 19 | \n",
" 18 | \n",
" 0.0001 | \n",
" 0.5 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" 0.5477 | \n",
" 0.5486 | \n",
" -0.0009 | \n",
"
\n",
" \n",
" 20 | \n",
" 13 | \n",
" 0.0001 | \n",
" 0.5 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5361 | \n",
" 0.5440 | \n",
" -0.0078 | \n",
"
\n",
" \n",
" 21 | \n",
" 19 | \n",
" 0.0001 | \n",
" 0.9 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5359 | \n",
" 0.5513 | \n",
" -0.0154 | \n",
"
\n",
" \n",
" 22 | \n",
" 22 | \n",
" 0.0001 | \n",
" 0.9 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5339 | \n",
" 0.5474 | \n",
" -0.0135 | \n",
"
\n",
" \n",
" 23 | \n",
" 16 | \n",
" 0.0001 | \n",
" 0.5 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" 0.5313 | \n",
" 0.5505 | \n",
" -0.0192 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr momentum num_transformer_heads tr_layer_number hidden_dim out_features dev test gap\n",
"0 5 0.0010 0.5 16 4 256 256 0.5862 0.5841 0.0021\n",
"1 8 0.0010 0.9 8 4 256 256 0.5857 0.5682 0.0175\n",
"2 23 0.0001 0.9 16 4 256 256 0.5807 0.5764 0.0043\n",
"3 2 0.0010 0.5 8 4 256 256 0.5801 0.5707 0.0094\n",
"4 11 0.0010 0.9 16 4 256 256 0.5791 0.5732 0.0059\n",
"5 20 0.0001 0.9 8 4 256 256 0.5750 0.5738 0.0013\n",
"6 24 0.0001 0.9 16 5 256 256 0.5699 0.5715 -0.0016\n",
"7 3 0.0010 0.5 8 5 256 256 0.5696 0.5705 -0.0009\n",
"8 10 0.0010 0.9 16 3 256 256 0.5688 0.5594 0.0094\n",
"9 12 0.0010 0.9 16 5 256 256 0.5672 0.5726 -0.0054\n",
"10 9 0.0010 0.9 8 5 256 256 0.5657 0.5705 -0.0049\n",
"11 21 0.0001 0.9 8 5 256 256 0.5652 0.5696 -0.0045\n",
"12 6 0.0010 0.5 16 5 256 256 0.5643 0.5718 -0.0075\n",
"13 7 0.0010 0.9 8 3 256 256 0.5612 0.5586 0.0026\n",
"14 4 0.0010 0.5 16 3 256 256 0.5580 0.5624 -0.0044\n",
"15 14 0.0001 0.5 8 4 256 256 0.5569 0.5644 -0.0075\n",
"16 17 0.0001 0.5 16 4 256 256 0.5567 0.5531 0.0036\n",
"17 15 0.0001 0.5 8 5 256 256 0.5552 0.5499 0.0053\n",
"18 1 0.0010 0.5 8 3 256 256 0.5496 0.5565 -0.0069\n",
"19 18 0.0001 0.5 16 5 256 256 0.5477 0.5486 -0.0009\n",
"20 13 0.0001 0.5 8 3 256 256 0.5361 0.5440 -0.0078\n",
"21 19 0.0001 0.9 8 3 256 256 0.5359 0.5513 -0.0154\n",
"22 22 0.0001 0.9 16 3 256 256 0.5339 0.5474 -0.0135\n",
"23 16 0.0001 0.5 16 3 256 256 0.5313 0.5505 -0.0192"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/sgd_2.txt\",25)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(25))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "35ae96f7-7ceb-46da-9411-ceba258a98ac",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" optimizer | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" hidden_dim | \n",
" out_features | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 68 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5768 | \n",
" 0.5797 | \n",
" -0.0029 | \n",
"
\n",
" \n",
" 1 | \n",
" 67 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5768 | \n",
" 0.5797 | \n",
" -0.0029 | \n",
"
\n",
" \n",
" 2 | \n",
" 92 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5742 | \n",
" 0.5771 | \n",
" -0.0028 | \n",
"
\n",
" \n",
" 3 | \n",
" 91 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5742 | \n",
" 0.5771 | \n",
" -0.0028 | \n",
"
\n",
" \n",
" 4 | \n",
" 65 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5737 | \n",
" 0.5766 | \n",
" -0.0029 | \n",
"
\n",
" \n",
" 5 | \n",
" 13 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5749 | \n",
" 0.5765 | \n",
" -0.0015 | \n",
"
\n",
" \n",
" 6 | \n",
" 34 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5774 | \n",
" 0.5760 | \n",
" 0.0014 | \n",
"
\n",
" \n",
" 7 | \n",
" 66 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5765 | \n",
" 0.5760 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 8 | \n",
" 33 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5759 | \n",
" 0.5757 | \n",
" 0.0002 | \n",
"
\n",
" \n",
" 9 | \n",
" 46 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5784 | \n",
" 0.5752 | \n",
" 0.0032 | \n",
"
\n",
" \n",
" 10 | \n",
" 37 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5762 | \n",
" 0.5744 | \n",
" 0.0018 | \n",
"
\n",
" \n",
" 11 | \n",
" 18 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5730 | \n",
" 0.5724 | \n",
" 0.0006 | \n",
"
\n",
" \n",
" 12 | \n",
" 90 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5728 | \n",
" 0.5720 | \n",
" 0.0008 | \n",
"
\n",
" \n",
" 13 | \n",
" 26 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5781 | \n",
" 0.5713 | \n",
" 0.0068 | \n",
"
\n",
" \n",
" 14 | \n",
" 89 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5710 | \n",
" 0.5711 | \n",
" -0.0001 | \n",
"
\n",
" \n",
" 15 | \n",
" 51 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5761 | \n",
" 0.5701 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 16 | \n",
" 52 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5761 | \n",
" 0.5701 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 17 | \n",
" 9 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5781 | \n",
" 0.5700 | \n",
" 0.0080 | \n",
"
\n",
" \n",
" 18 | \n",
" 19 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5743 | \n",
" 0.5698 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 19 | \n",
" 20 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5743 | \n",
" 0.5698 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 20 | \n",
" 17 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5670 | \n",
" 0.5697 | \n",
" -0.0027 | \n",
"
\n",
" \n",
" 21 | \n",
" 73 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5757 | \n",
" 0.5697 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 22 | \n",
" 44 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5779 | \n",
" 0.5694 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 23 | \n",
" 43 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5779 | \n",
" 0.5694 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 24 | \n",
" 82 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5780 | \n",
" 0.5693 | \n",
" 0.0087 | \n",
"
\n",
" \n",
" 25 | \n",
" 81 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5773 | \n",
" 0.5692 | \n",
" 0.0082 | \n",
"
\n",
" \n",
" 26 | \n",
" 41 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5720 | \n",
" 0.5689 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 27 | \n",
" 16 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5783 | \n",
" 0.5687 | \n",
" 0.0096 | \n",
"
\n",
" \n",
" 28 | \n",
" 15 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5783 | \n",
" 0.5687 | \n",
" 0.0096 | \n",
"
\n",
" \n",
" 29 | \n",
" 42 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5746 | \n",
" 0.5686 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 30 | \n",
" 85 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5733 | \n",
" 0.5685 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 31 | \n",
" 38 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5728 | \n",
" 0.5682 | \n",
" 0.0046 | \n",
"
\n",
" \n",
" 32 | \n",
" 94 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5754 | \n",
" 0.5676 | \n",
" 0.0078 | \n",
"
\n",
" \n",
" 33 | \n",
" 70 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5713 | \n",
" 0.5675 | \n",
" 0.0038 | \n",
"
\n",
" \n",
" 34 | \n",
" 86 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5708 | \n",
" 0.5675 | \n",
" 0.0033 | \n",
"
\n",
" \n",
" 35 | \n",
" 50 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5771 | \n",
" 0.5674 | \n",
" 0.0097 | \n",
"
\n",
" \n",
" 36 | \n",
" 30 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5711 | \n",
" 0.5674 | \n",
" 0.0037 | \n",
"
\n",
" \n",
" 37 | \n",
" 58 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5729 | \n",
" 0.5673 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 38 | \n",
" 48 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5715 | \n",
" 0.5666 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 39 | \n",
" 47 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5715 | \n",
" 0.5666 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 40 | \n",
" 35 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5767 | \n",
" 0.5662 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 41 | \n",
" 36 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5767 | \n",
" 0.5662 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 42 | \n",
" 10 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5734 | \n",
" 0.5661 | \n",
" 0.0073 | \n",
"
\n",
" \n",
" 43 | \n",
" 83 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5742 | \n",
" 0.5660 | \n",
" 0.0081 | \n",
"
\n",
" \n",
" 44 | \n",
" 84 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5742 | \n",
" 0.5660 | \n",
" 0.0081 | \n",
"
\n",
" \n",
" 45 | \n",
" 57 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5717 | \n",
" 0.5658 | \n",
" 0.0058 | \n",
"
\n",
" \n",
" 46 | \n",
" 75 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5717 | \n",
" 0.5656 | \n",
" 0.0061 | \n",
"
\n",
" \n",
" 47 | \n",
" 71 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5749 | \n",
" 0.5656 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 48 | \n",
" 72 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5749 | \n",
" 0.5656 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 49 | \n",
" 49 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5786 | \n",
" 0.5656 | \n",
" 0.0130 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step optimizer lr num_transformer_heads tr_layer_number hidden_dim out_features scheduler_type warmup_ratio dev test \\\n",
"0 68 adam 0.00001 8 5 256 256 plateau 0.10 0.5768 0.5797 \n",
"1 67 adam 0.00001 8 5 256 256 plateau 0.05 0.5768 0.5797 \n",
"2 92 adam 0.00001 16 5 256 256 plateau 0.10 0.5742 0.5771 \n",
"3 91 adam 0.00001 16 5 256 256 plateau 0.05 0.5742 0.5771 \n",
"4 65 adam 0.00001 8 5 256 256 huggingface_cosine_with_restarts 0.05 0.5737 0.5766 \n",
"5 13 adam 0.00010 8 4 256 512 huggingface_cosine_with_restarts 0.05 0.5749 0.5765 \n",
"6 34 adam 0.00010 16 4 256 256 huggingface_cosine_with_restarts 0.10 0.5774 0.5760 \n",
"7 66 adam 0.00001 8 5 256 256 huggingface_cosine_with_restarts 0.10 0.5765 0.5760 \n",
"8 33 adam 0.00010 16 4 256 256 huggingface_cosine_with_restarts 0.05 0.5759 0.5757 \n",
"9 46 adam 0.00010 16 5 256 512 huggingface_cosine_with_restarts 0.10 0.5784 0.5752 \n",
"10 37 adam 0.00010 16 4 256 512 huggingface_cosine_with_restarts 0.05 0.5762 0.5744 \n",
"11 18 adam 0.00010 8 5 256 256 huggingface_cosine_with_restarts 0.10 0.5730 0.5724 \n",
"12 90 adam 0.00001 16 5 256 256 huggingface_cosine_with_restarts 0.10 0.5728 0.5720 \n",
"13 26 adam 0.00010 16 3 256 256 huggingface_cosine_with_restarts 0.10 0.5781 0.5713 \n",
"14 89 adam 0.00001 16 5 256 256 huggingface_cosine_with_restarts 0.05 0.5710 0.5711 \n",
"15 51 adam 0.00001 8 3 256 256 plateau 0.05 0.5761 0.5701 \n",
"16 52 adam 0.00001 8 3 256 256 plateau 0.10 0.5761 0.5701 \n",
"17 9 adam 0.00010 8 4 256 256 huggingface_cosine_with_restarts 0.05 0.5781 0.5700 \n",
"18 19 adam 0.00010 8 5 256 256 plateau 0.05 0.5743 0.5698 \n",
"19 20 adam 0.00010 8 5 256 256 plateau 0.10 0.5743 0.5698 \n",
"20 17 adam 0.00010 8 5 256 256 huggingface_cosine_with_restarts 0.05 0.5670 0.5697 \n",
"21 73 adam 0.00001 16 3 256 256 huggingface_cosine_with_restarts 0.05 0.5757 0.5697 \n",
"22 44 adam 0.00010 16 5 256 256 plateau 0.10 0.5779 0.5694 \n",
"23 43 adam 0.00010 16 5 256 256 plateau 0.05 0.5779 0.5694 \n",
"24 82 adam 0.00001 16 4 256 256 huggingface_cosine_with_restarts 0.10 0.5780 0.5693 \n",
"25 81 adam 0.00001 16 4 256 256 huggingface_cosine_with_restarts 0.05 0.5773 0.5692 \n",
"26 41 adam 0.00010 16 5 256 256 huggingface_cosine_with_restarts 0.05 0.5720 0.5689 \n",
"27 16 adam 0.00010 8 4 256 512 plateau 0.10 0.5783 0.5687 \n",
"28 15 adam 0.00010 8 4 256 512 plateau 0.05 0.5783 0.5687 \n",
"29 42 adam 0.00010 16 5 256 256 huggingface_cosine_with_restarts 0.10 0.5746 0.5686 \n",
"30 85 adam 0.00001 16 4 256 512 huggingface_cosine_with_restarts 0.05 0.5733 0.5685 \n",
"31 38 adam 0.00010 16 4 256 512 huggingface_cosine_with_restarts 0.10 0.5728 0.5682 \n",
"32 94 adam 0.00001 16 5 256 512 huggingface_cosine_with_restarts 0.10 0.5754 0.5676 \n",
"33 70 adam 0.00001 8 5 256 512 huggingface_cosine_with_restarts 0.10 0.5713 0.5675 \n",
"34 86 adam 0.00001 16 4 256 512 huggingface_cosine_with_restarts 0.10 0.5708 0.5675 \n",
"35 50 adam 0.00001 8 3 256 256 huggingface_cosine_with_restarts 0.10 0.5771 0.5674 \n",
"36 30 adam 0.00010 16 3 256 512 huggingface_cosine_with_restarts 0.10 0.5711 0.5674 \n",
"37 58 adam 0.00001 8 4 256 256 huggingface_cosine_with_restarts 0.10 0.5729 0.5673 \n",
"38 48 adam 0.00010 16 5 256 512 plateau 0.10 0.5715 0.5666 \n",
"39 47 adam 0.00010 16 5 256 512 plateau 0.05 0.5715 0.5666 \n",
"40 35 adam 0.00010 16 4 256 256 plateau 0.05 0.5767 0.5662 \n",
"41 36 adam 0.00010 16 4 256 256 plateau 0.10 0.5767 0.5662 \n",
"42 10 adam 0.00010 8 4 256 256 huggingface_cosine_with_restarts 0.10 0.5734 0.5661 \n",
"43 83 adam 0.00001 16 4 256 256 plateau 0.05 0.5742 0.5660 \n",
"44 84 adam 0.00001 16 4 256 256 plateau 0.10 0.5742 0.5660 \n",
"45 57 adam 0.00001 8 4 256 256 huggingface_cosine_with_restarts 0.05 0.5717 0.5658 \n",
"46 75 adam 0.00001 16 3 256 256 plateau 0.05 0.5717 0.5656 \n",
"47 71 adam 0.00001 8 5 256 512 plateau 0.05 0.5749 0.5656 \n",
"48 72 adam 0.00001 8 5 256 512 plateau 0.10 0.5749 0.5656 \n",
"49 49 adam 0.00001 8 3 256 256 huggingface_cosine_with_restarts 0.05 0.5786 0.5656 \n",
"\n",
" gap \n",
"0 -0.0029 \n",
"1 -0.0029 \n",
"2 -0.0028 \n",
"3 -0.0028 \n",
"4 -0.0029 \n",
"5 -0.0015 \n",
"6 0.0014 \n",
"7 0.0005 \n",
"8 0.0002 \n",
"9 0.0032 \n",
"10 0.0018 \n",
"11 0.0006 \n",
"12 0.0008 \n",
"13 0.0068 \n",
"14 -0.0001 \n",
"15 0.0060 \n",
"16 0.0060 \n",
"17 0.0080 \n",
"18 0.0045 \n",
"19 0.0045 \n",
"20 -0.0027 \n",
"21 0.0060 \n",
"22 0.0085 \n",
"23 0.0085 \n",
"24 0.0087 \n",
"25 0.0082 \n",
"26 0.0031 \n",
"27 0.0096 \n",
"28 0.0096 \n",
"29 0.0060 \n",
"30 0.0048 \n",
"31 0.0046 \n",
"32 0.0078 \n",
"33 0.0038 \n",
"34 0.0033 \n",
"35 0.0097 \n",
"36 0.0037 \n",
"37 0.0056 \n",
"38 0.0049 \n",
"39 0.0049 \n",
"40 0.0105 \n",
"41 0.0105 \n",
"42 0.0073 \n",
"43 0.0081 \n",
"44 0.0081 \n",
"45 0.0058 \n",
"46 0.0061 \n",
"47 0.0094 \n",
"48 0.0094 \n",
"49 0.0130 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/biformer.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "5832fe53-8308-4195-bb94-db7a86148076",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" optimizer | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" hidden_dim | \n",
" out_features | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 28 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5750 | \n",
" 0.5758 | \n",
" -0.0008 | \n",
"
\n",
" \n",
" 1 | \n",
" 27 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5750 | \n",
" 0.5758 | \n",
" -0.0008 | \n",
"
\n",
" \n",
" 2 | \n",
" 25 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5721 | \n",
" 0.5716 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 3 | \n",
" 56 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5717 | \n",
" 0.5705 | \n",
" 0.0012 | \n",
"
\n",
" \n",
" 4 | \n",
" 55 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5717 | \n",
" 0.5705 | \n",
" 0.0012 | \n",
"
\n",
" \n",
" 5 | \n",
" 17 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5683 | \n",
" 0.5690 | \n",
" -0.0006 | \n",
"
\n",
" \n",
" 6 | \n",
" 26 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5758 | \n",
" 0.5687 | \n",
" 0.0071 | \n",
"
\n",
" \n",
" 7 | \n",
" 37 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5674 | \n",
" 0.5676 | \n",
" -0.0002 | \n",
"
\n",
" \n",
" 8 | \n",
" 44 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5681 | \n",
" 0.5671 | \n",
" 0.0010 | \n",
"
\n",
" \n",
" 9 | \n",
" 43 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5681 | \n",
" 0.5671 | \n",
" 0.0010 | \n",
"
\n",
" \n",
" 10 | \n",
" 31 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5624 | \n",
" 0.5659 | \n",
" -0.0035 | \n",
"
\n",
" \n",
" 11 | \n",
" 32 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5624 | \n",
" 0.5659 | \n",
" -0.0035 | \n",
"
\n",
" \n",
" 12 | \n",
" 2 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5661 | \n",
" 0.5656 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 13 | \n",
" 80 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5689 | \n",
" 0.5650 | \n",
" 0.0040 | \n",
"
\n",
" \n",
" 14 | \n",
" 79 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5689 | \n",
" 0.5650 | \n",
" 0.0040 | \n",
"
\n",
" \n",
" 15 | \n",
" 53 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5665 | \n",
" 0.5649 | \n",
" 0.0016 | \n",
"
\n",
" \n",
" 16 | \n",
" 77 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5690 | \n",
" 0.5649 | \n",
" 0.0041 | \n",
"
\n",
" \n",
" 17 | \n",
" 1 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5639 | \n",
" 0.5648 | \n",
" -0.0009 | \n",
"
\n",
" \n",
" 18 | \n",
" 78 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5656 | \n",
" 0.5645 | \n",
" 0.0011 | \n",
"
\n",
" \n",
" 19 | \n",
" 62 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5649 | \n",
" 0.5644 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 20 | \n",
" 46 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5596 | \n",
" 0.5638 | \n",
" -0.0042 | \n",
"
\n",
" \n",
" 21 | \n",
" 14 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5722 | \n",
" 0.5631 | \n",
" 0.0091 | \n",
"
\n",
" \n",
" 22 | \n",
" 95 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5654 | \n",
" 0.5631 | \n",
" 0.0023 | \n",
"
\n",
" \n",
" 23 | \n",
" 96 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5654 | \n",
" 0.5631 | \n",
" 0.0023 | \n",
"
\n",
" \n",
" 24 | \n",
" 50 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5702 | \n",
" 0.5629 | \n",
" 0.0073 | \n",
"
\n",
" \n",
" 25 | \n",
" 73 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5674 | \n",
" 0.5628 | \n",
" 0.0046 | \n",
"
\n",
" \n",
" 26 | \n",
" 6 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5708 | \n",
" 0.5626 | \n",
" 0.0082 | \n",
"
\n",
" \n",
" 27 | \n",
" 61 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5649 | \n",
" 0.5624 | \n",
" 0.0025 | \n",
"
\n",
" \n",
" 28 | \n",
" 74 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5676 | \n",
" 0.5623 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 29 | \n",
" 91 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5586 | \n",
" 0.5622 | \n",
" -0.0036 | \n",
"
\n",
" \n",
" 30 | \n",
" 92 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5586 | \n",
" 0.5622 | \n",
" -0.0036 | \n",
"
\n",
" \n",
" 31 | \n",
" 8 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5669 | \n",
" 0.5616 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 32 | \n",
" 38 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5640 | \n",
" 0.5616 | \n",
" 0.0024 | \n",
"
\n",
" \n",
" 33 | \n",
" 7 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5669 | \n",
" 0.5616 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 34 | \n",
" 54 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5646 | \n",
" 0.5615 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 35 | \n",
" 93 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5673 | \n",
" 0.5612 | \n",
" 0.0061 | \n",
"
\n",
" \n",
" 36 | \n",
" 4 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5665 | \n",
" 0.5610 | \n",
" 0.0054 | \n",
"
\n",
" \n",
" 37 | \n",
" 3 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5665 | \n",
" 0.5610 | \n",
" 0.0054 | \n",
"
\n",
" \n",
" 38 | \n",
" 49 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5706 | \n",
" 0.5609 | \n",
" 0.0097 | \n",
"
\n",
" \n",
" 39 | \n",
" 75 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5656 | \n",
" 0.5606 | \n",
" 0.0050 | \n",
"
\n",
" \n",
" 40 | \n",
" 76 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 256 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5656 | \n",
" 0.5606 | \n",
" 0.0050 | \n",
"
\n",
" \n",
" 41 | \n",
" 42 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5666 | \n",
" 0.5603 | \n",
" 0.0064 | \n",
"
\n",
" \n",
" 42 | \n",
" 89 | \n",
" adam | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" 0.5616 | \n",
" 0.5600 | \n",
" 0.0017 | \n",
"
\n",
" \n",
" 43 | \n",
" 10 | \n",
" adam | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5651 | \n",
" 0.5597 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 44 | \n",
" 70 | \n",
" adam | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5691 | \n",
" 0.5594 | \n",
" 0.0097 | \n",
"
\n",
" \n",
" 45 | \n",
" 34 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 256 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" 0.5650 | \n",
" 0.5589 | \n",
" 0.0061 | \n",
"
\n",
" \n",
" 46 | \n",
" 47 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5613 | \n",
" 0.5587 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 47 | \n",
" 48 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5613 | \n",
" 0.5587 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 48 | \n",
" 39 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.05 | \n",
" 0.5612 | \n",
" 0.5586 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 49 | \n",
" 40 | \n",
" adam | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 512 | \n",
" plateau | \n",
" 0.10 | \n",
" 0.5612 | \n",
" 0.5586 | \n",
" 0.0026 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step optimizer lr num_transformer_heads tr_layer_number hidden_dim out_features scheduler_type warmup_ratio dev test \\\n",
"0 28 adam 0.00010 16 3 256 256 plateau 0.10 0.5750 0.5758 \n",
"1 27 adam 0.00010 16 3 256 256 plateau 0.05 0.5750 0.5758 \n",
"2 25 adam 0.00010 16 3 256 256 huggingface_cosine_with_restarts 0.05 0.5721 0.5716 \n",
"3 56 adam 0.00001 8 3 256 512 plateau 0.10 0.5717 0.5705 \n",
"4 55 adam 0.00001 8 3 256 512 plateau 0.05 0.5717 0.5705 \n",
"5 17 adam 0.00010 8 5 256 256 huggingface_cosine_with_restarts 0.05 0.5683 0.5690 \n",
"6 26 adam 0.00010 16 3 256 256 huggingface_cosine_with_restarts 0.10 0.5758 0.5687 \n",
"7 37 adam 0.00010 16 4 256 512 huggingface_cosine_with_restarts 0.05 0.5674 0.5676 \n",
"8 44 adam 0.00010 16 5 256 256 plateau 0.10 0.5681 0.5671 \n",
"9 43 adam 0.00010 16 5 256 256 plateau 0.05 0.5681 0.5671 \n",
"10 31 adam 0.00010 16 3 256 512 plateau 0.05 0.5624 0.5659 \n",
"11 32 adam 0.00010 16 3 256 512 plateau 0.10 0.5624 0.5659 \n",
"12 2 adam 0.00010 8 3 256 256 huggingface_cosine_with_restarts 0.10 0.5661 0.5656 \n",
"13 80 adam 0.00001 16 3 256 512 plateau 0.10 0.5689 0.5650 \n",
"14 79 adam 0.00001 16 3 256 512 plateau 0.05 0.5689 0.5650 \n",
"15 53 adam 0.00001 8 3 256 512 huggingface_cosine_with_restarts 0.05 0.5665 0.5649 \n",
"16 77 adam 0.00001 16 3 256 512 huggingface_cosine_with_restarts 0.05 0.5690 0.5649 \n",
"17 1 adam 0.00010 8 3 256 256 huggingface_cosine_with_restarts 0.05 0.5639 0.5648 \n",
"18 78 adam 0.00001 16 3 256 512 huggingface_cosine_with_restarts 0.10 0.5656 0.5645 \n",
"19 62 adam 0.00001 8 4 256 512 huggingface_cosine_with_restarts 0.10 0.5649 0.5644 \n",
"20 46 adam 0.00010 16 5 256 512 huggingface_cosine_with_restarts 0.10 0.5596 0.5638 \n",
"21 14 adam 0.00010 8 4 256 512 huggingface_cosine_with_restarts 0.10 0.5722 0.5631 \n",
"22 95 adam 0.00001 16 5 256 512 plateau 0.05 0.5654 0.5631 \n",
"23 96 adam 0.00001 16 5 256 512 plateau 0.10 0.5654 0.5631 \n",
"24 50 adam 0.00001 8 3 256 256 huggingface_cosine_with_restarts 0.10 0.5702 0.5629 \n",
"25 73 adam 0.00001 16 3 256 256 huggingface_cosine_with_restarts 0.05 0.5674 0.5628 \n",
"26 6 adam 0.00010 8 3 256 512 huggingface_cosine_with_restarts 0.10 0.5708 0.5626 \n",
"27 61 adam 0.00001 8 4 256 512 huggingface_cosine_with_restarts 0.05 0.5649 0.5624 \n",
"28 74 adam 0.00001 16 3 256 256 huggingface_cosine_with_restarts 0.10 0.5676 0.5623 \n",
"29 91 adam 0.00001 16 5 256 256 plateau 0.05 0.5586 0.5622 \n",
"30 92 adam 0.00001 16 5 256 256 plateau 0.10 0.5586 0.5622 \n",
"31 8 adam 0.00010 8 3 256 512 plateau 0.10 0.5669 0.5616 \n",
"32 38 adam 0.00010 16 4 256 512 huggingface_cosine_with_restarts 0.10 0.5640 0.5616 \n",
"33 7 adam 0.00010 8 3 256 512 plateau 0.05 0.5669 0.5616 \n",
"34 54 adam 0.00001 8 3 256 512 huggingface_cosine_with_restarts 0.10 0.5646 0.5615 \n",
"35 93 adam 0.00001 16 5 256 512 huggingface_cosine_with_restarts 0.05 0.5673 0.5612 \n",
"36 4 adam 0.00010 8 3 256 256 plateau 0.10 0.5665 0.5610 \n",
"37 3 adam 0.00010 8 3 256 256 plateau 0.05 0.5665 0.5610 \n",
"38 49 adam 0.00001 8 3 256 256 huggingface_cosine_with_restarts 0.05 0.5706 0.5609 \n",
"39 75 adam 0.00001 16 3 256 256 plateau 0.05 0.5656 0.5606 \n",
"40 76 adam 0.00001 16 3 256 256 plateau 0.10 0.5656 0.5606 \n",
"41 42 adam 0.00010 16 5 256 256 huggingface_cosine_with_restarts 0.10 0.5666 0.5603 \n",
"42 89 adam 0.00001 16 5 256 256 huggingface_cosine_with_restarts 0.05 0.5616 0.5600 \n",
"43 10 adam 0.00010 8 4 256 256 huggingface_cosine_with_restarts 0.10 0.5651 0.5597 \n",
"44 70 adam 0.00001 8 5 256 512 huggingface_cosine_with_restarts 0.10 0.5691 0.5594 \n",
"45 34 adam 0.00010 16 4 256 256 huggingface_cosine_with_restarts 0.10 0.5650 0.5589 \n",
"46 47 adam 0.00010 16 5 256 512 plateau 0.05 0.5613 0.5587 \n",
"47 48 adam 0.00010 16 5 256 512 plateau 0.10 0.5613 0.5587 \n",
"48 39 adam 0.00010 16 4 256 512 plateau 0.05 0.5612 0.5586 \n",
"49 40 adam 0.00010 16 4 256 512 plateau 0.10 0.5612 0.5586 \n",
"\n",
" gap \n",
"0 -0.0008 \n",
"1 -0.0008 \n",
"2 0.0005 \n",
"3 0.0012 \n",
"4 0.0012 \n",
"5 -0.0006 \n",
"6 0.0071 \n",
"7 -0.0002 \n",
"8 0.0010 \n",
"9 0.0010 \n",
"10 -0.0035 \n",
"11 -0.0035 \n",
"12 0.0005 \n",
"13 0.0040 \n",
"14 0.0040 \n",
"15 0.0016 \n",
"16 0.0041 \n",
"17 -0.0009 \n",
"18 0.0011 \n",
"19 0.0005 \n",
"20 -0.0042 \n",
"21 0.0091 \n",
"22 0.0023 \n",
"23 0.0023 \n",
"24 0.0073 \n",
"25 0.0046 \n",
"26 0.0082 \n",
"27 0.0025 \n",
"28 0.0053 \n",
"29 -0.0036 \n",
"30 -0.0036 \n",
"31 0.0053 \n",
"32 0.0024 \n",
"33 0.0053 \n",
"34 0.0031 \n",
"35 0.0061 \n",
"36 0.0054 \n",
"37 0.0054 \n",
"38 0.0097 \n",
"39 0.0050 \n",
"40 0.0050 \n",
"41 0.0064 \n",
"42 0.0017 \n",
"43 0.0053 \n",
"44 0.0097 \n",
"45 0.0061 \n",
"46 0.0026 \n",
"47 0.0026 \n",
"48 0.0026 \n",
"49 0.0026 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/BiForm_wtb.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0d46eb44-6c34-4cc9-8322-6eb476eb827c",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" smoothing_probability | \n",
" model_name | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 45 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.0 | \n",
" BiFormer | \n",
" 0.5768 | \n",
" 0.5797 | \n",
" -0.0029 | \n",
"
\n",
" \n",
" 1 | \n",
" 49 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.2 | \n",
" BiFormer | \n",
" 0.5753 | \n",
" 0.5779 | \n",
" -0.0026 | \n",
"
\n",
" \n",
" 2 | \n",
" 47 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.1 | \n",
" BiFormer | \n",
" 0.5745 | \n",
" 0.5773 | \n",
" -0.0028 | \n",
"
\n",
" \n",
" 3 | \n",
" 67 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.0 | \n",
" BiFormer | \n",
" 0.5742 | \n",
" 0.5771 | \n",
" -0.0028 | \n",
"
\n",
" \n",
" 4 | \n",
" 25 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.1 | \n",
" BiFormer | \n",
" 0.5779 | \n",
" 0.5766 | \n",
" 0.0013 | \n",
"
\n",
" \n",
" 5 | \n",
" 27 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.2 | \n",
" BiFormer | \n",
" 0.5695 | \n",
" 0.5752 | \n",
" -0.0057 | \n",
"
\n",
" \n",
" 6 | \n",
" 71 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.2 | \n",
" BiFormer | \n",
" 0.5704 | \n",
" 0.5745 | \n",
" -0.0041 | \n",
"
\n",
" \n",
" 7 | \n",
" 69 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.1 | \n",
" BiFormer | \n",
" 0.5713 | \n",
" 0.5741 | \n",
" -0.0029 | \n",
"
\n",
" \n",
" 8 | \n",
" 51 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.3 | \n",
" BiFormer | \n",
" 0.5751 | \n",
" 0.5735 | \n",
" 0.0016 | \n",
"
\n",
" \n",
" 9 | \n",
" 77 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.5 | \n",
" BiFormer | \n",
" 0.5646 | \n",
" 0.5722 | \n",
" -0.0076 | \n",
"
\n",
" \n",
" 10 | \n",
" 55 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.5 | \n",
" BiFormer | \n",
" 0.5708 | \n",
" 0.5719 | \n",
" -0.0011 | \n",
"
\n",
" \n",
" 11 | \n",
" 29 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.3 | \n",
" BiFormer | \n",
" 0.5696 | \n",
" 0.5710 | \n",
" -0.0014 | \n",
"
\n",
" \n",
" 12 | \n",
" 73 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.3 | \n",
" BiFormer | \n",
" 0.5708 | \n",
" 0.5708 | \n",
" 0.0000 | \n",
"
\n",
" \n",
" 13 | \n",
" 11 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.5 | \n",
" BiFormer | \n",
" 0.5675 | \n",
" 0.5706 | \n",
" -0.0031 | \n",
"
\n",
" \n",
" 14 | \n",
" 31 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.4 | \n",
" BiFormer | \n",
" 0.5720 | \n",
" 0.5705 | \n",
" 0.0016 | \n",
"
\n",
" \n",
" 15 | \n",
" 75 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.4 | \n",
" BiFormer | \n",
" 0.5674 | \n",
" 0.5702 | \n",
" -0.0028 | \n",
"
\n",
" \n",
" 16 | \n",
" 3 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.1 | \n",
" BiFormer | \n",
" 0.5741 | \n",
" 0.5698 | \n",
" 0.0042 | \n",
"
\n",
" \n",
" 17 | \n",
" 1 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.0 | \n",
" BiFormer | \n",
" 0.5743 | \n",
" 0.5698 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 18 | \n",
" 23 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.0 | \n",
" BiFormer | \n",
" 0.5779 | \n",
" 0.5694 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 19 | \n",
" 53 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.4 | \n",
" BiFormer | \n",
" 0.5743 | \n",
" 0.5691 | \n",
" 0.0052 | \n",
"
\n",
" \n",
" 20 | \n",
" 50 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.2 | \n",
" BiFormerWithProb | \n",
" 0.5722 | \n",
" 0.5683 | \n",
" 0.0039 | \n",
"
\n",
" \n",
" 21 | \n",
" 72 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.2 | \n",
" BiFormerWithProb | \n",
" 0.5733 | \n",
" 0.5682 | \n",
" 0.0051 | \n",
"
\n",
" \n",
" 22 | \n",
" 52 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.3 | \n",
" BiFormerWithProb | \n",
" 0.5673 | \n",
" 0.5677 | \n",
" -0.0004 | \n",
"
\n",
" \n",
" 23 | \n",
" 57 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.6 | \n",
" BiFormer | \n",
" 0.5674 | \n",
" 0.5677 | \n",
" -0.0003 | \n",
"
\n",
" \n",
" 24 | \n",
" 26 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.1 | \n",
" BiFormerWithProb | \n",
" 0.5729 | \n",
" 0.5673 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 25 | \n",
" 74 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.3 | \n",
" BiFormerWithProb | \n",
" 0.5696 | \n",
" 0.5671 | \n",
" 0.0025 | \n",
"
\n",
" \n",
" 26 | \n",
" 24 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.0 | \n",
" BiFormerWithProb | \n",
" 0.5681 | \n",
" 0.5671 | \n",
" 0.0010 | \n",
"
\n",
" \n",
" 27 | \n",
" 79 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.6 | \n",
" BiFormer | \n",
" 0.5623 | \n",
" 0.5663 | \n",
" -0.0040 | \n",
"
\n",
" \n",
" 28 | \n",
" 5 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.2 | \n",
" BiFormer | \n",
" 0.5687 | \n",
" 0.5659 | \n",
" 0.0028 | \n",
"
\n",
" \n",
" 29 | \n",
" 81 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.7 | \n",
" BiFormer | \n",
" 0.5596 | \n",
" 0.5655 | \n",
" -0.0059 | \n",
"
\n",
" \n",
" 30 | \n",
" 28 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.2 | \n",
" BiFormerWithProb | \n",
" 0.5649 | \n",
" 0.5655 | \n",
" -0.0006 | \n",
"
\n",
" \n",
" 31 | \n",
" 70 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.1 | \n",
" BiFormerWithProb | \n",
" 0.5684 | \n",
" 0.5638 | \n",
" 0.0046 | \n",
"
\n",
" \n",
" 32 | \n",
" 59 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.7 | \n",
" BiFormer | \n",
" 0.5682 | \n",
" 0.5636 | \n",
" 0.0046 | \n",
"
\n",
" \n",
" 33 | \n",
" 13 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.6 | \n",
" BiFormer | \n",
" 0.5593 | \n",
" 0.5634 | \n",
" -0.0042 | \n",
"
\n",
" \n",
" 34 | \n",
" 4 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.1 | \n",
" BiFormerWithProb | \n",
" 0.5645 | \n",
" 0.5630 | \n",
" 0.0014 | \n",
"
\n",
" \n",
" 35 | \n",
" 9 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.4 | \n",
" BiFormer | \n",
" 0.5684 | \n",
" 0.5629 | \n",
" 0.0055 | \n",
"
\n",
" \n",
" 36 | \n",
" 17 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.8 | \n",
" BiFormer | \n",
" 0.5623 | \n",
" 0.5622 | \n",
" 0.0001 | \n",
"
\n",
" \n",
" 37 | \n",
" 68 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.0 | \n",
" BiFormerWithProb | \n",
" 0.5586 | \n",
" 0.5622 | \n",
" -0.0036 | \n",
"
\n",
" \n",
" 38 | \n",
" 15 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.7 | \n",
" BiFormer | \n",
" 0.5625 | \n",
" 0.5616 | \n",
" 0.0009 | \n",
"
\n",
" \n",
" 39 | \n",
" 7 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.3 | \n",
" BiFormer | \n",
" 0.5689 | \n",
" 0.5608 | \n",
" 0.0081 | \n",
"
\n",
" \n",
" 40 | \n",
" 37 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.7 | \n",
" BiFormer | \n",
" 0.5590 | \n",
" 0.5607 | \n",
" -0.0017 | \n",
"
\n",
" \n",
" 41 | \n",
" 10 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.4 | \n",
" BiFormerWithProb | \n",
" 0.5636 | \n",
" 0.5607 | \n",
" 0.0029 | \n",
"
\n",
" \n",
" 42 | \n",
" 33 | \n",
" 0.00010 | \n",
" 16 | \n",
" 0.5 | \n",
" BiFormer | \n",
" 0.5627 | \n",
" 0.5605 | \n",
" 0.0022 | \n",
"
\n",
" \n",
" 43 | \n",
" 61 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.8 | \n",
" BiFormer | \n",
" 0.5610 | \n",
" 0.5600 | \n",
" 0.0010 | \n",
"
\n",
" \n",
" 44 | \n",
" 76 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.4 | \n",
" BiFormerWithProb | \n",
" 0.5652 | \n",
" 0.5599 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 45 | \n",
" 19 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.9 | \n",
" BiFormer | \n",
" 0.5615 | \n",
" 0.5592 | \n",
" 0.0024 | \n",
"
\n",
" \n",
" 46 | \n",
" 63 | \n",
" 0.00001 | \n",
" 8 | \n",
" 0.9 | \n",
" BiFormer | \n",
" 0.5573 | \n",
" 0.5590 | \n",
" -0.0017 | \n",
"
\n",
" \n",
" 47 | \n",
" 83 | \n",
" 0.00001 | \n",
" 16 | \n",
" 0.8 | \n",
" BiFormer | \n",
" 0.5566 | \n",
" 0.5585 | \n",
" -0.0019 | \n",
"
\n",
" \n",
" 48 | \n",
" 8 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.3 | \n",
" BiFormerWithProb | \n",
" 0.5626 | \n",
" 0.5585 | \n",
" 0.0041 | \n",
"
\n",
" \n",
" 49 | \n",
" 6 | \n",
" 0.00010 | \n",
" 8 | \n",
" 0.2 | \n",
" BiFormerWithProb | \n",
" 0.5612 | \n",
" 0.5575 | \n",
" 0.0037 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads smoothing_probability model_name dev test gap\n",
"0 45 0.00001 8 0.0 BiFormer 0.5768 0.5797 -0.0029\n",
"1 49 0.00001 8 0.2 BiFormer 0.5753 0.5779 -0.0026\n",
"2 47 0.00001 8 0.1 BiFormer 0.5745 0.5773 -0.0028\n",
"3 67 0.00001 16 0.0 BiFormer 0.5742 0.5771 -0.0028\n",
"4 25 0.00010 16 0.1 BiFormer 0.5779 0.5766 0.0013\n",
"5 27 0.00010 16 0.2 BiFormer 0.5695 0.5752 -0.0057\n",
"6 71 0.00001 16 0.2 BiFormer 0.5704 0.5745 -0.0041\n",
"7 69 0.00001 16 0.1 BiFormer 0.5713 0.5741 -0.0029\n",
"8 51 0.00001 8 0.3 BiFormer 0.5751 0.5735 0.0016\n",
"9 77 0.00001 16 0.5 BiFormer 0.5646 0.5722 -0.0076\n",
"10 55 0.00001 8 0.5 BiFormer 0.5708 0.5719 -0.0011\n",
"11 29 0.00010 16 0.3 BiFormer 0.5696 0.5710 -0.0014\n",
"12 73 0.00001 16 0.3 BiFormer 0.5708 0.5708 0.0000\n",
"13 11 0.00010 8 0.5 BiFormer 0.5675 0.5706 -0.0031\n",
"14 31 0.00010 16 0.4 BiFormer 0.5720 0.5705 0.0016\n",
"15 75 0.00001 16 0.4 BiFormer 0.5674 0.5702 -0.0028\n",
"16 3 0.00010 8 0.1 BiFormer 0.5741 0.5698 0.0042\n",
"17 1 0.00010 8 0.0 BiFormer 0.5743 0.5698 0.0045\n",
"18 23 0.00010 16 0.0 BiFormer 0.5779 0.5694 0.0085\n",
"19 53 0.00001 8 0.4 BiFormer 0.5743 0.5691 0.0052\n",
"20 50 0.00001 8 0.2 BiFormerWithProb 0.5722 0.5683 0.0039\n",
"21 72 0.00001 16 0.2 BiFormerWithProb 0.5733 0.5682 0.0051\n",
"22 52 0.00001 8 0.3 BiFormerWithProb 0.5673 0.5677 -0.0004\n",
"23 57 0.00001 8 0.6 BiFormer 0.5674 0.5677 -0.0003\n",
"24 26 0.00010 16 0.1 BiFormerWithProb 0.5729 0.5673 0.0056\n",
"25 74 0.00001 16 0.3 BiFormerWithProb 0.5696 0.5671 0.0025\n",
"26 24 0.00010 16 0.0 BiFormerWithProb 0.5681 0.5671 0.0010\n",
"27 79 0.00001 16 0.6 BiFormer 0.5623 0.5663 -0.0040\n",
"28 5 0.00010 8 0.2 BiFormer 0.5687 0.5659 0.0028\n",
"29 81 0.00001 16 0.7 BiFormer 0.5596 0.5655 -0.0059\n",
"30 28 0.00010 16 0.2 BiFormerWithProb 0.5649 0.5655 -0.0006\n",
"31 70 0.00001 16 0.1 BiFormerWithProb 0.5684 0.5638 0.0046\n",
"32 59 0.00001 8 0.7 BiFormer 0.5682 0.5636 0.0046\n",
"33 13 0.00010 8 0.6 BiFormer 0.5593 0.5634 -0.0042\n",
"34 4 0.00010 8 0.1 BiFormerWithProb 0.5645 0.5630 0.0014\n",
"35 9 0.00010 8 0.4 BiFormer 0.5684 0.5629 0.0055\n",
"36 17 0.00010 8 0.8 BiFormer 0.5623 0.5622 0.0001\n",
"37 68 0.00001 16 0.0 BiFormerWithProb 0.5586 0.5622 -0.0036\n",
"38 15 0.00010 8 0.7 BiFormer 0.5625 0.5616 0.0009\n",
"39 7 0.00010 8 0.3 BiFormer 0.5689 0.5608 0.0081\n",
"40 37 0.00010 16 0.7 BiFormer 0.5590 0.5607 -0.0017\n",
"41 10 0.00010 8 0.4 BiFormerWithProb 0.5636 0.5607 0.0029\n",
"42 33 0.00010 16 0.5 BiFormer 0.5627 0.5605 0.0022\n",
"43 61 0.00001 8 0.8 BiFormer 0.5610 0.5600 0.0010\n",
"44 76 0.00001 16 0.4 BiFormerWithProb 0.5652 0.5599 0.0053\n",
"45 19 0.00010 8 0.9 BiFormer 0.5615 0.5592 0.0024\n",
"46 63 0.00001 8 0.9 BiFormer 0.5573 0.5590 -0.0017\n",
"47 83 0.00001 16 0.8 BiFormer 0.5566 0.5585 -0.0019\n",
"48 8 0.00010 8 0.3 BiFormerWithProb 0.5626 0.5585 0.0041\n",
"49 6 0.00010 8 0.2 BiFormerWithProb 0.5612 0.5575 0.0037"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/smoothing.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7b4e6f8c-4428-4c5c-a8ac-b1780372ee98",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" hidden_dim | \n",
" out_features | \n",
" mamba_d_state | \n",
" mamba_ker_size | \n",
" mamba_layer_number | \n",
" scheduler_type | \n",
" warmup_ratio | \n",
" model_name | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 76 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5742 | \n",
" 0.5765 | \n",
" -0.0024 | \n",
"
\n",
" \n",
" 1 | \n",
" 386 | \n",
" 0.00001 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5774 | \n",
" 0.5709 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 2 | \n",
" 260 | \n",
" 0.00001 | \n",
" 256 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5653 | \n",
" 0.5666 | \n",
" -0.0012 | \n",
"
\n",
" \n",
" 3 | \n",
" 237 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMamba | \n",
" 0.5612 | \n",
" 0.5660 | \n",
" -0.0048 | \n",
"
\n",
" \n",
" 4 | \n",
" 239 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMamba | \n",
" 0.5612 | \n",
" 0.5660 | \n",
" -0.0048 | \n",
"
\n",
" \n",
" 5 | \n",
" 131 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMamba | \n",
" 0.5622 | \n",
" 0.5649 | \n",
" -0.0026 | \n",
"
\n",
" \n",
" 6 | \n",
" 120 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 16 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5703 | \n",
" 0.5643 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 7 | \n",
" 118 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 16 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5703 | \n",
" 0.5643 | \n",
" 0.0060 | \n",
"
\n",
" \n",
" 8 | \n",
" 4 | \n",
" 0.00010 | \n",
" 256 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5676 | \n",
" 0.5643 | \n",
" 0.0033 | \n",
"
\n",
" \n",
" 9 | \n",
" 110 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5609 | \n",
" 0.5636 | \n",
" -0.0027 | \n",
"
\n",
" \n",
" 10 | \n",
" 112 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5609 | \n",
" 0.5636 | \n",
" -0.0027 | \n",
"
\n",
" \n",
" 11 | \n",
" 194 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5641 | \n",
" 0.5632 | \n",
" 0.0008 | \n",
"
\n",
" \n",
" 12 | \n",
" 388 | \n",
" 0.00001 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5701 | \n",
" 0.5630 | \n",
" 0.0071 | \n",
"
\n",
" \n",
" 13 | \n",
" 150 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5624 | \n",
" 0.5629 | \n",
" -0.0006 | \n",
"
\n",
" \n",
" 14 | \n",
" 152 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5624 | \n",
" 0.5629 | \n",
" -0.0006 | \n",
"
\n",
" \n",
" 15 | \n",
" 338 | \n",
" 0.00001 | \n",
" 256 | \n",
" 512 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5620 | \n",
" 0.5626 | \n",
" -0.0006 | \n",
"
\n",
" \n",
" 16 | \n",
" 340 | \n",
" 0.00001 | \n",
" 256 | \n",
" 512 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5645 | \n",
" 0.5626 | \n",
" 0.0019 | \n",
"
\n",
" \n",
" 17 | \n",
" 402 | \n",
" 0.00001 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5610 | \n",
" 0.5622 | \n",
" -0.0011 | \n",
"
\n",
" \n",
" 18 | \n",
" 146 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5647 | \n",
" 0.5613 | \n",
" 0.0034 | \n",
"
\n",
" \n",
" 19 | \n",
" 258 | \n",
" 0.00001 | \n",
" 256 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5634 | \n",
" 0.5612 | \n",
" 0.0022 | \n",
"
\n",
" \n",
" 20 | \n",
" 216 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5643 | \n",
" 0.5608 | \n",
" 0.0035 | \n",
"
\n",
" \n",
" 21 | \n",
" 214 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5643 | \n",
" 0.5608 | \n",
" 0.0035 | \n",
"
\n",
" \n",
" 22 | \n",
" 452 | \n",
" 0.00001 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5630 | \n",
" 0.5608 | \n",
" 0.0022 | \n",
"
\n",
" \n",
" 23 | \n",
" 404 | \n",
" 0.00001 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5614 | \n",
" 0.5607 | \n",
" 0.0007 | \n",
"
\n",
" \n",
" 24 | \n",
" 106 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5646 | \n",
" 0.5603 | \n",
" 0.0043 | \n",
"
\n",
" \n",
" 25 | \n",
" 450 | \n",
" 0.00001 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5668 | \n",
" 0.5602 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 26 | \n",
" 199 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMamba | \n",
" 0.5591 | \n",
" 0.5596 | \n",
" -0.0004 | \n",
"
\n",
" \n",
" 27 | \n",
" 226 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 16 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5654 | \n",
" 0.5596 | \n",
" 0.0058 | \n",
"
\n",
" \n",
" 28 | \n",
" 197 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMamba | \n",
" 0.5591 | \n",
" 0.5596 | \n",
" -0.0004 | \n",
"
\n",
" \n",
" 29 | \n",
" 25 | \n",
" 0.00010 | \n",
" 256 | \n",
" 256 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMamba | \n",
" 0.5569 | \n",
" 0.5591 | \n",
" -0.0022 | \n",
"
\n",
" \n",
" 30 | \n",
" 196 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5611 | \n",
" 0.5590 | \n",
" 0.0022 | \n",
"
\n",
" \n",
" 31 | \n",
" 212 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 8 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5700 | \n",
" 0.5590 | \n",
" 0.0109 | \n",
"
\n",
" \n",
" 32 | \n",
" 385 | \n",
" 0.00001 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMamba | \n",
" 0.5596 | \n",
" 0.5586 | \n",
" 0.0010 | \n",
"
\n",
" \n",
" 33 | \n",
" 46 | \n",
" 0.00010 | \n",
" 256 | \n",
" 256 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5638 | \n",
" 0.5581 | \n",
" 0.0057 | \n",
"
\n",
" \n",
" 34 | \n",
" 48 | \n",
" 0.00010 | \n",
" 256 | \n",
" 256 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5638 | \n",
" 0.5581 | \n",
" 0.0057 | \n",
"
\n",
" \n",
" 35 | \n",
" 242 | \n",
" 0.00010 | \n",
" 512 | \n",
" 512 | \n",
" 16 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5611 | \n",
" 0.5577 | \n",
" 0.0034 | \n",
"
\n",
" \n",
" 36 | \n",
" 166 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 16 | \n",
" 4 | \n",
" 3 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5596 | \n",
" 0.5577 | \n",
" 0.0020 | \n",
"
\n",
" \n",
" 37 | \n",
" 168 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 16 | \n",
" 4 | \n",
" 3 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5596 | \n",
" 0.5577 | \n",
" 0.0020 | \n",
"
\n",
" \n",
" 38 | \n",
" 162 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 16 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5621 | \n",
" 0.5574 | \n",
" 0.0047 | \n",
"
\n",
" \n",
" 39 | \n",
" 266 | \n",
" 0.00001 | \n",
" 256 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5543 | \n",
" 0.5573 | \n",
" -0.0030 | \n",
"
\n",
" \n",
" 40 | \n",
" 124 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5551 | \n",
" 0.5573 | \n",
" -0.0022 | \n",
"
\n",
" \n",
" 41 | \n",
" 53 | \n",
" 0.00010 | \n",
" 256 | \n",
" 256 | \n",
" 16 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.05 | \n",
" BiMamba | \n",
" 0.5577 | \n",
" 0.5572 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 42 | \n",
" 55 | \n",
" 0.00010 | \n",
" 256 | \n",
" 256 | \n",
" 16 | \n",
" 5 | \n",
" 3 | \n",
" plateau | \n",
" 0.10 | \n",
" BiMamba | \n",
" 0.5577 | \n",
" 0.5572 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 43 | \n",
" 116 | \n",
" 0.00010 | \n",
" 256 | \n",
" 512 | \n",
" 16 | \n",
" 5 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5582 | \n",
" 0.5567 | \n",
" 0.0015 | \n",
"
\n",
" \n",
" 44 | \n",
" 332 | \n",
" 0.00001 | \n",
" 256 | \n",
" 512 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5554 | \n",
" 0.5567 | \n",
" -0.0013 | \n",
"
\n",
" \n",
" 45 | \n",
" 2 | \n",
" 0.00010 | \n",
" 256 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5528 | \n",
" 0.5566 | \n",
" -0.0037 | \n",
"
\n",
" \n",
" 46 | \n",
" 130 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMambaWithProb | \n",
" 0.5636 | \n",
" 0.5565 | \n",
" 0.0071 | \n",
"
\n",
" \n",
" 47 | \n",
" 393 | \n",
" 0.00001 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.05 | \n",
" BiMamba | \n",
" 0.5592 | \n",
" 0.5561 | \n",
" 0.0032 | \n",
"
\n",
" \n",
" 48 | \n",
" 132 | \n",
" 0.00010 | \n",
" 512 | \n",
" 256 | \n",
" 8 | \n",
" 4 | \n",
" 3 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMambaWithProb | \n",
" 0.5630 | \n",
" 0.5559 | \n",
" 0.0071 | \n",
"
\n",
" \n",
" 49 | \n",
" 315 | \n",
" 0.00001 | \n",
" 256 | \n",
" 256 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" huggingface_cosine_with_restarts | \n",
" 0.10 | \n",
" BiMamba | \n",
" 0.5563 | \n",
" 0.5558 | \n",
" 0.0005 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr hidden_dim out_features mamba_d_state mamba_ker_size mamba_layer_number scheduler_type warmup_ratio \\\n",
"0 76 0.00010 256 512 8 4 4 huggingface_cosine_with_restarts 0.10 \n",
"1 386 0.00001 512 256 8 4 3 huggingface_cosine_with_restarts 0.05 \n",
"2 260 0.00001 256 256 8 4 3 huggingface_cosine_with_restarts 0.10 \n",
"3 237 0.00010 512 512 16 4 4 plateau 0.05 \n",
"4 239 0.00010 512 512 16 4 4 plateau 0.10 \n",
"5 131 0.00010 512 256 8 4 3 huggingface_cosine_with_restarts 0.10 \n",
"6 120 0.00010 256 512 16 5 3 plateau 0.10 \n",
"7 118 0.00010 256 512 16 5 3 plateau 0.05 \n",
"8 4 0.00010 256 256 8 4 3 huggingface_cosine_with_restarts 0.10 \n",
"9 110 0.00010 256 512 16 4 4 plateau 0.05 \n",
"10 112 0.00010 256 512 16 4 4 plateau 0.10 \n",
"11 194 0.00010 512 512 8 4 3 huggingface_cosine_with_restarts 0.05 \n",
"12 388 0.00001 512 256 8 4 3 huggingface_cosine_with_restarts 0.10 \n",
"13 150 0.00010 512 256 8 5 3 plateau 0.05 \n",
"14 152 0.00010 512 256 8 5 3 plateau 0.10 \n",
"15 338 0.00001 256 512 8 5 3 huggingface_cosine_with_restarts 0.05 \n",
"16 340 0.00001 256 512 8 5 3 huggingface_cosine_with_restarts 0.10 \n",
"17 402 0.00001 512 256 8 5 3 huggingface_cosine_with_restarts 0.05 \n",
"18 146 0.00010 512 256 8 5 3 huggingface_cosine_with_restarts 0.05 \n",
"19 258 0.00001 256 256 8 4 3 huggingface_cosine_with_restarts 0.05 \n",
"20 216 0.00010 512 512 8 5 3 plateau 0.10 \n",
"21 214 0.00010 512 512 8 5 3 plateau 0.05 \n",
"22 452 0.00001 512 512 8 4 3 huggingface_cosine_with_restarts 0.10 \n",
"23 404 0.00001 512 256 8 5 3 huggingface_cosine_with_restarts 0.10 \n",
"24 106 0.00010 256 512 16 4 4 huggingface_cosine_with_restarts 0.05 \n",
"25 450 0.00001 512 512 8 4 3 huggingface_cosine_with_restarts 0.05 \n",
"26 199 0.00010 512 512 8 4 3 plateau 0.10 \n",
"27 226 0.00010 512 512 16 4 3 huggingface_cosine_with_restarts 0.05 \n",
"28 197 0.00010 512 512 8 4 3 plateau 0.05 \n",
"29 25 0.00010 256 256 8 5 4 huggingface_cosine_with_restarts 0.05 \n",
"30 196 0.00010 512 512 8 4 3 huggingface_cosine_with_restarts 0.10 \n",
"31 212 0.00010 512 512 8 5 3 huggingface_cosine_with_restarts 0.10 \n",
"32 385 0.00001 512 256 8 4 3 huggingface_cosine_with_restarts 0.05 \n",
"33 46 0.00010 256 256 16 4 4 plateau 0.05 \n",
"34 48 0.00010 256 256 16 4 4 plateau 0.10 \n",
"35 242 0.00010 512 512 16 5 3 huggingface_cosine_with_restarts 0.05 \n",
"36 166 0.00010 512 256 16 4 3 plateau 0.05 \n",
"37 168 0.00010 512 256 16 4 3 plateau 0.10 \n",
"38 162 0.00010 512 256 16 4 3 huggingface_cosine_with_restarts 0.05 \n",
"39 266 0.00001 256 256 8 4 4 huggingface_cosine_with_restarts 0.05 \n",
"40 124 0.00010 256 512 16 5 4 huggingface_cosine_with_restarts 0.10 \n",
"41 53 0.00010 256 256 16 5 3 plateau 0.05 \n",
"42 55 0.00010 256 256 16 5 3 plateau 0.10 \n",
"43 116 0.00010 256 512 16 5 3 huggingface_cosine_with_restarts 0.10 \n",
"44 332 0.00001 256 512 8 4 4 huggingface_cosine_with_restarts 0.10 \n",
"45 2 0.00010 256 256 8 4 3 huggingface_cosine_with_restarts 0.05 \n",
"46 130 0.00010 512 256 8 4 3 huggingface_cosine_with_restarts 0.05 \n",
"47 393 0.00001 512 256 8 4 4 huggingface_cosine_with_restarts 0.05 \n",
"48 132 0.00010 512 256 8 4 3 huggingface_cosine_with_restarts 0.10 \n",
"49 315 0.00001 256 256 16 5 4 huggingface_cosine_with_restarts 0.10 \n",
"\n",
" model_name dev test gap \n",
"0 BiMambaWithProb 0.5742 0.5765 -0.0024 \n",
"1 BiMambaWithProb 0.5774 0.5709 0.0065 \n",
"2 BiMambaWithProb 0.5653 0.5666 -0.0012 \n",
"3 BiMamba 0.5612 0.5660 -0.0048 \n",
"4 BiMamba 0.5612 0.5660 -0.0048 \n",
"5 BiMamba 0.5622 0.5649 -0.0026 \n",
"6 BiMambaWithProb 0.5703 0.5643 0.0060 \n",
"7 BiMambaWithProb 0.5703 0.5643 0.0060 \n",
"8 BiMambaWithProb 0.5676 0.5643 0.0033 \n",
"9 BiMambaWithProb 0.5609 0.5636 -0.0027 \n",
"10 BiMambaWithProb 0.5609 0.5636 -0.0027 \n",
"11 BiMambaWithProb 0.5641 0.5632 0.0008 \n",
"12 BiMambaWithProb 0.5701 0.5630 0.0071 \n",
"13 BiMambaWithProb 0.5624 0.5629 -0.0006 \n",
"14 BiMambaWithProb 0.5624 0.5629 -0.0006 \n",
"15 BiMambaWithProb 0.5620 0.5626 -0.0006 \n",
"16 BiMambaWithProb 0.5645 0.5626 0.0019 \n",
"17 BiMambaWithProb 0.5610 0.5622 -0.0011 \n",
"18 BiMambaWithProb 0.5647 0.5613 0.0034 \n",
"19 BiMambaWithProb 0.5634 0.5612 0.0022 \n",
"20 BiMambaWithProb 0.5643 0.5608 0.0035 \n",
"21 BiMambaWithProb 0.5643 0.5608 0.0035 \n",
"22 BiMambaWithProb 0.5630 0.5608 0.0022 \n",
"23 BiMambaWithProb 0.5614 0.5607 0.0007 \n",
"24 BiMambaWithProb 0.5646 0.5603 0.0043 \n",
"25 BiMambaWithProb 0.5668 0.5602 0.0066 \n",
"26 BiMamba 0.5591 0.5596 -0.0004 \n",
"27 BiMambaWithProb 0.5654 0.5596 0.0058 \n",
"28 BiMamba 0.5591 0.5596 -0.0004 \n",
"29 BiMamba 0.5569 0.5591 -0.0022 \n",
"30 BiMambaWithProb 0.5611 0.5590 0.0022 \n",
"31 BiMambaWithProb 0.5700 0.5590 0.0109 \n",
"32 BiMamba 0.5596 0.5586 0.0010 \n",
"33 BiMambaWithProb 0.5638 0.5581 0.0057 \n",
"34 BiMambaWithProb 0.5638 0.5581 0.0057 \n",
"35 BiMambaWithProb 0.5611 0.5577 0.0034 \n",
"36 BiMambaWithProb 0.5596 0.5577 0.0020 \n",
"37 BiMambaWithProb 0.5596 0.5577 0.0020 \n",
"38 BiMambaWithProb 0.5621 0.5574 0.0047 \n",
"39 BiMambaWithProb 0.5543 0.5573 -0.0030 \n",
"40 BiMambaWithProb 0.5551 0.5573 -0.0022 \n",
"41 BiMamba 0.5577 0.5572 0.0005 \n",
"42 BiMamba 0.5577 0.5572 0.0005 \n",
"43 BiMambaWithProb 0.5582 0.5567 0.0015 \n",
"44 BiMambaWithProb 0.5554 0.5567 -0.0013 \n",
"45 BiMambaWithProb 0.5528 0.5566 -0.0037 \n",
"46 BiMambaWithProb 0.5636 0.5565 0.0071 \n",
"47 BiMamba 0.5592 0.5561 0.0032 \n",
"48 BiMambaWithProb 0.5630 0.5559 0.0071 \n",
"49 BiMamba 0.5563 0.5558 0.0005 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/mambas.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "45434e0f-2af9-491a-bfc9-54cc3a91fca9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" hidden_dim_gated | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 13 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 128 | \n",
" 0.5418 | \n",
" 0.5439 | \n",
" -0.0022 | \n",
"
\n",
" \n",
" 1 | \n",
" 11 | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 0.5419 | \n",
" 0.5430 | \n",
" -0.0010 | \n",
"
\n",
" \n",
" 2 | \n",
" 16 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 128 | \n",
" 0.5432 | \n",
" 0.5426 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 3 | \n",
" 12 | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 512 | \n",
" 0.5521 | \n",
" 0.5407 | \n",
" 0.0114 | \n",
"
\n",
" \n",
" 4 | \n",
" 7 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 128 | \n",
" 0.5445 | \n",
" 0.5395 | \n",
" 0.0050 | \n",
"
\n",
" \n",
" 5 | \n",
" 1 | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 128 | \n",
" 0.5370 | \n",
" 0.5394 | \n",
" -0.0025 | \n",
"
\n",
" \n",
" 6 | \n",
" 4 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 128 | \n",
" 0.5426 | \n",
" 0.5368 | \n",
" 0.0058 | \n",
"
\n",
" \n",
" 7 | \n",
" 5 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 0.5437 | \n",
" 0.5368 | \n",
" 0.0070 | \n",
"
\n",
" \n",
" 8 | \n",
" 2 | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 0.5346 | \n",
" 0.5366 | \n",
" -0.0020 | \n",
"
\n",
" \n",
" 9 | \n",
" 10 | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 128 | \n",
" 0.5322 | \n",
" 0.5347 | \n",
" -0.0025 | \n",
"
\n",
" \n",
" 10 | \n",
" 9 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 512 | \n",
" 0.5438 | \n",
" 0.5343 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 11 | \n",
" 8 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 0.5419 | \n",
" 0.5343 | \n",
" 0.0076 | \n",
"
\n",
" \n",
" 12 | \n",
" 15 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 512 | \n",
" 0.5346 | \n",
" 0.5319 | \n",
" 0.0027 | \n",
"
\n",
" \n",
" 13 | \n",
" 6 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 512 | \n",
" 0.5338 | \n",
" 0.5301 | \n",
" 0.0037 | \n",
"
\n",
" \n",
" 14 | \n",
" 18 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 512 | \n",
" 0.5287 | \n",
" 0.5285 | \n",
" 0.0002 | \n",
"
\n",
" \n",
" 15 | \n",
" 3 | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 512 | \n",
" 0.5354 | \n",
" 0.5278 | \n",
" 0.0076 | \n",
"
\n",
" \n",
" 16 | \n",
" 14 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 0.5335 | \n",
" 0.5272 | \n",
" 0.0063 | \n",
"
\n",
" \n",
" 17 | \n",
" 17 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 0.5216 | \n",
" 0.5137 | \n",
" 0.0080 | \n",
"
\n",
" \n",
" 18 | \n",
" 25 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 128 | \n",
" 0.5141 | \n",
" 0.5038 | \n",
" 0.0104 | \n",
"
\n",
" \n",
" 19 | \n",
" 29 | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 256 | \n",
" 0.4945 | \n",
" 0.4995 | \n",
" -0.0049 | \n",
"
\n",
" \n",
" 20 | \n",
" 28 | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 128 | \n",
" 0.5015 | \n",
" 0.4964 | \n",
" 0.0051 | \n",
"
\n",
" \n",
" 21 | \n",
" 22 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 128 | \n",
" 0.4993 | \n",
" 0.4952 | \n",
" 0.0042 | \n",
"
\n",
" \n",
" 22 | \n",
" 21 | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 512 | \n",
" 0.4902 | \n",
" 0.4831 | \n",
" 0.0071 | \n",
"
\n",
" \n",
" 23 | \n",
" 34 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 128 | \n",
" 0.4821 | \n",
" 0.4795 | \n",
" 0.0026 | \n",
"
\n",
" \n",
" 24 | \n",
" 19 | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 128 | \n",
" 0.4894 | \n",
" 0.4793 | \n",
" 0.0101 | \n",
"
\n",
" \n",
" 25 | \n",
" 20 | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 256 | \n",
" 0.4756 | \n",
" 0.4779 | \n",
" -0.0023 | \n",
"
\n",
" \n",
" 26 | \n",
" 24 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 512 | \n",
" 0.4767 | \n",
" 0.4777 | \n",
" -0.0010 | \n",
"
\n",
" \n",
" 27 | \n",
" 30 | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 512 | \n",
" 0.4862 | \n",
" 0.4768 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 28 | \n",
" 26 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 256 | \n",
" 0.4815 | \n",
" 0.4745 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 29 | \n",
" 23 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 256 | \n",
" 0.4741 | \n",
" 0.4739 | \n",
" 0.0003 | \n",
"
\n",
" \n",
" 30 | \n",
" 31 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 128 | \n",
" 0.4715 | \n",
" 0.4705 | \n",
" 0.0011 | \n",
"
\n",
" \n",
" 31 | \n",
" 36 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 512 | \n",
" 0.4669 | \n",
" 0.4627 | \n",
" 0.0042 | \n",
"
\n",
" \n",
" 32 | \n",
" 32 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 256 | \n",
" 0.4610 | \n",
" 0.4606 | \n",
" 0.0003 | \n",
"
\n",
" \n",
" 33 | \n",
" 33 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 512 | \n",
" 0.4530 | \n",
" 0.4529 | \n",
" 0.0001 | \n",
"
\n",
" \n",
" 34 | \n",
" 27 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 512 | \n",
" 0.4569 | \n",
" 0.4477 | \n",
" 0.0092 | \n",
"
\n",
" \n",
" 35 | \n",
" 35 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 256 | \n",
" 0.4448 | \n",
" 0.4422 | \n",
" 0.0025 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number hidden_dim_gated dev test gap\n",
"0 13 0.00010 16 4 128 0.5418 0.5439 -0.0022\n",
"1 11 0.00010 16 3 256 0.5419 0.5430 -0.0010\n",
"2 16 0.00010 16 5 128 0.5432 0.5426 0.0005\n",
"3 12 0.00010 16 3 512 0.5521 0.5407 0.0114\n",
"4 7 0.00010 8 5 128 0.5445 0.5395 0.0050\n",
"5 1 0.00010 8 3 128 0.5370 0.5394 -0.0025\n",
"6 4 0.00010 8 4 128 0.5426 0.5368 0.0058\n",
"7 5 0.00010 8 4 256 0.5437 0.5368 0.0070\n",
"8 2 0.00010 8 3 256 0.5346 0.5366 -0.0020\n",
"9 10 0.00010 16 3 128 0.5322 0.5347 -0.0025\n",
"10 9 0.00010 8 5 512 0.5438 0.5343 0.0094\n",
"11 8 0.00010 8 5 256 0.5419 0.5343 0.0076\n",
"12 15 0.00010 16 4 512 0.5346 0.5319 0.0027\n",
"13 6 0.00010 8 4 512 0.5338 0.5301 0.0037\n",
"14 18 0.00010 16 5 512 0.5287 0.5285 0.0002\n",
"15 3 0.00010 8 3 512 0.5354 0.5278 0.0076\n",
"16 14 0.00010 16 4 256 0.5335 0.5272 0.0063\n",
"17 17 0.00010 16 5 256 0.5216 0.5137 0.0080\n",
"18 25 0.00001 8 5 128 0.5141 0.5038 0.0104\n",
"19 29 0.00001 16 3 256 0.4945 0.4995 -0.0049\n",
"20 28 0.00001 16 3 128 0.5015 0.4964 0.0051\n",
"21 22 0.00001 8 4 128 0.4993 0.4952 0.0042\n",
"22 21 0.00001 8 3 512 0.4902 0.4831 0.0071\n",
"23 34 0.00001 16 5 128 0.4821 0.4795 0.0026\n",
"24 19 0.00001 8 3 128 0.4894 0.4793 0.0101\n",
"25 20 0.00001 8 3 256 0.4756 0.4779 -0.0023\n",
"26 24 0.00001 8 4 512 0.4767 0.4777 -0.0010\n",
"27 30 0.00001 16 3 512 0.4862 0.4768 0.0094\n",
"28 26 0.00001 8 5 256 0.4815 0.4745 0.0069\n",
"29 23 0.00001 8 4 256 0.4741 0.4739 0.0003\n",
"30 31 0.00001 16 4 128 0.4715 0.4705 0.0011\n",
"31 36 0.00001 16 5 512 0.4669 0.4627 0.0042\n",
"32 32 0.00001 16 4 256 0.4610 0.4606 0.0003\n",
"33 33 0.00001 16 4 512 0.4530 0.4529 0.0001\n",
"34 27 0.00001 8 5 512 0.4569 0.4477 0.0092\n",
"35 35 0.00001 16 5 256 0.4448 0.4422 0.0025"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/bigated.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "041a82f6-a5db-4033-8e89-f4ecc24479f1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" num_graph_heads | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 35 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 0.5798 | \n",
" 0.5774 | \n",
" 0.0023 | \n",
"
\n",
" \n",
" 1 | \n",
" 11 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 0.5744 | \n",
" 0.5701 | \n",
" 0.0042 | \n",
"
\n",
" \n",
" 2 | \n",
" 45 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 2 | \n",
" 0.5693 | \n",
" 0.5699 | \n",
" -0.0005 | \n",
"
\n",
" \n",
" 3 | \n",
" 25 | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 2 | \n",
" 0.5726 | \n",
" 0.5695 | \n",
" 0.0031 | \n",
"
\n",
" \n",
" 4 | \n",
" 26 | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 4 | \n",
" 0.5737 | \n",
" 0.5688 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 5 | \n",
" 47 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 0.5772 | \n",
" 0.5688 | \n",
" 0.0084 | \n",
"
\n",
" \n",
" 6 | \n",
" 23 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 0.5687 | \n",
" 0.5675 | \n",
" 0.0012 | \n",
"
\n",
" \n",
" 7 | \n",
" 9 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 2 | \n",
" 0.5688 | \n",
" 0.5669 | \n",
" 0.0020 | \n",
"
\n",
" \n",
" 8 | \n",
" 17 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 2 | \n",
" 0.5685 | \n",
" 0.5665 | \n",
" 0.0020 | \n",
"
\n",
" \n",
" 9 | \n",
" 31 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 8 | \n",
" 0.5677 | \n",
" 0.5653 | \n",
" 0.0024 | \n",
"
\n",
" \n",
" 10 | \n",
" 43 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 8 | \n",
" 0.5654 | \n",
" 0.5646 | \n",
" 0.0008 | \n",
"
\n",
" \n",
" 11 | \n",
" 14 | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 4 | \n",
" 0.5621 | \n",
" 0.5644 | \n",
" -0.0022 | \n",
"
\n",
" \n",
" 12 | \n",
" 42 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" 0.5730 | \n",
" 0.5640 | \n",
" 0.0090 | \n",
"
\n",
" \n",
" 13 | \n",
" 28 | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 16 | \n",
" 0.5657 | \n",
" 0.5638 | \n",
" 0.0019 | \n",
"
\n",
" \n",
" 14 | \n",
" 5 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 2 | \n",
" 0.5722 | \n",
" 0.5635 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 15 | \n",
" 30 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" 0.5729 | \n",
" 0.5634 | \n",
" 0.0095 | \n",
"
\n",
" \n",
" 16 | \n",
" 33 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 2 | \n",
" 0.5625 | \n",
" 0.5622 | \n",
" 0.0004 | \n",
"
\n",
" \n",
" 17 | \n",
" 37 | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 2 | \n",
" 0.5692 | \n",
" 0.5622 | \n",
" 0.0070 | \n",
"
\n",
" \n",
" 18 | \n",
" 46 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 0.5730 | \n",
" 0.5615 | \n",
" 0.0114 | \n",
"
\n",
" \n",
" 19 | \n",
" 38 | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 4 | \n",
" 0.5660 | \n",
" 0.5611 | \n",
" 0.0049 | \n",
"
\n",
" \n",
" 20 | \n",
" 8 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 16 | \n",
" 0.5647 | \n",
" 0.5609 | \n",
" 0.0038 | \n",
"
\n",
" \n",
" 21 | \n",
" 3 | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 8 | \n",
" 0.5685 | \n",
" 0.5606 | \n",
" 0.0079 | \n",
"
\n",
" \n",
" 22 | \n",
" 40 | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 16 | \n",
" 0.5664 | \n",
" 0.5605 | \n",
" 0.0059 | \n",
"
\n",
" \n",
" 23 | \n",
" 44 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 16 | \n",
" 0.5669 | \n",
" 0.5603 | \n",
" 0.0066 | \n",
"
\n",
" \n",
" 24 | \n",
" 15 | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 8 | \n",
" 0.5635 | \n",
" 0.5596 | \n",
" 0.0040 | \n",
"
\n",
" \n",
" 25 | \n",
" 10 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 0.5706 | \n",
" 0.5596 | \n",
" 0.0110 | \n",
"
\n",
" \n",
" 26 | \n",
" 29 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 2 | \n",
" 0.5661 | \n",
" 0.5591 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 27 | \n",
" 6 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" 0.5625 | \n",
" 0.5589 | \n",
" 0.0036 | \n",
"
\n",
" \n",
" 28 | \n",
" 20 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 16 | \n",
" 0.5642 | \n",
" 0.5586 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 29 | \n",
" 13 | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 2 | \n",
" 0.5631 | \n",
" 0.5584 | \n",
" 0.0047 | \n",
"
\n",
" \n",
" 30 | \n",
" 21 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 2 | \n",
" 0.5610 | \n",
" 0.5581 | \n",
" 0.0028 | \n",
"
\n",
" \n",
" 31 | \n",
" 1 | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 2 | \n",
" 0.5585 | \n",
" 0.5580 | \n",
" 0.0005 | \n",
"
\n",
" \n",
" 32 | \n",
" 32 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 16 | \n",
" 0.5624 | \n",
" 0.5579 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 33 | \n",
" 7 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 8 | \n",
" 0.5631 | \n",
" 0.5577 | \n",
" 0.0054 | \n",
"
\n",
" \n",
" 34 | \n",
" 41 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 2 | \n",
" 0.5661 | \n",
" 0.5575 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 35 | \n",
" 16 | \n",
" 0.00010 | \n",
" 16 | \n",
" 3 | \n",
" 16 | \n",
" 0.5681 | \n",
" 0.5575 | \n",
" 0.0106 | \n",
"
\n",
" \n",
" 36 | \n",
" 27 | \n",
" 0.00001 | \n",
" 8 | \n",
" 3 | \n",
" 8 | \n",
" 0.5553 | \n",
" 0.5563 | \n",
" -0.0010 | \n",
"
\n",
" \n",
" 37 | \n",
" 22 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 0.5629 | \n",
" 0.5547 | \n",
" 0.0082 | \n",
"
\n",
" \n",
" 38 | \n",
" 24 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 16 | \n",
" 0.5571 | \n",
" 0.5545 | \n",
" 0.0027 | \n",
"
\n",
" \n",
" 39 | \n",
" 19 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 8 | \n",
" 0.5590 | \n",
" 0.5537 | \n",
" 0.0053 | \n",
"
\n",
" \n",
" 40 | \n",
" 18 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" 0.5645 | \n",
" 0.5537 | \n",
" 0.0108 | \n",
"
\n",
" \n",
" 41 | \n",
" 39 | \n",
" 0.00001 | \n",
" 16 | \n",
" 3 | \n",
" 8 | \n",
" 0.5542 | \n",
" 0.5535 | \n",
" 0.0006 | \n",
"
\n",
" \n",
" 42 | \n",
" 12 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 16 | \n",
" 0.5597 | \n",
" 0.5534 | \n",
" 0.0063 | \n",
"
\n",
" \n",
" 43 | \n",
" 48 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 16 | \n",
" 0.5542 | \n",
" 0.5533 | \n",
" 0.0009 | \n",
"
\n",
" \n",
" 44 | \n",
" 36 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 16 | \n",
" 0.5593 | \n",
" 0.5530 | \n",
" 0.0063 | \n",
"
\n",
" \n",
" 45 | \n",
" 2 | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 4 | \n",
" 0.5640 | \n",
" 0.5526 | \n",
" 0.0114 | \n",
"
\n",
" \n",
" 46 | \n",
" 4 | \n",
" 0.00010 | \n",
" 8 | \n",
" 3 | \n",
" 16 | \n",
" 0.5556 | \n",
" 0.5508 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 47 | \n",
" 34 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 0.5653 | \n",
" 0.5508 | \n",
" 0.0145 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number num_graph_heads dev test gap\n",
"0 35 0.00001 8 5 8 0.5798 0.5774 0.0023\n",
"1 11 0.00010 8 5 8 0.5744 0.5701 0.0042\n",
"2 45 0.00001 16 5 2 0.5693 0.5699 -0.0005\n",
"3 25 0.00001 8 3 2 0.5726 0.5695 0.0031\n",
"4 26 0.00001 8 3 4 0.5737 0.5688 0.0049\n",
"5 47 0.00001 16 5 8 0.5772 0.5688 0.0084\n",
"6 23 0.00010 16 5 8 0.5687 0.5675 0.0012\n",
"7 9 0.00010 8 5 2 0.5688 0.5669 0.0020\n",
"8 17 0.00010 16 4 2 0.5685 0.5665 0.0020\n",
"9 31 0.00001 8 4 8 0.5677 0.5653 0.0024\n",
"10 43 0.00001 16 4 8 0.5654 0.5646 0.0008\n",
"11 14 0.00010 16 3 4 0.5621 0.5644 -0.0022\n",
"12 42 0.00001 16 4 4 0.5730 0.5640 0.0090\n",
"13 28 0.00001 8 3 16 0.5657 0.5638 0.0019\n",
"14 5 0.00010 8 4 2 0.5722 0.5635 0.0086\n",
"15 30 0.00001 8 4 4 0.5729 0.5634 0.0095\n",
"16 33 0.00001 8 5 2 0.5625 0.5622 0.0004\n",
"17 37 0.00001 16 3 2 0.5692 0.5622 0.0070\n",
"18 46 0.00001 16 5 4 0.5730 0.5615 0.0114\n",
"19 38 0.00001 16 3 4 0.5660 0.5611 0.0049\n",
"20 8 0.00010 8 4 16 0.5647 0.5609 0.0038\n",
"21 3 0.00010 8 3 8 0.5685 0.5606 0.0079\n",
"22 40 0.00001 16 3 16 0.5664 0.5605 0.0059\n",
"23 44 0.00001 16 4 16 0.5669 0.5603 0.0066\n",
"24 15 0.00010 16 3 8 0.5635 0.5596 0.0040\n",
"25 10 0.00010 8 5 4 0.5706 0.5596 0.0110\n",
"26 29 0.00001 8 4 2 0.5661 0.5591 0.0069\n",
"27 6 0.00010 8 4 4 0.5625 0.5589 0.0036\n",
"28 20 0.00010 16 4 16 0.5642 0.5586 0.0056\n",
"29 13 0.00010 16 3 2 0.5631 0.5584 0.0047\n",
"30 21 0.00010 16 5 2 0.5610 0.5581 0.0028\n",
"31 1 0.00010 8 3 2 0.5585 0.5580 0.0005\n",
"32 32 0.00001 8 4 16 0.5624 0.5579 0.0045\n",
"33 7 0.00010 8 4 8 0.5631 0.5577 0.0054\n",
"34 41 0.00001 16 4 2 0.5661 0.5575 0.0086\n",
"35 16 0.00010 16 3 16 0.5681 0.5575 0.0106\n",
"36 27 0.00001 8 3 8 0.5553 0.5563 -0.0010\n",
"37 22 0.00010 16 5 4 0.5629 0.5547 0.0082\n",
"38 24 0.00010 16 5 16 0.5571 0.5545 0.0027\n",
"39 19 0.00010 16 4 8 0.5590 0.5537 0.0053\n",
"40 18 0.00010 16 4 4 0.5645 0.5537 0.0108\n",
"41 39 0.00001 16 3 8 0.5542 0.5535 0.0006\n",
"42 12 0.00010 8 5 16 0.5597 0.5534 0.0063\n",
"43 48 0.00001 16 5 16 0.5542 0.5533 0.0009\n",
"44 36 0.00001 8 5 16 0.5593 0.5530 0.0063\n",
"45 2 0.00010 8 3 4 0.5640 0.5526 0.0114\n",
"46 4 0.00010 8 3 16 0.5556 0.5508 0.0048\n",
"47 34 0.00001 8 5 4 0.5653 0.5508 0.0145"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/bigraph.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "42ebb696-9e6d-4ba1-acd4-b8802ecf0fac",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" num_graph_heads | \n",
" hidden_dim_gated | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 50 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5805 | \n",
" 0.5756 | \n",
" 0.0048 | \n",
"
\n",
" \n",
" 1 | \n",
" 24 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 8 | \n",
" 512 | \n",
" 0.5723 | \n",
" 0.5700 | \n",
" 0.0024 | \n",
"
\n",
" \n",
" 2 | \n",
" 67 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 128 | \n",
" 0.5667 | \n",
" 0.5687 | \n",
" -0.0019 | \n",
"
\n",
" \n",
" 3 | \n",
" 12 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 512 | \n",
" 0.5645 | \n",
" 0.5685 | \n",
" -0.0040 | \n",
"
\n",
" \n",
" 4 | \n",
" 32 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5658 | \n",
" 0.5678 | \n",
" -0.0020 | \n",
"
\n",
" \n",
" 5 | \n",
" 64 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 128 | \n",
" 0.5662 | \n",
" 0.5676 | \n",
" -0.0014 | \n",
"
\n",
" \n",
" 6 | \n",
" 68 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5779 | \n",
" 0.5669 | \n",
" 0.0110 | \n",
"
\n",
" \n",
" 7 | \n",
" 55 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" 128 | \n",
" 0.5706 | \n",
" 0.5669 | \n",
" 0.0037 | \n",
"
\n",
" \n",
" 8 | \n",
" 49 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 128 | \n",
" 0.5642 | \n",
" 0.5659 | \n",
" -0.0018 | \n",
"
\n",
" \n",
" 9 | \n",
" 31 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 128 | \n",
" 0.5750 | \n",
" 0.5654 | \n",
" 0.0095 | \n",
"
\n",
" \n",
" 10 | \n",
" 23 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 8 | \n",
" 256 | \n",
" 0.5716 | \n",
" 0.5651 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 11 | \n",
" 22 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 8 | \n",
" 128 | \n",
" 0.5743 | \n",
" 0.5650 | \n",
" 0.0093 | \n",
"
\n",
" \n",
" 12 | \n",
" 53 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 16 | \n",
" 256 | \n",
" 0.5732 | \n",
" 0.5633 | \n",
" 0.0099 | \n",
"
\n",
" \n",
" 13 | \n",
" 26 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 16 | \n",
" 256 | \n",
" 0.5636 | \n",
" 0.5630 | \n",
" 0.0006 | \n",
"
\n",
" \n",
" 14 | \n",
" 6 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 8 | \n",
" 512 | \n",
" 0.5624 | \n",
" 0.5626 | \n",
" -0.0002 | \n",
"
\n",
" \n",
" 15 | \n",
" 10 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 128 | \n",
" 0.5655 | \n",
" 0.5625 | \n",
" 0.0029 | \n",
"
\n",
" \n",
" 16 | \n",
" 70 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 16 | \n",
" 128 | \n",
" 0.5618 | \n",
" 0.5616 | \n",
" 0.0002 | \n",
"
\n",
" \n",
" 17 | \n",
" 36 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 16 | \n",
" 512 | \n",
" 0.5753 | \n",
" 0.5610 | \n",
" 0.0143 | \n",
"
\n",
" \n",
" 18 | \n",
" 27 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 16 | \n",
" 512 | \n",
" 0.5587 | \n",
" 0.5609 | \n",
" -0.0022 | \n",
"
\n",
" \n",
" 19 | \n",
" 46 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 128 | \n",
" 0.5660 | \n",
" 0.5608 | \n",
" 0.0052 | \n",
"
\n",
" \n",
" 20 | \n",
" 19 | \n",
" 0.00010 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" 128 | \n",
" 0.5616 | \n",
" 0.5604 | \n",
" 0.0012 | \n",
"
\n",
" \n",
" 21 | \n",
" 44 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 16 | \n",
" 256 | \n",
" 0.5686 | \n",
" 0.5600 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 22 | \n",
" 35 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 16 | \n",
" 256 | \n",
" 0.5643 | \n",
" 0.5598 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 23 | \n",
" 13 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 128 | \n",
" 0.5621 | \n",
" 0.5596 | \n",
" 0.0025 | \n",
"
\n",
" \n",
" 24 | \n",
" 71 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 16 | \n",
" 256 | \n",
" 0.5667 | \n",
" 0.5589 | \n",
" 0.0078 | \n",
"
\n",
" \n",
" 25 | \n",
" 69 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 512 | \n",
" 0.5638 | \n",
" 0.5586 | \n",
" 0.0052 | \n",
"
\n",
" \n",
" 26 | \n",
" 7 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 16 | \n",
" 128 | \n",
" 0.5626 | \n",
" 0.5579 | \n",
" 0.0047 | \n",
"
\n",
" \n",
" 27 | \n",
" 30 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 512 | \n",
" 0.5619 | \n",
" 0.5578 | \n",
" 0.0041 | \n",
"
\n",
" \n",
" 28 | \n",
" 51 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 512 | \n",
" 0.5627 | \n",
" 0.5577 | \n",
" 0.0051 | \n",
"
\n",
" \n",
" 29 | \n",
" 52 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 16 | \n",
" 128 | \n",
" 0.5581 | \n",
" 0.5574 | \n",
" 0.0007 | \n",
"
\n",
" \n",
" 30 | \n",
" 37 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" 128 | \n",
" 0.5608 | \n",
" 0.5572 | \n",
" 0.0036 | \n",
"
\n",
" \n",
" 31 | \n",
" 4 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 8 | \n",
" 128 | \n",
" 0.5659 | \n",
" 0.5571 | \n",
" 0.0088 | \n",
"
\n",
" \n",
" 32 | \n",
" 14 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5634 | \n",
" 0.5571 | \n",
" 0.0064 | \n",
"
\n",
" \n",
" 33 | \n",
" 38 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" 256 | \n",
" 0.5611 | \n",
" 0.5569 | \n",
" 0.0043 | \n",
"
\n",
" \n",
" 34 | \n",
" 57 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 4 | \n",
" 512 | \n",
" 0.5626 | \n",
" 0.5569 | \n",
" 0.0058 | \n",
"
\n",
" \n",
" 35 | \n",
" 18 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 16 | \n",
" 512 | \n",
" 0.5555 | \n",
" 0.5568 | \n",
" -0.0013 | \n",
"
\n",
" \n",
" 36 | \n",
" 62 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 16 | \n",
" 256 | \n",
" 0.5602 | \n",
" 0.5565 | \n",
" 0.0038 | \n",
"
\n",
" \n",
" 37 | \n",
" 16 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 16 | \n",
" 128 | \n",
" 0.5632 | \n",
" 0.5564 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 38 | \n",
" 59 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 8 | \n",
" 256 | \n",
" 0.5695 | \n",
" 0.5563 | \n",
" 0.0133 | \n",
"
\n",
" \n",
" 39 | \n",
" 47 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 256 | \n",
" 0.5643 | \n",
" 0.5557 | \n",
" 0.0086 | \n",
"
\n",
" \n",
" 40 | \n",
" 2 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" 256 | \n",
" 0.5589 | \n",
" 0.5555 | \n",
" 0.0034 | \n",
"
\n",
" \n",
" 41 | \n",
" 28 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 128 | \n",
" 0.5639 | \n",
" 0.5555 | \n",
" 0.0085 | \n",
"
\n",
" \n",
" 42 | \n",
" 43 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 16 | \n",
" 128 | \n",
" 0.5659 | \n",
" 0.5554 | \n",
" 0.0105 | \n",
"
\n",
" \n",
" 43 | \n",
" 58 | \n",
" 0.00001 | \n",
" 16 | \n",
" 4 | \n",
" 8 | \n",
" 128 | \n",
" 0.5630 | \n",
" 0.5554 | \n",
" 0.0077 | \n",
"
\n",
" \n",
" 44 | \n",
" 1 | \n",
" 0.00010 | \n",
" 8 | \n",
" 4 | \n",
" 4 | \n",
" 128 | \n",
" 0.5633 | \n",
" 0.5553 | \n",
" 0.0080 | \n",
"
\n",
" \n",
" 45 | \n",
" 65 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 256 | \n",
" 0.5584 | \n",
" 0.5552 | \n",
" 0.0032 | \n",
"
\n",
" \n",
" 46 | \n",
" 72 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 16 | \n",
" 512 | \n",
" 0.5592 | \n",
" 0.5551 | \n",
" 0.0041 | \n",
"
\n",
" \n",
" 47 | \n",
" 33 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 512 | \n",
" 0.5617 | \n",
" 0.5549 | \n",
" 0.0067 | \n",
"
\n",
" \n",
" 48 | \n",
" 42 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 8 | \n",
" 512 | \n",
" 0.5615 | \n",
" 0.5546 | \n",
" 0.0069 | \n",
"
\n",
" \n",
" 49 | \n",
" 45 | \n",
" 0.00001 | \n",
" 8 | \n",
" 4 | \n",
" 16 | \n",
" 512 | \n",
" 0.5638 | \n",
" 0.5546 | \n",
" 0.0092 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number num_graph_heads hidden_dim_gated dev test gap\n",
"0 50 0.00001 8 5 8 256 0.5805 0.5756 0.0048\n",
"1 24 0.00010 16 4 8 512 0.5723 0.5700 0.0024\n",
"2 67 0.00001 16 5 8 128 0.5667 0.5687 -0.0019\n",
"3 12 0.00010 8 5 4 512 0.5645 0.5685 -0.0040\n",
"4 32 0.00010 16 5 8 256 0.5658 0.5678 -0.0020\n",
"5 64 0.00001 16 5 4 128 0.5662 0.5676 -0.0014\n",
"6 68 0.00001 16 5 8 256 0.5779 0.5669 0.0110\n",
"7 55 0.00001 16 4 4 128 0.5706 0.5669 0.0037\n",
"8 49 0.00001 8 5 8 128 0.5642 0.5659 -0.0018\n",
"9 31 0.00010 16 5 8 128 0.5750 0.5654 0.0095\n",
"10 23 0.00010 16 4 8 256 0.5716 0.5651 0.0065\n",
"11 22 0.00010 16 4 8 128 0.5743 0.5650 0.0093\n",
"12 53 0.00001 8 5 16 256 0.5732 0.5633 0.0099\n",
"13 26 0.00010 16 4 16 256 0.5636 0.5630 0.0006\n",
"14 6 0.00010 8 4 8 512 0.5624 0.5626 -0.0002\n",
"15 10 0.00010 8 5 4 128 0.5655 0.5625 0.0029\n",
"16 70 0.00001 16 5 16 128 0.5618 0.5616 0.0002\n",
"17 36 0.00010 16 5 16 512 0.5753 0.5610 0.0143\n",
"18 27 0.00010 16 4 16 512 0.5587 0.5609 -0.0022\n",
"19 46 0.00001 8 5 4 128 0.5660 0.5608 0.0052\n",
"20 19 0.00010 16 4 4 128 0.5616 0.5604 0.0012\n",
"21 44 0.00001 8 4 16 256 0.5686 0.5600 0.0086\n",
"22 35 0.00010 16 5 16 256 0.5643 0.5598 0.0045\n",
"23 13 0.00010 8 5 8 128 0.5621 0.5596 0.0025\n",
"24 71 0.00001 16 5 16 256 0.5667 0.5589 0.0078\n",
"25 69 0.00001 16 5 8 512 0.5638 0.5586 0.0052\n",
"26 7 0.00010 8 4 16 128 0.5626 0.5579 0.0047\n",
"27 30 0.00010 16 5 4 512 0.5619 0.5578 0.0041\n",
"28 51 0.00001 8 5 8 512 0.5627 0.5577 0.0051\n",
"29 52 0.00001 8 5 16 128 0.5581 0.5574 0.0007\n",
"30 37 0.00001 8 4 4 128 0.5608 0.5572 0.0036\n",
"31 4 0.00010 8 4 8 128 0.5659 0.5571 0.0088\n",
"32 14 0.00010 8 5 8 256 0.5634 0.5571 0.0064\n",
"33 38 0.00001 8 4 4 256 0.5611 0.5569 0.0043\n",
"34 57 0.00001 16 4 4 512 0.5626 0.5569 0.0058\n",
"35 18 0.00010 8 5 16 512 0.5555 0.5568 -0.0013\n",
"36 62 0.00001 16 4 16 256 0.5602 0.5565 0.0038\n",
"37 16 0.00010 8 5 16 128 0.5632 0.5564 0.0069\n",
"38 59 0.00001 16 4 8 256 0.5695 0.5563 0.0133\n",
"39 47 0.00001 8 5 4 256 0.5643 0.5557 0.0086\n",
"40 2 0.00010 8 4 4 256 0.5589 0.5555 0.0034\n",
"41 28 0.00010 16 5 4 128 0.5639 0.5555 0.0085\n",
"42 43 0.00001 8 4 16 128 0.5659 0.5554 0.0105\n",
"43 58 0.00001 16 4 8 128 0.5630 0.5554 0.0077\n",
"44 1 0.00010 8 4 4 128 0.5633 0.5553 0.0080\n",
"45 65 0.00001 16 5 4 256 0.5584 0.5552 0.0032\n",
"46 72 0.00001 16 5 16 512 0.5592 0.5551 0.0041\n",
"47 33 0.00010 16 5 8 512 0.5617 0.5549 0.0067\n",
"48 42 0.00001 8 4 8 512 0.5615 0.5546 0.0069\n",
"49 45 0.00001 8 4 16 512 0.5638 0.5546 0.0092"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/bigatedgraph.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "dfae755b-6a58-4c26-8c83-4884038b321e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" smoothing_probability | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 12 | \n",
" 0.00001 | \n",
" 0.0 | \n",
" 0.5768 | \n",
" 0.5797 | \n",
" -0.0029 | \n",
"
\n",
" \n",
" 1 | \n",
" 14 | \n",
" 0.00001 | \n",
" 0.2 | \n",
" 0.5753 | \n",
" 0.5779 | \n",
" -0.0026 | \n",
"
\n",
" \n",
" 2 | \n",
" 13 | \n",
" 0.00001 | \n",
" 0.1 | \n",
" 0.5745 | \n",
" 0.5773 | \n",
" -0.0028 | \n",
"
\n",
" \n",
" 3 | \n",
" 15 | \n",
" 0.00001 | \n",
" 0.3 | \n",
" 0.5751 | \n",
" 0.5735 | \n",
" 0.0016 | \n",
"
\n",
" \n",
" 4 | \n",
" 17 | \n",
" 0.00001 | \n",
" 0.5 | \n",
" 0.5708 | \n",
" 0.5719 | \n",
" -0.0011 | \n",
"
\n",
" \n",
" 5 | \n",
" 6 | \n",
" 0.00010 | \n",
" 0.5 | \n",
" 0.5675 | \n",
" 0.5706 | \n",
" -0.0031 | \n",
"
\n",
" \n",
" 6 | \n",
" 1 | \n",
" 0.00010 | \n",
" 0.0 | \n",
" 0.5743 | \n",
" 0.5698 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 7 | \n",
" 2 | \n",
" 0.00010 | \n",
" 0.1 | \n",
" 0.5741 | \n",
" 0.5698 | \n",
" 0.0042 | \n",
"
\n",
" \n",
" 8 | \n",
" 16 | \n",
" 0.00001 | \n",
" 0.4 | \n",
" 0.5743 | \n",
" 0.5691 | \n",
" 0.0052 | \n",
"
\n",
" \n",
" 9 | \n",
" 18 | \n",
" 0.00001 | \n",
" 0.6 | \n",
" 0.5674 | \n",
" 0.5677 | \n",
" -0.0003 | \n",
"
\n",
" \n",
" 10 | \n",
" 3 | \n",
" 0.00010 | \n",
" 0.2 | \n",
" 0.5687 | \n",
" 0.5659 | \n",
" 0.0028 | \n",
"
\n",
" \n",
" 11 | \n",
" 19 | \n",
" 0.00001 | \n",
" 0.7 | \n",
" 0.5682 | \n",
" 0.5636 | \n",
" 0.0046 | \n",
"
\n",
" \n",
" 12 | \n",
" 7 | \n",
" 0.00010 | \n",
" 0.6 | \n",
" 0.5593 | \n",
" 0.5634 | \n",
" -0.0042 | \n",
"
\n",
" \n",
" 13 | \n",
" 5 | \n",
" 0.00010 | \n",
" 0.4 | \n",
" 0.5684 | \n",
" 0.5629 | \n",
" 0.0055 | \n",
"
\n",
" \n",
" 14 | \n",
" 9 | \n",
" 0.00010 | \n",
" 0.8 | \n",
" 0.5623 | \n",
" 0.5622 | \n",
" 0.0001 | \n",
"
\n",
" \n",
" 15 | \n",
" 8 | \n",
" 0.00010 | \n",
" 0.7 | \n",
" 0.5625 | \n",
" 0.5616 | \n",
" 0.0009 | \n",
"
\n",
" \n",
" 16 | \n",
" 4 | \n",
" 0.00010 | \n",
" 0.3 | \n",
" 0.5689 | \n",
" 0.5608 | \n",
" 0.0081 | \n",
"
\n",
" \n",
" 17 | \n",
" 20 | \n",
" 0.00001 | \n",
" 0.8 | \n",
" 0.5610 | \n",
" 0.5600 | \n",
" 0.0010 | \n",
"
\n",
" \n",
" 18 | \n",
" 10 | \n",
" 0.00010 | \n",
" 0.9 | \n",
" 0.5615 | \n",
" 0.5592 | \n",
" 0.0024 | \n",
"
\n",
" \n",
" 19 | \n",
" 21 | \n",
" 0.00001 | \n",
" 0.9 | \n",
" 0.5573 | \n",
" 0.5590 | \n",
" -0.0017 | \n",
"
\n",
" \n",
" 20 | \n",
" 22 | \n",
" 0.00001 | \n",
" 1.0 | \n",
" 0.5558 | \n",
" 0.5574 | \n",
" -0.0016 | \n",
"
\n",
" \n",
" 21 | \n",
" 11 | \n",
" 0.00010 | \n",
" 1.0 | \n",
" 0.5534 | \n",
" 0.5564 | \n",
" -0.0029 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr smoothing_probability dev test gap\n",
"0 12 0.00001 0.0 0.5768 0.5797 -0.0029\n",
"1 14 0.00001 0.2 0.5753 0.5779 -0.0026\n",
"2 13 0.00001 0.1 0.5745 0.5773 -0.0028\n",
"3 15 0.00001 0.3 0.5751 0.5735 0.0016\n",
"4 17 0.00001 0.5 0.5708 0.5719 -0.0011\n",
"5 6 0.00010 0.5 0.5675 0.5706 -0.0031\n",
"6 1 0.00010 0.0 0.5743 0.5698 0.0045\n",
"7 2 0.00010 0.1 0.5741 0.5698 0.0042\n",
"8 16 0.00001 0.4 0.5743 0.5691 0.0052\n",
"9 18 0.00001 0.6 0.5674 0.5677 -0.0003\n",
"10 3 0.00010 0.2 0.5687 0.5659 0.0028\n",
"11 19 0.00001 0.7 0.5682 0.5636 0.0046\n",
"12 7 0.00010 0.6 0.5593 0.5634 -0.0042\n",
"13 5 0.00010 0.4 0.5684 0.5629 0.0055\n",
"14 9 0.00010 0.8 0.5623 0.5622 0.0001\n",
"15 8 0.00010 0.7 0.5625 0.5616 0.0009\n",
"16 4 0.00010 0.3 0.5689 0.5608 0.0081\n",
"17 20 0.00001 0.8 0.5610 0.5600 0.0010\n",
"18 10 0.00010 0.9 0.5615 0.5592 0.0024\n",
"19 21 0.00001 0.9 0.5573 0.5590 -0.0017\n",
"20 22 0.00001 1.0 0.5558 0.5574 -0.0016\n",
"21 11 0.00010 1.0 0.5534 0.5564 -0.0029"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/smothing/phi.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "f4db3748-6be0-4ee6-abd1-c0e734efc8c8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" smoothing_probability | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 14 | \n",
" 0.00001 | \n",
" 0.2 | \n",
" 0.5802 | \n",
" 0.5808 | \n",
" -0.0007 | \n",
"
\n",
" \n",
" 1 | \n",
" 12 | \n",
" 0.00001 | \n",
" 0.0 | \n",
" 0.5768 | \n",
" 0.5797 | \n",
" -0.0029 | \n",
"
\n",
" \n",
" 2 | \n",
" 15 | \n",
" 0.00001 | \n",
" 0.3 | \n",
" 0.5789 | \n",
" 0.5782 | \n",
" 0.0007 | \n",
"
\n",
" \n",
" 3 | \n",
" 16 | \n",
" 0.00001 | \n",
" 0.4 | \n",
" 0.5759 | \n",
" 0.5768 | \n",
" -0.0008 | \n",
"
\n",
" \n",
" 4 | \n",
" 13 | \n",
" 0.00001 | \n",
" 0.1 | \n",
" 0.5797 | \n",
" 0.5762 | \n",
" 0.0035 | \n",
"
\n",
" \n",
" 5 | \n",
" 17 | \n",
" 0.00001 | \n",
" 0.5 | \n",
" 0.5745 | \n",
" 0.5760 | \n",
" -0.0015 | \n",
"
\n",
" \n",
" 6 | \n",
" 18 | \n",
" 0.00001 | \n",
" 0.6 | \n",
" 0.5677 | \n",
" 0.5710 | \n",
" -0.0033 | \n",
"
\n",
" \n",
" 7 | \n",
" 1 | \n",
" 0.00010 | \n",
" 0.0 | \n",
" 0.5743 | \n",
" 0.5698 | \n",
" 0.0045 | \n",
"
\n",
" \n",
" 8 | \n",
" 2 | \n",
" 0.00010 | \n",
" 0.1 | \n",
" 0.5690 | \n",
" 0.5696 | \n",
" -0.0006 | \n",
"
\n",
" \n",
" 9 | \n",
" 19 | \n",
" 0.00001 | \n",
" 0.7 | \n",
" 0.5644 | \n",
" 0.5694 | \n",
" -0.0050 | \n",
"
\n",
" \n",
" 10 | \n",
" 20 | \n",
" 0.00001 | \n",
" 0.8 | \n",
" 0.5646 | \n",
" 0.5690 | \n",
" -0.0044 | \n",
"
\n",
" \n",
" 11 | \n",
" 21 | \n",
" 0.00001 | \n",
" 0.9 | \n",
" 0.5651 | \n",
" 0.5683 | \n",
" -0.0032 | \n",
"
\n",
" \n",
" 12 | \n",
" 4 | \n",
" 0.00010 | \n",
" 0.3 | \n",
" 0.5718 | \n",
" 0.5662 | \n",
" 0.0056 | \n",
"
\n",
" \n",
" 13 | \n",
" 3 | \n",
" 0.00010 | \n",
" 0.2 | \n",
" 0.5656 | \n",
" 0.5647 | \n",
" 0.0009 | \n",
"
\n",
" \n",
" 14 | \n",
" 22 | \n",
" 0.00001 | \n",
" 1.0 | \n",
" 0.5559 | \n",
" 0.5610 | \n",
" -0.0051 | \n",
"
\n",
" \n",
" 15 | \n",
" 5 | \n",
" 0.00010 | \n",
" 0.4 | \n",
" 0.5650 | \n",
" 0.5608 | \n",
" 0.0042 | \n",
"
\n",
" \n",
" 16 | \n",
" 8 | \n",
" 0.00010 | \n",
" 0.7 | \n",
" 0.5660 | \n",
" 0.5595 | \n",
" 0.0065 | \n",
"
\n",
" \n",
" 17 | \n",
" 9 | \n",
" 0.00010 | \n",
" 0.8 | \n",
" 0.5636 | \n",
" 0.5589 | \n",
" 0.0047 | \n",
"
\n",
" \n",
" 18 | \n",
" 7 | \n",
" 0.00010 | \n",
" 0.6 | \n",
" 0.5565 | \n",
" 0.5587 | \n",
" -0.0022 | \n",
"
\n",
" \n",
" 19 | \n",
" 6 | \n",
" 0.00010 | \n",
" 0.5 | \n",
" 0.5596 | \n",
" 0.5585 | \n",
" 0.0011 | \n",
"
\n",
" \n",
" 20 | \n",
" 10 | \n",
" 0.00010 | \n",
" 0.9 | \n",
" 0.5581 | \n",
" 0.5561 | \n",
" 0.0020 | \n",
"
\n",
" \n",
" 21 | \n",
" 11 | \n",
" 0.00010 | \n",
" 1.0 | \n",
" 0.5427 | \n",
" 0.5463 | \n",
" -0.0036 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr smoothing_probability dev test gap\n",
"0 14 0.00001 0.2 0.5802 0.5808 -0.0007\n",
"1 12 0.00001 0.0 0.5768 0.5797 -0.0029\n",
"2 15 0.00001 0.3 0.5789 0.5782 0.0007\n",
"3 16 0.00001 0.4 0.5759 0.5768 -0.0008\n",
"4 13 0.00001 0.1 0.5797 0.5762 0.0035\n",
"5 17 0.00001 0.5 0.5745 0.5760 -0.0015\n",
"6 18 0.00001 0.6 0.5677 0.5710 -0.0033\n",
"7 1 0.00010 0.0 0.5743 0.5698 0.0045\n",
"8 2 0.00010 0.1 0.5690 0.5696 -0.0006\n",
"9 19 0.00001 0.7 0.5644 0.5694 -0.0050\n",
"10 20 0.00001 0.8 0.5646 0.5690 -0.0044\n",
"11 21 0.00001 0.9 0.5651 0.5683 -0.0032\n",
"12 4 0.00010 0.3 0.5718 0.5662 0.0056\n",
"13 3 0.00010 0.2 0.5656 0.5647 0.0009\n",
"14 22 0.00001 1.0 0.5559 0.5610 -0.0051\n",
"15 5 0.00010 0.4 0.5650 0.5608 0.0042\n",
"16 8 0.00010 0.7 0.5660 0.5595 0.0065\n",
"17 9 0.00010 0.8 0.5636 0.5589 0.0047\n",
"18 7 0.00010 0.6 0.5565 0.5587 -0.0022\n",
"19 6 0.00010 0.5 0.5596 0.5585 0.0011\n",
"20 10 0.00010 0.9 0.5581 0.5561 0.0020\n",
"21 11 0.00010 1.0 0.5427 0.5463 -0.0036"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/smothing/qwen.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "952affd8-2fea-481c-b8cc-3f53418b5472",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" num_graph_heads | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 0.5632 | \n",
" 0.5630 | \n",
" 0.0002 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 0.5654 | \n",
" 0.5616 | \n",
" 0.0039 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 4 | \n",
" 0.5656 | \n",
" 0.5588 | \n",
" 0.0068 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 0.5688 | \n",
" 0.5580 | \n",
" 0.0108 | \n",
"
\n",
" \n",
" 4 | \n",
" 7 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 4 | \n",
" 0.5631 | \n",
" 0.5574 | \n",
" 0.0057 | \n",
"
\n",
" \n",
" 5 | \n",
" 2 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 0.5619 | \n",
" 0.5564 | \n",
" 0.0054 | \n",
"
\n",
" \n",
" 6 | \n",
" 8 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 0.5653 | \n",
" 0.5559 | \n",
" 0.0094 | \n",
"
\n",
" \n",
" 7 | \n",
" 4 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 0.5631 | \n",
" 0.5539 | \n",
" 0.0092 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number num_graph_heads dev test gap\n",
"0 3 0.00010 16 5 4 0.5632 0.5630 0.0002\n",
"1 1 0.00010 8 5 4 0.5654 0.5616 0.0039\n",
"2 5 0.00001 8 5 4 0.5656 0.5588 0.0068\n",
"3 6 0.00001 8 5 8 0.5688 0.5580 0.0108\n",
"4 7 0.00001 16 5 4 0.5631 0.5574 0.0057\n",
"5 2 0.00010 8 5 8 0.5619 0.5564 0.0054\n",
"6 8 0.00001 16 5 8 0.5653 0.5559 0.0094\n",
"7 4 0.00010 16 5 8 0.5631 0.5539 0.0092"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/BiGraphFormerWithProb.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c210111f-e1e1-4276-8e47-249bda79b189",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" step | \n",
" lr | \n",
" num_transformer_heads | \n",
" tr_layer_number | \n",
" num_graph_heads | \n",
" hidden_dim_gated | \n",
" dev | \n",
" test | \n",
" gap | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5670 | \n",
" 0.5656 | \n",
" 0.0015 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 512 | \n",
" 0.5706 | \n",
" 0.5616 | \n",
" 0.0091 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 0.00010 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 512 | \n",
" 0.5560 | \n",
" 0.5573 | \n",
" -0.0013 | \n",
"
\n",
" \n",
" 3 | \n",
" 8 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 512 | \n",
" 0.5569 | \n",
" 0.5529 | \n",
" 0.0040 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 0.00010 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5572 | \n",
" 0.5514 | \n",
" 0.0059 | \n",
"
\n",
" \n",
" 5 | \n",
" 6 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 512 | \n",
" 0.5520 | \n",
" 0.5474 | \n",
" 0.0046 | \n",
"
\n",
" \n",
" 6 | \n",
" 5 | \n",
" 0.00001 | \n",
" 8 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5550 | \n",
" 0.5472 | \n",
" 0.0078 | \n",
"
\n",
" \n",
" 7 | \n",
" 7 | \n",
" 0.00001 | \n",
" 16 | \n",
" 5 | \n",
" 8 | \n",
" 256 | \n",
" 0.5488 | \n",
" 0.5418 | \n",
" 0.0071 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" step lr num_transformer_heads tr_layer_number num_graph_heads hidden_dim_gated dev test gap\n",
"0 3 0.00010 16 5 8 256 0.5670 0.5656 0.0015\n",
"1 2 0.00010 8 5 8 512 0.5706 0.5616 0.0091\n",
"2 4 0.00010 16 5 8 512 0.5560 0.5573 -0.0013\n",
"3 8 0.00001 16 5 8 512 0.5569 0.5529 0.0040\n",
"4 1 0.00010 8 5 8 256 0.5572 0.5514 0.0059\n",
"5 6 0.00001 8 5 8 512 0.5520 0.5474 0.0046\n",
"6 5 0.00001 8 5 8 256 0.5550 0.5472 0.0078\n",
"7 7 0.00001 16 5 8 256 0.5488 0.5418 0.0071"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = parse_smart_log(\"C:/Users/Alexandr/Desktop/sampling/last/BiGatedGraphFormerWithProb.txt\",50)\n",
"\n",
"from IPython.display import display\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.width\", 160)\n",
"\n",
"display(df.head(50))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2e06ea3-d6ba-48d4-a013-15b1145d9a4a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f9a4eae-c094-40c8-a0ee-ecf7c884d523",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8d39259-0192-4df1-8eef-531dee7f45b8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "40afd60f-78b2-4b26-b5e2-314dddfa6c3f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c90849c-69d8-484e-b6a4-96a8e6447bc7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3325846-ffbc-444f-8549-2fbe2f5d6fde",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "03aaa192-92c8-4938-bc18-8eac086e4648",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 21,
"id": "d32b1b7f-bcb6-45bd-930c-f93b4b40b3f8",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "PretrainedAudioEmbeddingExtractor.__init__() got an unexpected keyword argument 'model_name'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[21]\u001b[39m\u001b[32m, line 32\u001b[39m\n\u001b[32m 29\u001b[39m DEVICE = \u001b[33m\"\u001b[39m\u001b[33mcuda\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch.cuda.is_available() \u001b[38;5;28;01melse\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mcpu\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 30\u001b[39m SAMPLE_RATE = \u001b[32m16000\u001b[39m\n\u001b[32m---> \u001b[39m\u001b[32m32\u001b[39m audio_feat = \u001b[43mPretrainedAudioEmbeddingExtractor\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 33\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mAUDIO_MODEL\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 34\u001b[39m \u001b[43m \u001b[49m\u001b[43mcheckpoint\u001b[49m\u001b[43m=\u001b[49m\u001b[43mAUDIO_CKPT\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 35\u001b[39m \u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m=\u001b[49m\u001b[43mDEVICE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 36\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 38\u001b[39m text_feat = PretrainedTextEmbeddingExtractor(\n\u001b[32m 39\u001b[39m model_name=TEXT_MODEL,\n\u001b[32m 40\u001b[39m checkpoint=TEXT_CKPT,\n\u001b[32m 41\u001b[39m device=DEVICE,\n\u001b[32m 42\u001b[39m )\n\u001b[32m 44\u001b[39m \u001b[38;5;66;03m# ---------- 4. Узнаём фактические размеры эмбеддингов ----------\u001b[39;00m\n",
"\u001b[31mTypeError\u001b[39m: PretrainedAudioEmbeddingExtractor.__init__() got an unexpected keyword argument 'model_name'"
]
}
],
"source": [
"# ======================================================================\n",
"# Sanity check of the synthetic MELD-S corpus for broken embeddings.\n",
"# For every CSV row the cell loads the wav, runs the audio and text\n",
"# extractors, compares the embedding sizes with the probed reference sizes\n",
"# and collects the failing rows into a DataFrame / CSV report.\n",
"# ======================================================================\n",
"\n",
"# ---------- 1. Imports and basic setup ----------\n",
"import os, logging, traceback\n",
"import torch, torchaudio\n",
"import pandas as pd\n",
"from tqdm.auto import tqdm\n",
"\n",
"# --- if the project lives in another directory, add it to sys.path ---\n",
"# import sys; sys.path.append(r\"C:\\Prgrm\\ESWA_2025\")\n",
"\n",
"from data_loading.feature_extractor import (\n",
"    PretrainedAudioEmbeddingExtractor,\n",
"    PretrainedTextEmbeddingExtractor,\n",
")\n",
"\n",
"# ---------- 2. Paths (taken from your config.toml) ----------\n",
"synthetic_path = r\"E:/MELD_S\"\n",
"synth_csv_path = os.path.join(synthetic_path, \"meld_s_train_labels.csv\")\n",
"synth_wav_dir = os.path.join(synthetic_path, \"wavs\")\n",
"\n",
"# ---------- 3. Build the extractors exactly as in the main project ----------\n",
"AUDIO_MODEL = \"audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim\"\n",
"AUDIO_CKPT = \"best_audio_model_2.pt\"  # path relative to the launch directory\n",
"TEXT_MODEL = \"jinaai/jina-embeddings-v3\"\n",
"TEXT_CKPT = \"best_text_model.pth\"\n",
"DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"SAMPLE_RATE = 16000\n",
"\n",
"# NOTE(review): the last saved run of this cell failed right here with\n",
"# TypeError: __init__() got an unexpected keyword argument 'model_name'.\n",
"# The constructor signatures live in data_loading.feature_extractor, which is\n",
"# not visible from this notebook - TODO confirm the expected arguments and\n",
"# adjust both calls below accordingly.\n",
"audio_feat = PretrainedAudioEmbeddingExtractor(\n",
"    model_name=AUDIO_MODEL,\n",
"    checkpoint=AUDIO_CKPT,\n",
"    device=DEVICE,\n",
")\n",
"\n",
"text_feat = PretrainedTextEmbeddingExtractor(\n",
"    model_name=TEXT_MODEL,\n",
"    checkpoint=TEXT_CKPT,\n",
"    device=DEVICE,\n",
")\n",
"\n",
"# ---------- 4. Probe the actual embedding sizes ----------\n",
"with torch.no_grad():\n",
"    dummy_wav = torch.zeros(1, SAMPLE_RATE)  # one second of silence\n",
"    _, a_emb = audio_feat.extract(dummy_wav[0], SAMPLE_RATE)\n",
"    AUDIO_DIM = a_emb[0].shape[-1]\n",
"\n",
"    _, t_emb = text_feat.extract(\"hello world\")\n",
"    TEXT_DIM = t_emb[0].shape[-1]\n",
"\n",
"# number of logits produced by each classifier\n",
"NUM_EMOTIONS = 7  # [\"anger\", \"disgust\", ...] - same as in the config\n",
"PRED_DIM = NUM_EMOTIONS\n",
"\n",
"EXPECTED_ALL = AUDIO_DIM + TEXT_DIM + 2 * PRED_DIM\n",
"print(f\"AUDIO_DIM = {AUDIO_DIM}, TEXT_DIM = {TEXT_DIM}, \"\n",
"      f\"TOTAL EXPECTED = {EXPECTED_ALL}\")\n",
"\n",
"# ---------- 5. Read the synthetic-corpus CSV ----------\n",
"df = pd.read_csv(synth_csv_path)\n",
"print(f\"Всего строк в CSV: {len(df)}\")\n",
"\n",
"# Fixed report schema, so the report keeps its header even when no row is bad.\n",
"BAD_COLUMNS = [\"idx\", \"video_name\", \"reason\", \"wav_path\", \"text_len\"]\n",
"bad_rows, good_cnt = [], 0\n",
"\n",
"# ---------- 6. Walk over the records ----------\n",
"# The scan runs under no_grad, like the probe in step 4, in case the\n",
"# extractors do not disable gradient tracking themselves.\n",
"with torch.no_grad():\n",
"    for i, row in tqdm(df.iterrows(), total=len(df)):\n",
"        video_name = row[\"video_name\"]\n",
"        wav_path = os.path.join(synth_wav_dir, f\"{video_name}.wav\")\n",
"\n",
"        # pandas reads an empty CSV cell as NaN (a float). Normalise it to a\n",
"        # string so that len(txt) in step 6.5, which is outside the try block,\n",
"        # cannot abort the whole scan.\n",
"        raw_txt = row.get(\"text\", \"\")\n",
"        text_missing = bool(pd.isna(raw_txt))\n",
"        txt = \"\" if text_missing else str(raw_txt)\n",
"\n",
"        reason = None\n",
"        try:\n",
"            # 6.1 Check that the wav file exists\n",
"            if not os.path.exists(wav_path):\n",
"                reason = \"file_missing\"\n",
"\n",
"            # 6.2 Compute the audio embedding (first channel only)\n",
"            if reason is None:\n",
"                wf, sr = torchaudio.load(wav_path)\n",
"                if sr != SAMPLE_RATE:\n",
"                    wf = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wf)\n",
"                a_pred, a_emb = audio_feat.extract(wf[0], SAMPLE_RATE)\n",
"                a_emb = a_emb[0]\n",
"                if a_emb.shape[-1] != AUDIO_DIM:\n",
"                    reason = f\"audio_dim_{a_emb.shape[-1]}\"\n",
"\n",
"            # 6.3 Compute the text embedding; a NaN text is reported explicitly\n",
"            if reason is None and text_missing:\n",
"                reason = \"text_missing\"\n",
"            if reason is None:\n",
"                t_pred, t_emb = text_feat.extract(txt)\n",
"                t_emb = t_emb[0]\n",
"                if t_emb.shape[-1] != TEXT_DIM:\n",
"                    reason = f\"text_dim_{t_emb.shape[-1]}\"\n",
"\n",
"            # 6.4 Check the full concatenation (embeddings + both predictions)\n",
"            if reason is None:\n",
"                full_vec = torch.cat([a_emb, t_emb, a_pred[0], t_pred[0]], dim=-1)\n",
"                if full_vec.shape[-1] != EXPECTED_ALL:\n",
"                    reason = f\"concat_dim_{full_vec.shape[-1]}\"\n",
"\n",
"        except Exception as e:\n",
"            # Deliberate best effort: one failing sample must not stop the scan.\n",
"            reason = \"exception_\" + e.__class__.__name__\n",
"            logging.error(f\"{video_name}: {traceback.format_exc(limit=2)}\")\n",
"\n",
"        # 6.5 Record the outcome\n",
"        if reason:\n",
"            bad_rows.append({\n",
"                \"idx\": i,\n",
"                \"video_name\": video_name,\n",
"                \"reason\": reason,\n",
"                \"wav_path\": wav_path,\n",
"                \"text_len\": len(txt),\n",
"            })\n",
"        else:\n",
"            good_cnt += 1\n",
"\n",
"# ---------- 7. Summary ----------\n",
"print(f\"\\n✅ GOOD : {good_cnt}\")\n",
"print(f\"❌ BAD : {len(bad_rows)}\")\n",
"\n",
"bad_df = pd.DataFrame(bad_rows, columns=BAD_COLUMNS)\n",
"display(bad_df)\n",
"\n",
"# ---------- 8. (Optional) save the list of bad files ----------\n",
"out_csv = os.path.join(synthetic_path, \"bad_synth_meld.csv\")\n",
"bad_df.to_csv(out_csv, index=False)\n",
"print(f\"\\nСписок «битых» примеров сохранён в: {out_csv}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a232f93d-7f7c-41d3-9204-74445e43d071",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}