Spaces:

ppak10
/

AdditiveLLM-Notebooks

Sleeping

File size: 22,365 Bytes
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "from huggingface_hub import hf_hub_download\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "from model.distilbert import DistilBertClassificationModel\n",
    "from model.llama import LlamaClassificationModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "repo_id = \"ppak10/defect-classification-llama-baseline-25-epochs\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LlamaConfig {\n",
      "  \"_attn_implementation_autoset\": true,\n",
      "  \"_name_or_path\": \"meta-llama/Llama-3.2-1B\",\n",
      "  \"architectures\": [\n",
      "    \"LlamaForCausalLM\"\n",
      "  ],\n",
      "  \"attention_bias\": false,\n",
      "  \"attention_dropout\": 0.0,\n",
      "  \"bos_token_id\": 128000,\n",
      "  \"eos_token_id\": 128001,\n",
      "  \"head_dim\": 64,\n",
      "  \"hidden_act\": \"silu\",\n",
      "  \"hidden_size\": 2048,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 8192,\n",
      "  \"max_position_embeddings\": 131072,\n",
      "  \"mlp_bias\": false,\n",
      "  \"model_type\": \"llama\",\n",
      "  \"num_attention_heads\": 32,\n",
      "  \"num_hidden_layers\": 16,\n",
      "  \"num_key_value_heads\": 8,\n",
      "  \"pretraining_tp\": 1,\n",
      "  \"rms_norm_eps\": 1e-05,\n",
      "  \"rope_scaling\": {\n",
      "    \"factor\": 32.0,\n",
      "    \"high_freq_factor\": 4.0,\n",
      "    \"low_freq_factor\": 1.0,\n",
      "    \"original_max_position_embeddings\": 8192,\n",
      "    \"rope_type\": \"llama3\"\n",
      "  },\n",
      "  \"rope_theta\": 500000.0,\n",
      "  \"tie_word_embeddings\": true,\n",
      "  \"torch_dtype\": \"bfloat16\",\n",
      "  \"transformers_version\": \"4.47.0\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 128256\n",
      "}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3716586/1335258174.py:14: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
      "  model.classifier.load_state_dict(torch.load(classification_head_path))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LlamaClassificationModel(\n",
       "  (base_model): LlamaModel(\n",
       "    (embed_tokens): Embedding(128256, 2048)\n",
       "    (layers): ModuleList(\n",
       "      (0-15): 16 x LlamaDecoderLayer(\n",
       "        (self_attn): LlamaSdpaAttention(\n",
       "          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
       "          (k_proj): Linear(in_features=2048, out_features=512, bias=False)\n",
       "          (v_proj): Linear(in_features=2048, out_features=512, bias=False)\n",
       "          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
       "          (rotary_emb): LlamaRotaryEmbedding()\n",
       "        )\n",
       "        (mlp): LlamaMLP(\n",
       "          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)\n",
       "          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)\n",
       "          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)\n",
       "          (act_fn): SiLU()\n",
       "        )\n",
       "        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n",
       "        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n",
       "      )\n",
       "    )\n",
       "    (norm): LlamaRMSNorm((2048,), eps=1e-05)\n",
       "    (rotary_emb): LlamaRotaryEmbedding()\n",
       "  )\n",
       "  (classifier): Linear(in_features=2048, out_features=4, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Initialize the model\n",
    "# model = DistilBertClassificationModel(repo_id)\n",
    "model = LlamaClassificationModel()\n",
    "\n",
    "# Load the tokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(repo_id)\n",
    "\n",
    "classification_head_path = hf_hub_download(\n",
    "    repo_id=repo_id,\n",
    "    repo_type=\"model\",\n",
    "    filename=\"classification_head.pt\"\n",
    ")\n",
    "\n",
    "model.classifier.load_state_dict(torch.load(classification_head_path))\n",
    "model.eval()  # Set the model to evaluation mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[1, 0, 0, 0]], dtype=torch.int32)\n"
     ]
    }
   ],
   "source": [
    "# text = \"What defects would occur with a beam size of 100 microns, a power of 500 W, a velocity of 100 mm/s and layer height of 10 microns and a hatch spacing of 10 microns for Ti-6Al-4V\"\n",
    "# text = \"SS316L[SEP]500 W[SEP]10.0 mm/s[SEP]500.0 microns[SEP]500.0 microns[SEP]100.0 microns\"\n",
    "text = \"SS316L[SEP]250.0 W[SEP]280.0 mm/s[SEP][SEP]950.0 microns[SEP]600.0 microns\"\n",
    "\n",
    "# Ensure the model is on the GPU\n",
    "# device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
    "device = \"cpu\"\n",
    "model = model.to(device)\n",
    "\n",
    "# Tokenize input for the entire batch and move to GPU\n",
    "inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=\"max_length\", max_length=256)\n",
    "inputs = {key: value.to(device) for key, value in inputs.items()}\n",
    "\n",
    "# Perform inference\n",
    "outputs = model(**inputs)\n",
    "\n",
    "# Extract logits and apply sigmoid activation for multi-label classification\n",
    "logits = outputs[\"logits\"]\n",
    "probs = torch.sigmoid(logits)\n",
    "\n",
    "# Convert probabilities to one-hot encoded labels\n",
    "preds = (probs > 0.5).int()\n",
    "\n",
    "# None, keyhole, lack of fusion, balling\n",
    "print(preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/am/GitHub/LLM-Enabled-Process-Map/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import torch.nn as nn\n",
    "from transformers import  PreTrainedModel\n",
    "\n",
    "class PretrainedLlamaClassificationModel(PreTrainedModel):\n",
    "    def __init__(self, config):\n",
    "        super().__init__(config)\n",
    "        self.base_model = AutoModel.from_pretrained(config.model_path, config=config)\n",
    "        self.classifier = nn.Linear(config.hidden_size, config.num_labels)\n",
    "        self.config = config\n",
    "\n",
    "    def forward(self, input_ids, attention_mask, labels=None):\n",
    "        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)\n",
    "        summed_representation = outputs.last_hidden_state.sum(dim=1)\n",
    "        logits = self.classifier(summed_representation)\n",
    "        loss = None\n",
    "        if labels is not None:\n",
    "            loss_fn = nn.BCEWithLogitsLoss()\n",
    "            loss = loss_fn(logits, labels.float())\n",
    "        return {\"loss\": loss, \"logits\": logits}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/am/GitHub/LLM-Enabled-Process-Map/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Some weights of LlamaModel were not initialized from the model checkpoint at ppak10/defect-classification-llama-baseline-25-epochs and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.down_proj.weight', 'layers.10.mlp.gate_proj.weight', 'layers.10.mlp.up_proj.weight', 'layers.10.post_attention_layernorm.weight', 'layers.10.self_attn.k_proj.weight', 'layers.10.self_attn.o_proj.weight', 'layers.10.self_attn.q_proj.weight', 'layers.10.self_attn.v_proj.weight', 'layers.11.input_layernorm.weight', 'layers.11.mlp.down_proj.weight', 'layers.11.mlp.gate_proj.weight', 'layers.11.mlp.up_proj.weight', 'layers.11.post_attention_layernorm.weight', 'layers.11.self_attn.k_proj.weight', 'layers.11.self_attn.o_proj.weight', 'layers.11.self_attn.q_proj.weight', 'layers.11.self_attn.v_proj.weight', 'layers.12.input_layernorm.weight', 'layers.12.mlp.down_proj.weight', 'layers.12.mlp.gate_proj.weight', 'layers.12.mlp.up_proj.weight', 'layers.12.post_attention_layernorm.weight', 'layers.12.self_attn.k_proj.weight', 'layers.12.self_attn.o_proj.weight', 'layers.12.self_attn.q_proj.weight', 'layers.12.self_attn.v_proj.weight', 'layers.13.input_layernorm.weight', 'layers.13.mlp.down_proj.weight', 'layers.13.mlp.gate_proj.weight', 'layers.13.mlp.up_proj.weight', 'layers.13.post_attention_layernorm.weight', 'layers.13.self_attn.k_proj.weight', 'layers.13.self_attn.o_proj.weight', 'layers.13.self_attn.q_proj.weight', 'layers.13.self_attn.v_proj.weight', 'layers.14.input_layernorm.weight', 'layers.14.mlp.down_proj.weight', 'layers.14.mlp.gate_proj.weight', 'layers.14.mlp.up_proj.weight', 'layers.14.post_attention_layernorm.weight', 'layers.14.self_attn.k_proj.weight', 'layers.14.self_attn.o_proj.weight', 'layers.14.self_attn.q_proj.weight', 'layers.14.self_attn.v_proj.weight', 'layers.15.input_layernorm.weight', 'layers.15.mlp.down_proj.weight', 'layers.15.mlp.gate_proj.weight', 'layers.15.mlp.up_proj.weight', 'layers.15.post_attention_layernorm.weight', 'layers.15.self_attn.k_proj.weight', 'layers.15.self_attn.o_proj.weight', 'layers.15.self_attn.q_proj.weight', 'layers.15.self_attn.v_proj.weight', 'layers.16.input_layernorm.weight', 'layers.16.mlp.down_proj.weight', 'layers.16.mlp.gate_proj.weight', 'layers.16.mlp.up_proj.weight', 'layers.16.post_attention_layernorm.weight', 'layers.16.self_attn.k_proj.weight', 'layers.16.self_attn.o_proj.weight', 'layers.16.self_attn.q_proj.weight', 'layers.16.self_attn.v_proj.weight', 'layers.17.input_layernorm.weight', 'layers.17.mlp.down_proj.weight', 'layers.17.mlp.gate_proj.weight', 'layers.17.mlp.up_proj.weight', 'layers.17.post_attention_layernorm.weight', 'layers.17.self_attn.k_proj.weight', 'layers.17.self_attn.o_proj.weight', 'layers.17.self_attn.q_proj.weight', 'layers.17.self_attn.v_proj.weight', 'layers.18.input_layernorm.weight', 'layers.18.mlp.down_proj.weight', 'layers.18.mlp.gate_proj.weight', 'layers.18.mlp.up_proj.weight', 'layers.18.post_attention_layernorm.weight', 'layers.18.self_attn.k_proj.weight', 'layers.18.self_attn.o_proj.weight', 'layers.18.self_attn.q_proj.weight', 'layers.18.self_attn.v_proj.weight', 'layers.19.input_layernorm.weight', 'layers.19.mlp.down_proj.weight', 'layers.19.mlp.gate_proj.weight', 'layers.19.mlp.up_proj.weight', 'layers.19.post_attention_layernorm.weight', 'layers.19.self_attn.k_proj.weight', 'layers.19.self_attn.o_proj.weight', 'layers.19.self_attn.q_proj.weight', 'layers.19.self_attn.v_proj.weight', 'layers.2.input_layernorm.weight', 'layers.2.mlp.down_proj.weight', 'layers.2.mlp.gate_proj.weight', 'layers.2.mlp.up_proj.weight', 'layers.2.post_attention_layernorm.weight', 'layers.2.self_attn.k_proj.weight', 'layers.2.self_attn.o_proj.weight', 'layers.2.self_attn.q_proj.weight', 'layers.2.self_attn.v_proj.weight', 'layers.20.input_layernorm.weight', 'layers.20.mlp.down_proj.weight', 'layers.20.mlp.gate_proj.weight', 'layers.20.mlp.up_proj.weight', 'layers.20.post_attention_layernorm.weight', 'layers.20.self_attn.k_proj.weight', 'layers.20.self_attn.o_proj.weight', 'layers.20.self_attn.q_proj.weight', 'layers.20.self_attn.v_proj.weight', 'layers.21.input_layernorm.weight', 'layers.21.mlp.down_proj.weight', 'layers.21.mlp.gate_proj.weight', 'layers.21.mlp.up_proj.weight', 'layers.21.post_attention_layernorm.weight', 'layers.21.self_attn.k_proj.weight', 'layers.21.self_attn.o_proj.weight', 'layers.21.self_attn.q_proj.weight', 'layers.21.self_attn.v_proj.weight', 'layers.22.input_layernorm.weight', 'layers.22.mlp.down_proj.weight', 'layers.22.mlp.gate_proj.weight', 'layers.22.mlp.up_proj.weight', 'layers.22.post_attention_layernorm.weight', 'layers.22.self_attn.k_proj.weight', 'layers.22.self_attn.o_proj.weight', 'layers.22.self_attn.q_proj.weight', 'layers.22.self_attn.v_proj.weight', 'layers.23.input_layernorm.weight', 'layers.23.mlp.down_proj.weight', 'layers.23.mlp.gate_proj.weight', 'layers.23.mlp.up_proj.weight', 'layers.23.post_attention_layernorm.weight', 'layers.23.self_attn.k_proj.weight', 'layers.23.self_attn.o_proj.weight', 'layers.23.self_attn.q_proj.weight', 'layers.23.self_attn.v_proj.weight', 'layers.24.input_layernorm.weight', 'layers.24.mlp.down_proj.weight', 'layers.24.mlp.gate_proj.weight', 'layers.24.mlp.up_proj.weight', 'layers.24.post_attention_layernorm.weight', 'layers.24.self_attn.k_proj.weight', 'layers.24.self_attn.o_proj.weight', 'layers.24.self_attn.q_proj.weight', 'layers.24.self_attn.v_proj.weight', 'layers.25.input_layernorm.weight', 'layers.25.mlp.down_proj.weight', 'layers.25.mlp.gate_proj.weight', 'layers.25.mlp.up_proj.weight', 'layers.25.post_attention_layernorm.weight', 'layers.25.self_attn.k_proj.weight', 'layers.25.self_attn.o_proj.weight', 'layers.25.self_attn.q_proj.weight', 'layers.25.self_attn.v_proj.weight', 'layers.26.input_layernorm.weight', 'layers.26.mlp.down_proj.weight', 'layers.26.mlp.gate_proj.weight', 'layers.26.mlp.up_proj.weight', 'layers.26.post_attention_layernorm.weight', 'layers.26.self_attn.k_proj.weight', 'layers.26.self_attn.o_proj.weight', 'layers.26.self_attn.q_proj.weight', 'layers.26.self_attn.v_proj.weight', 'layers.27.input_layernorm.weight', 'layers.27.mlp.down_proj.weight', 'layers.27.mlp.gate_proj.weight', 'layers.27.mlp.up_proj.weight', 'layers.27.post_attention_layernorm.weight', 'layers.27.self_attn.k_proj.weight', 'layers.27.self_attn.o_proj.weight', 'layers.27.self_attn.q_proj.weight', 'layers.27.self_attn.v_proj.weight', 'layers.28.input_layernorm.weight', 'layers.28.mlp.down_proj.weight', 'layers.28.mlp.gate_proj.weight', 'layers.28.mlp.up_proj.weight', 'layers.28.post_attention_layernorm.weight', 'layers.28.self_attn.k_proj.weight', 'layers.28.self_attn.o_proj.weight', 'layers.28.self_attn.q_proj.weight', 'layers.28.self_attn.v_proj.weight', 'layers.29.input_layernorm.weight', 'layers.29.mlp.down_proj.weight', 'layers.29.mlp.gate_proj.weight', 'layers.29.mlp.up_proj.weight', 'layers.29.post_attention_layernorm.weight', 'layers.29.self_attn.k_proj.weight', 'layers.29.self_attn.o_proj.weight', 'layers.29.self_attn.q_proj.weight', 'layers.29.self_attn.v_proj.weight', 'layers.3.input_layernorm.weight', 'layers.3.mlp.down_proj.weight', 'layers.3.mlp.gate_proj.weight', 'layers.3.mlp.up_proj.weight', 'layers.3.post_attention_layernorm.weight', 'layers.3.self_attn.k_proj.weight', 'layers.3.self_attn.o_proj.weight', 'layers.3.self_attn.q_proj.weight', 'layers.3.self_attn.v_proj.weight', 'layers.30.input_layernorm.weight', 'layers.30.mlp.down_proj.weight', 'layers.30.mlp.gate_proj.weight', 'layers.30.mlp.up_proj.weight', 'layers.30.post_attention_layernorm.weight', 'layers.30.self_attn.k_proj.weight', 'layers.30.self_attn.o_proj.weight', 'layers.30.self_attn.q_proj.weight', 'layers.30.self_attn.v_proj.weight', 'layers.31.input_layernorm.weight', 'layers.31.mlp.down_proj.weight', 'layers.31.mlp.gate_proj.weight', 'layers.31.mlp.up_proj.weight', 'layers.31.post_attention_layernorm.weight', 'layers.31.self_attn.k_proj.weight', 'layers.31.self_attn.o_proj.weight', 'layers.31.self_attn.q_proj.weight', 'layers.31.self_attn.v_proj.weight', 'layers.4.input_layernorm.weight', 'layers.4.mlp.down_proj.weight', 'layers.4.mlp.gate_proj.weight', 'layers.4.mlp.up_proj.weight', 'layers.4.post_attention_layernorm.weight', 'layers.4.self_attn.k_proj.weight', 'layers.4.self_attn.o_proj.weight', 'layers.4.self_attn.q_proj.weight', 'layers.4.self_attn.v_proj.weight', 'layers.5.input_layernorm.weight', 'layers.5.mlp.down_proj.weight', 'layers.5.mlp.gate_proj.weight', 'layers.5.mlp.up_proj.weight', 'layers.5.post_attention_layernorm.weight', 'layers.5.self_attn.k_proj.weight', 'layers.5.self_attn.o_proj.weight', 'layers.5.self_attn.q_proj.weight', 'layers.5.self_attn.v_proj.weight', 'layers.6.input_layernorm.weight', 'layers.6.mlp.down_proj.weight', 'layers.6.mlp.gate_proj.weight', 'layers.6.mlp.up_proj.weight', 'layers.6.post_attention_layernorm.weight', 'layers.6.self_attn.k_proj.weight', 'layers.6.self_attn.o_proj.weight', 'layers.6.self_attn.q_proj.weight', 'layers.6.self_attn.v_proj.weight', 'layers.7.input_layernorm.weight', 'layers.7.mlp.down_proj.weight', 'layers.7.mlp.gate_proj.weight', 'layers.7.mlp.up_proj.weight', 'layers.7.post_attention_layernorm.weight', 'layers.7.self_attn.k_proj.weight', 'layers.7.self_attn.o_proj.weight', 'layers.7.self_attn.q_proj.weight', 'layers.7.self_attn.v_proj.weight', 'layers.8.input_layernorm.weight', 'layers.8.mlp.down_proj.weight', 'layers.8.mlp.gate_proj.weight', 'layers.8.mlp.up_proj.weight', 'layers.8.post_attention_layernorm.weight', 'layers.8.self_attn.k_proj.weight', 'layers.8.self_attn.o_proj.weight', 'layers.8.self_attn.q_proj.weight', 'layers.8.self_attn.v_proj.weight', 'layers.9.input_layernorm.weight', 'layers.9.mlp.down_proj.weight', 'layers.9.mlp.gate_proj.weight', 'layers.9.mlp.up_proj.weight', 'layers.9.post_attention_layernorm.weight', 'layers.9.self_attn.k_proj.weight', 'layers.9.self_attn.o_proj.weight', 'layers.9.self_attn.q_proj.weight', 'layers.9.self_attn.v_proj.weight', 'norm.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModel, pipeline\n",
    "\n",
    "repo_id = \"ppak10/defect-classification-llama-baseline-25-epochs\"\n",
    "model = AutoModel.from_pretrained(repo_id)\n",
    "# tokenizer = AutoTokenizer.from_pretrained(repo_id)\n",
    "\n",
    "# classification_pipeline = pipeline(\"text-classification\", model=model, tokenizer=tokenizer)\n",
    "# result = classification_pipeline(\"Test input text\")\n",
    "# print(result)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LlamaModel(\n",
      "  (embed_tokens): Embedding(32000, 2048)\n",
      "  (layers): ModuleList(\n",
      "    (0-31): 32 x LlamaDecoderLayer(\n",
      "      (self_attn): LlamaSdpaAttention(\n",
      "        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
      "        (k_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
      "        (v_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
      "        (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
      "        (rotary_emb): LlamaRotaryEmbedding()\n",
      "      )\n",
      "      (mlp): LlamaMLP(\n",
      "        (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)\n",
      "        (up_proj): Linear(in_features=2048, out_features=11008, bias=False)\n",
      "        (down_proj): Linear(in_features=11008, out_features=2048, bias=False)\n",
      "        (act_fn): SiLU()\n",
      "      )\n",
      "      (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n",
      "      (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n",
      "    )\n",
      "  )\n",
      "  (norm): LlamaRMSNorm((2048,), eps=1e-06)\n",
      "  (rotary_emb): LlamaRotaryEmbedding()\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "print(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}