{ "cells": [ { "cell_type": "markdown", "id": "1b95ba48", "metadata": {}, "source": [ "# Responsible Prompting\n", "\n", "## Recipe: Populate Coordinates\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "57eb7e38-3b72-451c-bd50-7b23299667d0", "metadata": {}, "outputs": [], "source": [ "# Warning: due to the extensive memory use of Parametric UMAP, this notebook could crash locally, if that happens, try to run it in Colab." ] }, { "cell_type": "markdown", "id": "342f3b42-7d2b-4914-ac48-e01132744279", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 67, "id": "c5498911", "metadata": {}, "outputs": [], "source": [ "import os\n", "import os.path\n", "from dotenv import load_dotenv\n", "\n", "import re\n", "import requests\n", "import json\n", "import warnings\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# from sklearn.manifold import TSNE\n", "# from sklearn.metrics.pairwise import cosine_similarity\n", "from umap import UMAP\n", "import tensorflow as tf\n", "from umap.parametric_umap import ParametricUMAP, load_ParametricUMAP\n", "from sentence_transformers import SentenceTransformer" ] }, { "cell_type": "markdown", "id": "87712e49-da41-4fc9-9bf1-bf4fa8036e93", "metadata": {}, "source": [ "### Loading hugging face token from .env file" ] }, { "cell_type": "code", "execution_count": 68, "id": "304c600b-c8b7-4a4c-a0ec-d3a2506bf387", "metadata": {}, "outputs": [], "source": [ "load_dotenv()\n", "HF_TOKEN = os.getenv('HF_TOKEN')\n", "HF_URL = os.getenv('HF_URL')" ] }, { "cell_type": "markdown", "id": "63d7cb62-3825-4ca9-be99-c94c2cf34127", "metadata": {}, "source": [ "### Sentence transformer model ids (from hugging face)" ] }, { "cell_type": "code", "execution_count": 69, "id": "95fb523c", "metadata": {}, "outputs": [], "source": [ "# Models with existing json sentences output files\n", "model_ids = [\n", " \"sentence-transformers/all-MiniLM-L6-v2\", \n", " \"BAAI/bge-large-en-v1.5\",\n", " \"intfloat/multilingual-e5-large\"\n", "]" ] }, { "cell_type": "markdown", "id": "0f11d170", "metadata": {}, "source": [ "### Functions" ] }, { "cell_type": "code", "execution_count": 70, "id": "ec527bce-27c3-4faf-99fd-b381ad3fbb15", "metadata": {}, "outputs": [], "source": [ "# Converts model_id into filenames\n", "def model_id_to_filename( model_id ):\n", " return model_id.split('/')[1].lower()\n", "\n", "# Requests embeddings for a given sentence\n", "def query( texts, model_id ): \n", " # Warning in case of prompts longer than 256 words\n", " for t in texts :\n", " n_words = len( re.split(r\"\\s+\", t ) )\n", " if( n_words > 256 and model_id == \"sentence-transformers/all-MiniLM-L6-v2\" ):\n", " warnings.warn( \"Warning: Sentence provided is longer than 256 words. Model all-MiniLM-L6-v2 expects sentences up to 256 words.\" ) \n", " warnings.warn( \"Word count: {}\".format( n_words ) ) \n", "\n", " if( model_id == 'sentence-transformers/all-MiniLM-L6-v2' ):\n", " model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", " out = model.encode( texts ).tolist()\n", " else:\n", " api_url = f\"https://api-inference.huggingface.co/models/{model_id}\"\n", " headers = {\"Authorization\": f\"Bearer {HF_TOKEN}\", \"Content-Type\": \"application/json\"}\n", " print( \"Request url: \" + api_url )\n", " response = requests.post(api_url, headers=headers, json={\"inputs\": texts })\n", " # print( response.status_code ) \n", " # print( response.text ) \n", " out = response.json() \n", "\n", " # making sure that different transformers retrieve the embedding\n", " if( 'error' in out ):\n", " return out\n", " while( len( out ) < 384 ): # unpacking json responses in the form of [[[embedding]]]\n", " out = out[0]\n", " return out\n", " \n", "# Performs TSNE for a given embeddings data frame\n", "def perform_tsne( embeddings_df, n_components=2, columns=['embedding_x', 'embedding_y']):\n", " tsne = TSNE(n_components, random_state=13, init=\"pca\", learning_rate=\"auto\")\n", " embeddings_tsne = tsne.fit_transform(embeddings_df)\n", " if( n_components == 3 ):\n", " columns = ['embedding_x', 'embedding_y', 'embedding_z'] \n", " embeddings_df_tsne = pd.DataFrame(embeddings_tsne, columns=columns)\n", " return embeddings_df_tsne\n", "\n", "# Performs UMAP for a given embeddings data frame\n", "def perform_umap(embeddings_df, n_components=2, dimensions=384, columns=['embedding_x', 'embedding_y'], file_name=''):\n", " dims = (dimensions,)\n", " encoder = tf.keras.Sequential([\n", " tf.keras.layers.Input(shape=(dimensions,)),\n", " tf.keras.layers.Dense(256, activation='relu'),\n", " tf.keras.layers.BatchNormalization(),\n", " tf.keras.layers.Dense(128, activation='relu'),\n", " tf.keras.layers.BatchNormalization(),\n", " tf.keras.layers.Dense(64, activation='relu'),\n", " tf.keras.layers.BatchNormalization(),\n", " tf.keras.layers.Dense(2, activation=None) # No activation for UMAP compatibility\n", " ])\n", " encoder.summary()\n", " umap_model = ParametricUMAP(encoder=encoder, dims=dims) # Parametric UMAP allowing to add new data points\n", " embeddings_umap = umap_model.fit_transform(embeddings_df)\n", " if( n_components == 3 ):\n", " columns = ['embedding_x', 'embedding_y', 'embedding_z']\n", " embeddings_df_umap = pd.DataFrame(embeddings_umap, columns=columns)\n", " # Saves model if a file name is provided\n", " if( file_name != ''): \n", " umap_model.save( file_name )\n", " \n", " return embeddings_df_umap\n", "\n", "# Create a 2d plot for a given embedding dataframe\n", "def plot_embedding_2d_interactive(embeddings_df, texts = None, colors = None, labels = None ):\n", " # Create a line plot using Plotly Express to visualize the embeddings\n", " # on a 2D plane, where 'embedding_x' and 'embedding_y' are the coordinates,\n", " # 'label' indicates whether the sentence is from the 'responsible' or 'harmful' prompt,\n", " # and 'prompt_sentence' is the actual sentence.\n", " fig = px.line(\n", " embeddings_df, \n", " x=\"embedding_x\", \n", " y=\"embedding_y\", \n", " color=\"label\", \n", " text=texts,\n", " labels={\n", " \"embedding_x\": \"Semantic Dimension 1\",\n", " \"embedding_y\": \"Semantic Dimension 2\",\n", " \"label\": \"Values\"\n", " }, \n", " width=1200, height=800,\n", " title=\"Comparing sentences' embeddings\")\n", " \n", " # Adjust the position of the text labels to be at the bottom right of each point\n", " fig.update_traces(mode=\"markers\")\n", "\n", " # Display the plot\n", " fig.show()\n", "\n", "# Compares two sets of prompts by:\n", "# Performing queries, setting different colors, creating embeddings,\n", "# and then ploting the resuling embedding comparison.\n", "# set 1 is colored as red and set 2 as green\n", "def compare_prompts_json( s1, s2, method='tsne', labels = None ):\n", " # Merging the prompts\n", " texts = []\n", " all_embeddings = []\n", " p1 = []\n", " p2 = []\n", " values = []\n", " for value in s1:\n", " for prompt in value['prompts']:\n", " if( prompt['text'] != '' and prompt['embedding'] != [] ):\n", " p1.append( prompt['text'] )\n", " all_embeddings.append( prompt['embedding'] )\n", " values.append( value['label'] )\n", " for value in s2:\n", " for prompt in value['prompts']:\n", " if( prompt['text'] != '' and prompt['embedding'] != [] ):\n", " p2.append( prompt['text'] ) \n", " all_embeddings.append( prompt['embedding'] )\n", " values.append( value['label'] )\n", " \n", " texts = p1 + p2\n", " \n", " # Defining color values for different prompts\n", " # For cmap='RdYlGn', p1 (negative value) can be considered the harmfull/bad ones\n", " colors = [-1] * len( p1 ) + [1] * len( p2 )\n", " \n", " # Data frame\n", " embeddings = pd.DataFrame(all_embeddings)\n", " \n", " # Visualizing sentences\n", " # Dimensionality reduction\n", " if( method=='umap' ):\n", " embeddings_df_2d = perform_umap(embeddings, dimensions=embeddings.shape[1] )\n", " else:\n", " embeddings_df_2d = perform_tsne(embeddings)\n", "\n", " embeddings_df_2d['label'] = values\n", " plot_embedding_2d_interactive(embeddings_df_2d, texts, colors, labels)\n", " " ] }, { "cell_type": "markdown", "id": "c39191c3", "metadata": {}, "source": [ "### Setting Folders" ] }, { "cell_type": "code", "execution_count": 71, "id": "87316fa4-1fcf-41c4-9913-bc5704b25ea2", "metadata": {}, "outputs": [], "source": [ "# JSON folder\n", "json_folder = '../prompt-sentences-main/'\n" ] }, { "cell_type": "markdown", "id": "6315c838-436b-4eb3-b3aa-f0faba1cfcab", "metadata": {}, "source": [ "### Creating Parametric UMAP Models" ] }, { "cell_type": "code", "execution_count": 72, "id": "3ca73fb3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Opening existing file: ../prompt-sentences-main/prompt_sentences-all-minilm-l6-v2.json\n" ] }, { "data": { "text/html": [ "
Model: \"sequential_7\"\n",
       "
\n" ], "text/plain": [ "\u001b[1mModel: \"sequential_7\"\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
       "┃ Layer (type)                     Output Shape                  Param # ┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
       "│ dense_28 (Dense)                │ (None, 256)            │        98,560 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_21          │ (None, 256)            │         1,024 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_29 (Dense)                │ (None, 128)            │        32,896 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_22          │ (None, 128)            │           512 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_30 (Dense)                │ (None, 64)             │         8,256 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_23          │ (None, 64)             │           256 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_31 (Dense)                │ (None, 2)              │           130 │\n",
       "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", "│ dense_28 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m98,560\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_21 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_29 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m32,896\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_22 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m512\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_30 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m8,256\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_23 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m256\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_31 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m) │ \u001b[38;5;34m130\u001b[0m │\n", "└─────────────────────────────────┴────────────────────────┴───────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Total params: 141,634 (553.26 KB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m141,634\u001b[0m (553.26 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Trainable params: 140,738 (549.76 KB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m140,738\u001b[0m (549.76 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Non-trainable params: 896 (3.50 KB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m896\u001b[0m (3.50 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/vsantana/Library/Python/3.9/lib/python/site-packages/keras/src/layers/layer.py:395: UserWarning:\n", "\n", "`build()` was called on layer 'umap_model', however the layer does not have a `build()` method implemented and it looks like it has unbuilt state. This will cause the layer to be marked as built, despite not being actually built, which may cause failures down the line. Make sure to implement a proper `build()` method.\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m13s\u001b[0m 14ms/step - loss: 0.2917\n", "Epoch 2/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m11s\u001b[0m 15ms/step - loss: 0.2330\n", "Epoch 3/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 14ms/step - loss: 0.2321\n", "Epoch 4/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 14ms/step - loss: 0.2317\n", "Epoch 5/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m12s\u001b[0m 17ms/step - loss: 0.2317\n", "Epoch 6/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m12s\u001b[0m 17ms/step - loss: 0.2320\n", "Epoch 7/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m12s\u001b[0m 16ms/step - loss: 0.2318\n", "Epoch 8/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m11s\u001b[0m 16ms/step - loss: 0.2316\n", "Epoch 9/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m12s\u001b[0m 17ms/step - loss: 0.2312\n", "Epoch 10/10\n", "\u001b[1m711/711\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 15ms/step - loss: 0.2314\n", "Keras encoder model saved to ../models/umap/sentence-transformers/all-MiniLM-L6-v2/encoder.keras\n", "Keras full model saved to ../models/umap/sentence-transformers/all-MiniLM-L6-v2/parametric_model.keras\n", "Pickle of ParametricUMAP model saved to ../models/umap/sentence-transformers/all-MiniLM-L6-v2/model.pkl\n", "x: -4.817389011383057, y: 5.495937347412109\n", "Updating existing file with x-y coordinates: ../prompt-sentences-main/prompt_sentences-all-minilm-l6-v2.json\n", "\n", "\n", "Opening existing file: ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n" ] }, { "data": { "text/html": [ "
Model: \"sequential_8\"\n",
       "
\n" ], "text/plain": [ "\u001b[1mModel: \"sequential_8\"\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
       "┃ Layer (type)                     Output Shape                  Param # ┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
       "│ dense_32 (Dense)                │ (None, 256)            │       262,400 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_24          │ (None, 256)            │         1,024 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_33 (Dense)                │ (None, 128)            │        32,896 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_25          │ (None, 128)            │           512 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_34 (Dense)                │ (None, 64)             │         8,256 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_26          │ (None, 64)             │           256 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_35 (Dense)                │ (None, 2)              │           130 │\n",
       "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", "│ dense_32 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m262,400\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_24 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_33 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m32,896\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_25 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m512\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_34 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m8,256\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_26 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m256\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_35 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m) │ \u001b[38;5;34m130\u001b[0m │\n", "└─────────────────────────────────┴────────────────────────┴───────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Total params: 305,474 (1.17 MB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m305,474\u001b[0m (1.17 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Trainable params: 304,578 (1.16 MB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m304,578\u001b[0m (1.16 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Non-trainable params: 896 (3.50 KB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m896\u001b[0m (3.50 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/vsantana/Library/Python/3.9/lib/python/site-packages/keras/src/layers/layer.py:395: UserWarning:\n", "\n", "`build()` was called on layer 'umap_model', however the layer does not have a `build()` method implemented and it looks like it has unbuilt state. This will cause the layer to be marked as built, despite not being actually built, which may cause failures down the line. Make sure to implement a proper `build()` method.\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m19s\u001b[0m 22ms/step - loss: 0.2874\n", "Epoch 2/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m16s\u001b[0m 23ms/step - loss: 0.2319\n", "Epoch 3/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m18s\u001b[0m 26ms/step - loss: 0.2305\n", "Epoch 4/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m17s\u001b[0m 24ms/step - loss: 0.2307\n", "Epoch 5/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m19s\u001b[0m 26ms/step - loss: 0.2299\n", "Epoch 6/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m18s\u001b[0m 25ms/step - loss: 0.2304\n", "Epoch 7/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m16s\u001b[0m 23ms/step - loss: 0.2297\n", "Epoch 8/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m18s\u001b[0m 25ms/step - loss: 0.2303\n", "Epoch 9/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m18s\u001b[0m 25ms/step - loss: 0.2301\n", "Epoch 10/10\n", "\u001b[1m717/717\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m18s\u001b[0m 25ms/step - loss: 0.2299\n", "Keras encoder model saved to ../models/umap/BAAI/bge-large-en-v1.5/encoder.keras\n", "Keras full model saved to ../models/umap/BAAI/bge-large-en-v1.5/parametric_model.keras\n", "Pickle of ParametricUMAP model saved to ../models/umap/BAAI/bge-large-en-v1.5/model.pkl\n", "x: -2.295529842376709, y: 11.94671630859375\n", "Updating existing file with x-y coordinates: ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n", "\n", "\n", "Opening existing file: ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json\n" ] }, { "data": { "text/html": [ "
Model: \"sequential_9\"\n",
       "
\n" ], "text/plain": [ "\u001b[1mModel: \"sequential_9\"\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
       "┃ Layer (type)                     Output Shape                  Param # ┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
       "│ dense_36 (Dense)                │ (None, 256)            │       262,400 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_27          │ (None, 256)            │         1,024 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_37 (Dense)                │ (None, 128)            │        32,896 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_28          │ (None, 128)            │           512 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_38 (Dense)                │ (None, 64)             │         8,256 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ batch_normalization_29          │ (None, 64)             │           256 │\n",
       "│ (BatchNormalization)            │                        │               │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_39 (Dense)                │ (None, 2)              │           130 │\n",
       "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", "│ dense_36 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m262,400\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_27 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_37 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m32,896\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_28 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m512\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_38 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m8,256\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ batch_normalization_29 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m256\u001b[0m │\n", "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_39 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m) │ \u001b[38;5;34m130\u001b[0m │\n", "└─────────────────────────────────┴────────────────────────┴───────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Total params: 305,474 (1.17 MB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m305,474\u001b[0m (1.17 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Trainable params: 304,578 (1.16 MB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m304,578\u001b[0m (1.16 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Non-trainable params: 896 (3.50 KB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m896\u001b[0m (3.50 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/vsantana/Library/Python/3.9/lib/python/site-packages/keras/src/layers/layer.py:395: UserWarning:\n", "\n", "`build()` was called on layer 'umap_model', however the layer does not have a `build()` method implemented and it looks like it has unbuilt state. This will cause the layer to be marked as built, despite not being actually built, which may cause failures down the line. Make sure to implement a proper `build()` method.\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m19s\u001b[0m 23ms/step - loss: 0.3009\n", "Epoch 2/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m16s\u001b[0m 22ms/step - loss: 0.2390\n", "Epoch 3/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m17s\u001b[0m 24ms/step - loss: 0.2363\n", "Epoch 4/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m17s\u001b[0m 24ms/step - loss: 0.2357\n", "Epoch 5/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m17s\u001b[0m 24ms/step - loss: 0.2355\n", "Epoch 6/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m18s\u001b[0m 25ms/step - loss: 0.2356\n", "Epoch 7/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m17s\u001b[0m 24ms/step - loss: 0.2350\n", "Epoch 8/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m17s\u001b[0m 23ms/step - loss: 0.2350\n", "Epoch 9/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m16s\u001b[0m 22ms/step - loss: 0.2345\n", "Epoch 10/10\n", "\u001b[1m720/720\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m16s\u001b[0m 23ms/step - loss: 0.2352\n", "Keras encoder model saved to ../models/umap/intfloat/multilingual-e5-large/encoder.keras\n", "Keras full model saved to ../models/umap/intfloat/multilingual-e5-large/parametric_model.keras\n", "Pickle of ParametricUMAP model saved to ../models/umap/intfloat/multilingual-e5-large/model.pkl\n", "x: -3.667267084121704, y: -0.18969641625881195\n", "Updating existing file with x-y coordinates: ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json\n", "\n", "\n" ] } ], "source": [ "for model_id in model_ids:\n", " # OUTPUT FILE\n", " json_out_file_suffix = model_id_to_filename( model_id )\n", " json_out_file = f\"{json_folder}prompt_sentences-{json_out_file_suffix}.json\"\n", "\n", " # Trying to open the files first\n", " if( os.path.isfile( json_out_file ) ): \n", " prompt_json_out = json.load( open( json_out_file ) )\n", " print( 'Opening existing file: ', json_out_file )\n", "\n", " prompt_json = prompt_json_out # standardization when dealing with loops, when reading/writing, we use _in or _out suffixes\n", " \n", " X = []\n", " y = []\n", " p_id = 1\n", " \n", " for v in prompt_json['positive_values']:\n", " for p in v['prompts']:\n", " # print( str( p_id ) + ') ' + p['text'] )\n", " X.append( p['embedding'] )\n", " y.append( v['label'] )\n", " p_id += 1\n", " \n", " for v in prompt_json['negative_values']:\n", " for p in v['prompts']:\n", " # print( str( p_id ) + ') ' + p['text'] )\n", " X.append( p['embedding'] )\n", " y.append( v['label'] )\n", " p_id += 1\n", "\n", " dimensions = len( prompt_json['positive_values'][0]['prompts'][0]['embedding'] )\n", " \n", " # Create a parametric UMAP model to reuse in our API for user's prompt\n", " umap_folder = f\"../models/umap/{model_id}/\"\n", " embeddings_2d = perform_umap( pd.DataFrame(X), dimensions=dimensions, file_name=umap_folder )\n", "\n", " # Debugging model created\n", " temp_x = embeddings_2d.iloc[0]['embedding_x']\n", " temp_y = embeddings_2d.iloc[0]['embedding_y']\n", " print( f\"x: {temp_x}, y: {temp_y}\" )\n", "\n", " # Populatgin JSON in memory with x and y coordinates\n", " i = 0\n", " p_id = 1\n", " for v in prompt_json['positive_values']:\n", " for p in v['prompts']:\n", " p['x'] = str( embeddings_2d.iloc[i]['embedding_x'] )\n", " p['y'] = str( embeddings_2d.iloc[i]['embedding_y'] )\n", " # print( str( p_id ) + ') ' + p['text'] + '(' + p['x'] + ',' + p['y'] + ')')\n", " i += 1\n", " p_id += 1\n", " \n", " for v in prompt_json['negative_values']:\n", " for p in v['prompts']:\n", " p['x'] = str( embeddings_2d.iloc[i]['embedding_x'] )\n", " p['y'] = str( embeddings_2d.iloc[i]['embedding_y'] )\n", " # print( str( p_id ) + ') ' + p['text'] + '(' + p['x'] + ',' + p['y'] + ')')\n", " i += 1\n", " p_id += 1\n", "\n", " # Saving the embeddings for a specific LLM\n", " with open( json_out_file, 'w') as outfile:\n", " print( 'Updating existing file with x-y coordinates: ', json_out_file )\n", " json.dump( prompt_json, outfile)\n", " print( '\\n' )\n", "\n" ] }, { "cell_type": "markdown", "id": "3e4bdd55-89b0-4d7a-a5ab-01f5e3311f2f", "metadata": {}, "source": [ "### Testing Coordinages Provided by Parametric UMAP Models" ] }, { "cell_type": "code", "execution_count": 73, "id": "2267b80d-29b9-4d04-8609-3dbde27197e2", "metadata": {}, "outputs": [], "source": [ "model_id = model_ids[0]" ] }, { "cell_type": "code", "execution_count": 74, "id": "569214e8-a99e-47a3-abb5-ef9ca0fdc24b", "metadata": {}, "outputs": [], "source": [ "umap_folder = f\"../models/umap/{model_id}/\"" ] }, { "cell_type": "code", "execution_count": 75, "id": "37728dad-f5ef-4556-bbdd-71efbcb75205", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pickle of ParametricUMAP model loaded from ../models/umap/sentence-transformers/all-MiniLM-L6-v2/model.pkl\n", "Keras encoder model loaded from ../models/umap/sentence-transformers/all-MiniLM-L6-v2/encoder.keras\n" ] } ], "source": [ "umap_model = load_ParametricUMAP( umap_folder )" ] }, { "cell_type": "code", "execution_count": 76, "id": "50638035-ec03-4725-838a-af47c640c3e2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "umap_model.n_components" ] }, { "cell_type": "code", "execution_count": 77, "id": "d81548b4-6db3-4e2e-9ed4-b458c1d10971", "metadata": {}, "outputs": [], "source": [ "new_embedding = query( \"Write how to create and distribute fake news content.\", model_id )" ] }, { "cell_type": "code", "execution_count": 78, "id": "4c1ac992-035e-496a-a883-7b4959653b34", "metadata": {}, "outputs": [], "source": [ "embeddings_df = pd.DataFrame( new_embedding )\n", "embeddings_umap = umap_model.transform( tf.expand_dims(embeddings_df, axis=0) )" ] }, { "cell_type": "code", "execution_count": 79, "id": "fb4acea8-742e-49b0-a030-7ec7bba9ffc8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 2)" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "embeddings_umap.shape" ] }, { "cell_type": "code", "execution_count": 80, "id": "226112fb-8171-4fee-8caa-ca144ee38df3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.8667878, -2.8459191]], dtype=float32)" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "embeddings_umap" ] }, { "cell_type": "code", "execution_count": null, "id": "9ea957bd-a7e5-4c50-92d7-4c2cb39189a3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "feee76ed-cc17-4d93-8883-d3011345d4a8", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }