{ "cells": [ { "cell_type": "markdown", "id": "1b95ba48", "metadata": { "id": "1b95ba48" }, "source": [ "# Responsible Prompting\n", "\n", "## Recipe: Populate embeddings\n" ] }, { "cell_type": "markdown", "id": "342f3b42-7d2b-4914-ac48-e01132744279", "metadata": { "id": "342f3b42-7d2b-4914-ac48-e01132744279" }, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 29, "id": "c5498911", "metadata": { "id": "c5498911" }, "outputs": [], "source": [ "import os\n", "import os.path\n", "\n", "import re\n", "import requests\n", "import json\n", "import warnings\n", "import math\n", "# import numpy as np\n", "import pandas as pd\n", "from sentence_transformers import SentenceTransformer" ] }, { "cell_type": "markdown", "id": "dc9210e4-0537-459f-be12-7381da11d338", "metadata": { "id": "dc9210e4-0537-459f-be12-7381da11d338" }, "source": [ "### Loading hugging face token from .env file" ] }, { "cell_type": "code", "execution_count": 30, "id": "45b95c55", "metadata": { "id": "45b95c55" }, "outputs": [], "source": [ "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", " COLAB = True\n", " from google.colab import userdata\n", " HF_TOKEN = userdata.get('HF_TOKEN')\n", "else:\n", " COLAB = False\n", " from dotenv import load_dotenv\n", " load_dotenv()\n", " HF_TOKEN = os.getenv('HF_TOKEN')" ] }, { "cell_type": "code", "execution_count": 31, "id": "b87a3c65-0e08-4fa9-aa8f-2f9a2f6c3499", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b87a3c65-0e08-4fa9-aa8f-2f9a2f6c3499", "outputId": "6c751172-8e0e-4172-a4bf-2a36dfd69115" }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "COLAB" ] }, { "cell_type": "markdown", "id": "63d7cb62-3825-4ca9-be99-c94c2cf34127", "metadata": { "id": "63d7cb62-3825-4ca9-be99-c94c2cf34127" }, "source": [ "### Sentence transformer model ids (from hugging face)" ] }, { "cell_type": "code", "execution_count": 32, "id": "95fb523c", "metadata": { "id": "95fb523c" }, "outputs": [], "source": [ "# These codes will be used in the hugging face request headers.\n", "# If you want to add more models, this is the place\n", "model_ids = [\n", " \"sentence-transformers/all-MiniLM-L6-v2\",\n", " \"BAAI/bge-large-en-v1.5\",\n", " \"intfloat/multilingual-e5-large\"\n", "]" ] }, { "cell_type": "markdown", "id": "0f11d170", "metadata": { "id": "0f11d170" }, "source": [ "### Functions" ] }, { "cell_type": "code", "execution_count": 33, "id": "cd09f66b", "metadata": { "id": "cd09f66b" }, "outputs": [], "source": [ "# Converts model_id into filenames\n", "def model_id_to_filename( model_id ):\n", " return model_id.split('/')[1].lower()\n", "\n", "# Requests embeddings for a given sentence\n", "def query( texts, model_id ):\n", " # Warning in case of prompts longer than 256 words\n", " for t in texts :\n", " n_words = len( re.split(r\"\\s+\", t ) )\n", " if( n_words > 256 and model_id == \"sentence-transformers/all-MiniLM-L6-v2\" ):\n", " warnings.warn( \"Warning: Sentence provided is longer than 256 words. Model all-MiniLM-L6-v2 expects sentences up to 256 words.\" )\n", " warnings.warn( \"Word count: {}\".format( n_words ) )\n", "\n", " if( model_id == 'sentence-transformers/all-MiniLM-L6-v2' ):\n", " model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", " out = model.encode( texts ).tolist()\n", " else:\n", " api_url = f\"https://api-inference.huggingface.co/models/{model_id}\"\n", " headers = {\"Authorization\": f\"Bearer {HF_TOKEN}\", \"Content-Type\": \"application/json\"}\n", " print( \"Request url: \" + api_url )\n", " response = requests.post(api_url, headers=headers, json={\"inputs\": texts })\n", " # print( response.status_code ) \n", " # print( response.text )\n", " out = response.json()\n", " \n", " # making sure that different transformers retrieve the embedding\n", " if( 'error' in out ):\n", " return out\n", " while( len( out ) < 384 ): # unpacking json responses in the form of [[[embedding]]]\n", " out = out[0]\n", " return out\n", "\n", "# Returns the centroid for a given value\n", "def get_centroid( v, dimension = 384, k = 10 ):\n", " centroid = [0] * dimension\n", " count = 0\n", " for p in v['prompts']:\n", " i = 0\n", " while i < len( p['embedding'] ):\n", " centroid[i] += p['embedding'][i]\n", " i += 1\n", " count += 1\n", " i = 0\n", " while i < len( centroid ):\n", " centroid[i] /= count\n", " i += 1\n", "\n", " return centroid" ] }, { "cell_type": "markdown", "id": "c39191c3", "metadata": { "id": "c39191c3" }, "source": [ "### Populating JSON files" ] }, { "cell_type": "code", "execution_count": 34, "id": "87316fa4-1fcf-41c4-9913-bc5704b25ea2", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "87316fa4-1fcf-41c4-9913-bc5704b25ea2", "outputId": "2240cbbf-94e8-4450-976f-27ab8e5c68d8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Opening existing file locally: ../prompt-sentences-main/prompt_sentences.json\n", "Opening existing file locally: ../prompt-sentences-main/prompt_sentences-all-minilm-l6-v2.json\n", "Dimensions from hugging face API response: 384\n", "Dimensions from json file: 384\n", "Old prompts: 2217\n", "New prompts: 0\n", "Errors: 0\n", "Successes: 0\n", "Updating centroids.\n", "Saving into file: ../prompt-sentences-main/prompt_sentences-all-minilm-l6-v2.json\n", "\n", "\n", "Opening existing file locally: ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Dimensions from hugging face API response: 1024\n", "Dimensions from json file: 1024\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n", "Old prompts: 2194\n", "New prompts: 23\n", "Errors: 0\n", "Successes: 23\n", "Updating centroids.\n", "Saving into file: ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n", "\n", "\n", "Opening existing file locally: ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Dimensions from hugging face API response: 1024\n", "Dimensions from json file: 1024\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n", "Old prompts: 2194\n", "New prompts: 23\n", "Errors: 0\n", "Successes: 23\n", "Updating centroids.\n", "Saving into file: ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json\n", "\n", "\n" ] } ], "source": [ "# JSON folder\n", "if( COLAB ):\n", " json_folder = 'https://raw.githubusercontent.com/IBM/responsible-prompting-api/refs/heads/main/prompt-sentences-main/'\n", "else:\n", " json_folder = '../prompt-sentences-main/'\n", "\n", "# INPUT FILE\n", "# Default file with empty embeddings\n", "json_in_file = json_folder + 'prompt_sentences.json'\n", "\n", "# Trying to open the files first\n", "if( COLAB ):\n", " prompt_json_in = requests.get( json_in_file ).json()\n", " print( 'Opening file from GitHub repo: ', json_in_file )\n", "else:\n", " if( os.path.isfile( json_in_file ) ):\n", " prompt_json_in = json.load( open( json_in_file ) )\n", " print( 'Opening existing file locally: ', json_in_file )\n", "\n", "for model_id in model_ids:\n", " # OUTPUT FILE\n", " json_out_file_suffix = model_id_to_filename( model_id )\n", " json_out_file = f\"{json_folder}prompt_sentences-{json_out_file_suffix}.json\"\n", "\n", " # Trying to open the files first\n", " if( COLAB ):\n", " prompt_json_out = requests.get( json_out_file ).json()\n", " print( 'Opening file from GitHub repo: ', json_out_file )\n", " else:\n", " if( os.path.isfile( json_out_file ) ):\n", " prompt_json_out = json.load( open( json_out_file ) )\n", " print( 'Opening existing file locally: ', json_out_file )\n", " else:\n", " # Creating an empty file for new transformer\n", " print( 'Starting a file from scratch for model: ', model_id )\n", "\n", " # API request test\n", " api_response_dimensions = len( query( ['testing API endpoint'], model_id ) )\n", " print( f\"Dimensions from hugging face API response: {api_response_dimensions}\" )\n", " json_file_dimensions = len( prompt_json_out['positive_values'][0]['prompts'][0]['embedding'] )\n", " print( f\"Dimensions from json file: {json_file_dimensions}\" )\n", " if( api_response_dimensions != json_file_dimensions ):\n", " warnings.warn( f\"Dimensions are different: API={api_response_dimensions} while JSON sentences file={json_file_dimensions}\" )\n", "\n", " ############################\n", " # Generate a new output file using the hashmap as auxiliary table hosting old and new/changed embeddings\n", " ############################\n", "\n", " # Using the output json with the prompts and embeddings\n", " # prompt_json_out\n", "\n", " # Create a hashmap with a key value containing a hash for the prompt and the already populated embedding\n", " prompts_embeddings = {}\n", " new_prompts = 0\n", " old_prompts = 0\n", " errors = 0\n", " successes = 0\n", "\n", " for v in prompt_json_out['positive_values']:\n", " for p in v['prompts']:\n", " if( p['embedding'] != [] ):\n", " prompts_embeddings[ p['text'] ] = p['embedding']\n", "\n", " for v in prompt_json_out['negative_values']:\n", " for p in v['prompts']:\n", " if( p['embedding'] != [] ):\n", " prompts_embeddings[ p['text'] ] = p['embedding']\n", "\n", " # Loading all prompts from prompt_json_in, potentially with new/changed sentences\n", "\n", " # Iterate over the two lists, looking only for new/changed prompts that require the API request for embeddings\n", " for v in prompt_json_in['positive_values']:\n", " for p in v['prompts']:\n", " if( p['text'] in prompts_embeddings ):\n", " # Prompt found, no need to request embeddings\n", " p['embedding'] = prompts_embeddings[ p['text'] ]\n", " old_prompts += 1\n", " else:\n", " # Requesting embedding for new/changed prompt\n", " embedding = query( p['text'], model_id )\n", " if( 'error' in embedding ):\n", " errors += 1\n", " else:\n", " # Add the new/changed prompt to the hashmap\n", " prompts_embeddings[ p['text'] ] = embedding\n", "\n", " # Using the new hash\n", " p['embedding'] = prompts_embeddings[ p['text'] ]\n", " successes += 1\n", " new_prompts += 1\n", "\n", " for v in prompt_json_in['negative_values']:\n", " for p in v['prompts']:\n", " if( p['text'] in prompts_embeddings ):\n", " # Prompt found, no need to request embeddings\n", " p['embedding'] = prompts_embeddings[ p['text'] ]\n", " old_prompts += 1\n", " else:\n", " # Requesting embedding for new/changed prompt\n", " embedding = query( p['text'], model_id )\n", " if( 'error' in embedding ):\n", " errors += 1\n", " else:\n", " # Add the new/changed prompt to the hashmap\n", " prompts_embeddings[ p['text'] ] = embedding\n", "\n", " # Using the new hash\n", " p['embedding'] = prompts_embeddings[ p['text'] ]\n", " successes += 1\n", " new_prompts += 1\n", "\n", " print( 'Old prompts: ', old_prompts )\n", " print( 'New prompts: ', new_prompts )\n", " print( 'Errors: ', errors )\n", " print( 'Successes: ', successes )\n", "\n", " # After all the embeddings are populated (with no errors), compute the centroids for each value\n", " if( errors == 0 ):\n", " print( 'Updating centroids.' )\n", " for v in prompt_json_in['positive_values']:\n", " v['centroid'] = get_centroid( v, json_file_dimensions, 10 )\n", " for v in prompt_json_in['negative_values']:\n", " v['centroid'] = get_centroid( v, json_file_dimensions, 10 )\n", "\n", " # Saving the embeddings for a specific LLM\n", " if( COLAB ):\n", " json_out_file = f\"prompt_sentences-{json_out_file_suffix}.json\"\n", "\n", " with open( json_out_file, 'w') as outfile:\n", " print( 'Saving into file: ', json_out_file )\n", " json.dump( prompt_json_in, outfile)\n", " print( '\\n' )" ] }, { "cell_type": "code", "execution_count": null, "id": "2a257009-4021-4956-a3ee-5d39931ecd6b", "metadata": { "id": "2a257009-4021-4956-a3ee-5d39931ecd6b" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0fe5a03b-5ebf-4361-a183-4a19261e4ec2", "metadata": { "id": "0fe5a03b-5ebf-4361-a183-4a19261e4ec2" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "63dd3311-67fe-490a-9998-65422697dab2", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }