{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1b95ba48",
   "metadata": {
    "id": "1b95ba48"
   },
   "source": [
    "# Responsible Prompting\n",
    "\n",
    "## Recipe: Populate embeddings\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "342f3b42-7d2b-4914-ac48-e01132744279",
   "metadata": {
    "id": "342f3b42-7d2b-4914-ac48-e01132744279"
   },
   "source": [
    "### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "c5498911",
   "metadata": {
    "id": "c5498911"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import os.path\n",
    "\n",
    "import re\n",
    "import requests\n",
    "import json\n",
    "import warnings\n",
    "import math\n",
    "# import numpy as np\n",
    "import pandas as pd\n",
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc9210e4-0537-459f-be12-7381da11d338",
   "metadata": {
    "id": "dc9210e4-0537-459f-be12-7381da11d338"
   },
   "source": [
    "### Loading hugging face token from .env file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "45b95c55",
   "metadata": {
    "id": "45b95c55"
   },
   "outputs": [],
   "source": [
    "if os.getenv(\"COLAB_RELEASE_TAG\"):\n",
    "    COLAB = True\n",
    "    from google.colab import userdata\n",
    "    HF_TOKEN = userdata.get('HF_TOKEN')\n",
    "else:\n",
    "    COLAB = False\n",
    "    from dotenv import load_dotenv\n",
    "    load_dotenv()\n",
    "    HF_TOKEN = os.getenv('HF_TOKEN')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "b87a3c65-0e08-4fa9-aa8f-2f9a2f6c3499",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b87a3c65-0e08-4fa9-aa8f-2f9a2f6c3499",
    "outputId": "6c751172-8e0e-4172-a4bf-2a36dfd69115"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "COLAB"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "63d7cb62-3825-4ca9-be99-c94c2cf34127",
   "metadata": {
    "id": "63d7cb62-3825-4ca9-be99-c94c2cf34127"
   },
   "source": [
    "### Sentence transformer model ids (from hugging face)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "95fb523c",
   "metadata": {
    "id": "95fb523c"
   },
   "outputs": [],
   "source": [
    "# These codes will be used in the hugging face request headers.\n",
    "# If you want to add more models, this is the place\n",
    "model_ids = [\n",
    "    \"sentence-transformers/all-MiniLM-L6-v2\",\n",
    "    \"BAAI/bge-large-en-v1.5\",\n",
    "    \"intfloat/multilingual-e5-large\"\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f11d170",
   "metadata": {
    "id": "0f11d170"
   },
   "source": [
    "### Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "cd09f66b",
   "metadata": {
    "id": "cd09f66b"
   },
   "outputs": [],
   "source": [
    "# Converts model_id into filenames\n",
    "def model_id_to_filename( model_id ):\n",
    "    return model_id.split('/')[1].lower()\n",
    "\n",
    "# Requests embeddings for a given sentence\n",
    "def query( texts, model_id ):\n",
    "    # Warning in case of prompts longer than 256 words\n",
    "    for t in texts :\n",
    "        n_words = len( re.split(r\"\\s+\", t ) )\n",
    "        if( n_words > 256 and model_id == \"sentence-transformers/all-MiniLM-L6-v2\" ):\n",
    "            warnings.warn( \"Warning: Sentence provided is longer than 256 words. Model all-MiniLM-L6-v2 expects sentences up to 256 words.\" )\n",
    "            warnings.warn( \"Word count: {}\".format( n_words ) )\n",
    "\n",
    "    if( model_id == 'sentence-transformers/all-MiniLM-L6-v2' ):\n",
    "        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
    "        out = model.encode( texts ).tolist()\n",
    "    else:\n",
    "        api_url = f\"https://api-inference.huggingface.co/models/{model_id}\"\n",
    "        headers = {\"Authorization\": f\"Bearer {HF_TOKEN}\", \"Content-Type\": \"application/json\"}\n",
    "        print( \"Request url: \" + api_url )\n",
    "        response = requests.post(api_url, headers=headers, json={\"inputs\": texts })\n",
    "        # print( response.status_code ) \n",
    "        # print( response.text )\n",
    "        out = response.json()\n",
    "        \n",
    "    # making sure that different transformers retrieve the embedding\n",
    "    if( 'error' in out ):\n",
    "        return out\n",
    "    while( len( out ) < 384 ): # unpacking json responses in the form of [[[embedding]]]\n",
    "        out = out[0]\n",
    "    return out\n",
    "\n",
    "# Returns the centroid for a given value\n",
    "def get_centroid( v, dimension = 384, k = 10 ):\n",
    "    centroid = [0] * dimension\n",
    "    count = 0\n",
    "    for p in v['prompts']:\n",
    "        i = 0\n",
    "        while i < len( p['embedding'] ):\n",
    "            centroid[i] += p['embedding'][i]\n",
    "            i += 1\n",
    "        count += 1\n",
    "    i = 0\n",
    "    while i < len( centroid ):\n",
    "        centroid[i] /= count\n",
    "        i += 1\n",
    "\n",
    "    return centroid"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c39191c3",
   "metadata": {
    "id": "c39191c3"
   },
   "source": [
    "### Populating JSON files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "87316fa4-1fcf-41c4-9913-bc5704b25ea2",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "87316fa4-1fcf-41c4-9913-bc5704b25ea2",
    "outputId": "2240cbbf-94e8-4450-976f-27ab8e5c68d8"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Opening existing file locally:  ../prompt-sentences-main/prompt_sentences.json\n",
      "Opening existing file locally:  ../prompt-sentences-main/prompt_sentences-all-minilm-l6-v2.json\n",
      "Dimensions from hugging face API response: 384\n",
      "Dimensions from json file: 384\n",
      "Old prompts:  2217\n",
      "New prompts:  0\n",
      "Errors:  0\n",
      "Successes:  0\n",
      "Updating centroids.\n",
      "Saving into file:  ../prompt-sentences-main/prompt_sentences-all-minilm-l6-v2.json\n",
      "\n",
      "\n",
      "Opening existing file locally:  ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Dimensions from hugging face API response: 1024\n",
      "Dimensions from json file: 1024\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5\n",
      "Old prompts:  2194\n",
      "New prompts:  23\n",
      "Errors:  0\n",
      "Successes:  23\n",
      "Updating centroids.\n",
      "Saving into file:  ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n",
      "\n",
      "\n",
      "Opening existing file locally:  ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Dimensions from hugging face API response: 1024\n",
      "Dimensions from json file: 1024\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large\n",
      "Old prompts:  2194\n",
      "New prompts:  23\n",
      "Errors:  0\n",
      "Successes:  23\n",
      "Updating centroids.\n",
      "Saving into file:  ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# JSON folder\n",
    "if( COLAB ):\n",
    "    json_folder = 'https://raw.githubusercontent.com/IBM/responsible-prompting-api/refs/heads/main/prompt-sentences-main/'\n",
    "else:\n",
    "    json_folder = '../prompt-sentences-main/'\n",
    "\n",
    "# INPUT FILE\n",
    "# Default file with empty embeddings\n",
    "json_in_file = json_folder + 'prompt_sentences.json'\n",
    "\n",
    "# Trying to open the files first\n",
    "if( COLAB ):\n",
    "    prompt_json_in = requests.get( json_in_file ).json()\n",
    "    print( 'Opening file from GitHub repo: ', json_in_file )\n",
    "else:\n",
    "    if( os.path.isfile( json_in_file ) ):\n",
    "        prompt_json_in = json.load( open( json_in_file ) )\n",
    "        print( 'Opening existing file locally: ', json_in_file )\n",
    "\n",
    "for model_id in model_ids:\n",
    "    # OUTPUT FILE\n",
    "    json_out_file_suffix = model_id_to_filename( model_id )\n",
    "    json_out_file = f\"{json_folder}prompt_sentences-{json_out_file_suffix}.json\"\n",
    "\n",
    "    # Trying to open the files first\n",
    "    if( COLAB ):\n",
    "        prompt_json_out = requests.get( json_out_file ).json()\n",
    "        print( 'Opening file from GitHub repo: ', json_out_file )\n",
    "    else:\n",
    "        if( os.path.isfile( json_out_file ) ):\n",
    "            prompt_json_out = json.load( open( json_out_file ) )\n",
    "            print( 'Opening existing file locally: ', json_out_file )\n",
    "        else:\n",
    "            # Creating an empty file for new transformer\n",
    "            print( 'Starting a file from scratch for model: ', model_id )\n",
    "\n",
    "    # API request test\n",
    "    api_response_dimensions = len( query( ['testing API endpoint'], model_id ) )\n",
    "    print( f\"Dimensions from hugging face API response: {api_response_dimensions}\" )\n",
    "    json_file_dimensions = len( prompt_json_out['positive_values'][0]['prompts'][0]['embedding'] )\n",
    "    print( f\"Dimensions from json file: {json_file_dimensions}\" )\n",
    "    if( api_response_dimensions != json_file_dimensions ):\n",
    "        warnings.warn( f\"Dimensions are different: API={api_response_dimensions} while JSON sentences file={json_file_dimensions}\" )\n",
    "\n",
    "    ############################\n",
    "    # Generate a new output file using the hashmap as auxiliary table hosting old and new/changed embeddings\n",
    "    ############################\n",
    "\n",
    "    # Using the output json with the prompts and embeddings\n",
    "    # prompt_json_out\n",
    "\n",
    "    # Create a hashmap with a key value containing a hash for the prompt and the already populated embedding\n",
    "    prompts_embeddings = {}\n",
    "    new_prompts = 0\n",
    "    old_prompts = 0\n",
    "    errors = 0\n",
    "    successes = 0\n",
    "\n",
    "    for v in prompt_json_out['positive_values']:\n",
    "        for p in v['prompts']:\n",
    "            if( p['embedding'] != [] ):\n",
    "                prompts_embeddings[ p['text'] ] = p['embedding']\n",
    "\n",
    "    for v in prompt_json_out['negative_values']:\n",
    "        for p in v['prompts']:\n",
    "            if( p['embedding'] != [] ):\n",
    "                prompts_embeddings[ p['text'] ] = p['embedding']\n",
    "\n",
    "    # Loading all prompts from prompt_json_in, potentially with new/changed sentences\n",
    "\n",
    "    # Iterate over the two lists, looking only for new/changed prompts that require the API request for embeddings\n",
    "    for v in prompt_json_in['positive_values']:\n",
    "        for p in v['prompts']:\n",
    "            if( p['text'] in prompts_embeddings ):\n",
    "                # Prompt found, no need to request embeddings\n",
    "                p['embedding'] = prompts_embeddings[ p['text'] ]\n",
    "                old_prompts += 1\n",
    "            else:\n",
    "                # Requesting embedding for new/changed prompt\n",
    "                embedding = query( p['text'], model_id )\n",
    "                if( 'error' in embedding ):\n",
    "                    errors += 1\n",
    "                else:\n",
    "                    # Add the new/changed prompt to the hashmap\n",
    "                    prompts_embeddings[ p['text'] ] = embedding\n",
    "\n",
    "                    # Using the new hash\n",
    "                    p['embedding'] = prompts_embeddings[ p['text'] ]\n",
    "                    successes += 1\n",
    "                new_prompts += 1\n",
    "\n",
    "    for v in prompt_json_in['negative_values']:\n",
    "        for p in v['prompts']:\n",
    "            if( p['text'] in prompts_embeddings ):\n",
    "                # Prompt found, no need to request embeddings\n",
    "                p['embedding'] = prompts_embeddings[ p['text'] ]\n",
    "                old_prompts += 1\n",
    "            else:\n",
    "                # Requesting embedding for new/changed prompt\n",
    "                embedding = query( p['text'], model_id )\n",
    "                if( 'error' in embedding ):\n",
    "                    errors += 1\n",
    "                else:\n",
    "                    # Add the new/changed prompt to the hashmap\n",
    "                    prompts_embeddings[ p['text'] ] = embedding\n",
    "\n",
    "                    # Using the new hash\n",
    "                    p['embedding'] = prompts_embeddings[ p['text'] ]\n",
    "                    successes += 1\n",
    "                new_prompts += 1\n",
    "\n",
    "    print( 'Old prompts: ', old_prompts )\n",
    "    print( 'New prompts: ', new_prompts )\n",
    "    print( 'Errors: ', errors )\n",
    "    print( 'Successes: ', successes )\n",
    "\n",
    "    # After all the embeddings are populated (with no errors), compute the centroids for each value\n",
    "    if( errors == 0 ):\n",
    "        print( 'Updating centroids.' )\n",
    "        for v in prompt_json_in['positive_values']:\n",
    "            v['centroid'] = get_centroid( v, json_file_dimensions, 10 )\n",
    "        for v in prompt_json_in['negative_values']:\n",
    "            v['centroid'] = get_centroid( v, json_file_dimensions, 10 )\n",
    "\n",
    "    # Saving the embeddings for a specific LLM\n",
    "    if( COLAB ):\n",
    "      json_out_file = f\"prompt_sentences-{json_out_file_suffix}.json\"\n",
    "\n",
    "    with open( json_out_file, 'w') as outfile:\n",
    "        print( 'Saving into file: ', json_out_file )\n",
    "        json.dump( prompt_json_in, outfile)\n",
    "        print( '\\n' )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a257009-4021-4956-a3ee-5d39931ecd6b",
   "metadata": {
    "id": "2a257009-4021-4956-a3ee-5d39931ecd6b"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fe5a03b-5ebf-4361-a183-4a19261e4ec2",
   "metadata": {
    "id": "0fe5a03b-5ebf-4361-a183-4a19261e4ec2"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63dd3311-67fe-490a-9998-65422697dab2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}