File size: 2,285 Bytes
8a0c27f
b652e4e
8a0c27f
 
 
 
b652e4e
8a0c27f
b652e4e
 
 
8a0c27f
 
b652e4e
8a0c27f
 
b652e4e
 
 
 
8a0c27f
 
 
 
 
 
 
b652e4e
 
 
 
 
 
 
8a0c27f
 
 
 
 
b652e4e
 
 
8a0c27f
b652e4e
 
 
 
 
 
 
 
 
 
8a0c27f
b652e4e
8a0c27f
b652e4e
 
 
8a0c27f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from typing import Sequence

import numpy as np
from sentence_transformers import SentenceTransformer


class Vectorizer:
    """
    Turn textual prompts into numerical vectors and compare them.

    The Vectorizer's role is to embed prompts with a pre-trained sentence-embedding model so
    that the similarity between different prompts can be quantified in a high-dimensional space.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        """
        Initialize the vectorizer with a pre-trained embedding model.

        Args:
            model_name (str): Name of the pre-trained model handed to ``SentenceTransformer``.
        """

        self.model = SentenceTransformer(model_name)

    def transform(self, prompts: Sequence[str]) -> np.ndarray:
        """
        Transform texts into numerical vectors using the configured model.

        Args:
            prompts (Sequence[str]): The sequence of raw corpus prompts to be transformed.
                A bare string is treated as a single prompt.

        Returns:
            np.ndarray: The result of ``self.model.encode`` on the list of prompts. Each row
                corresponds to the vector representation of a prompt.
        """

        # A str is itself a Sequence[str]; list("abc") would split it into characters,
        # so wrap a lone prompt as a one-element batch instead.
        if isinstance(prompts, str):
            prompts = [prompts]

        return self.model.encode(list(prompts))

    @staticmethod
    def cosine_similarity(
        query_vector: np.ndarray, corpus_vectors: np.ndarray
    ) -> np.ndarray:
        """
        Calculate cosine similarity between a query vector and a set of corpus vectors.

        Zero-length vectors have no direction; their similarity is reported as 0.0 rather
        than NaN.

        Args:
            query_vector (np.ndarray): The vector of the query prompt, either 1-D with shape
                ``(dim,)`` or a single row with shape ``(1, dim)``.
            corpus_vectors (np.ndarray): A 2-D array of corpus vectors. Each row corresponds
                to the vector representation of a corpus prompt.

        Returns:
            np.ndarray: A 1-D array holding the cosine similarity between the query vector
                and each of the corpus vectors, in corpus row order.
        """

        query = np.asarray(query_vector)
        corpus = np.asarray(corpus_vectors)

        query_length = np.linalg.norm(query)
        corpus_lengths = np.linalg.norm(corpus, axis=1, keepdims=True)

        # Dividing a zero vector by 1 leaves it zero, so its dot product (and thus the
        # reported similarity) is 0 instead of the NaN a 0/0 division would produce.
        if query_length == 0:
            query_length = 1.0
        corpus_lengths[corpus_lengths == 0] = 1.0

        query_unit = query / query_length
        corpus_units = corpus / corpus_lengths

        # flatten() collapses the (n, 1) result obtained for a (1, dim) query to shape (n,).
        return np.dot(corpus_units, query_unit.T).flatten()