"""Vectorizer: embed textual prompts and compare them by cosine similarity."""
from typing import Sequence
import numpy as np
from sentence_transformers import SentenceTransformer
class Vectorizer:
    """
    The Vectorizer's role is to transform textual prompts into numerical vectors that can be
    compared in a high-dimensional space. This transformation allows the system to quantify the
    similarity between different prompts effectively.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        """
        Initialize the vectorizer with a pre-trained embedding model.

        Args:
            model_name (str): The pre-trained embedding model to use for transforming prompts.
                This can be any model that provides a method to convert texts into numerical vectors.
        """
        self.model = SentenceTransformer(model_name)

    def transform(self, prompts: Sequence[str]) -> np.ndarray:
        """
        Transform texts into numerical vectors using the specified model.

        Args:
            prompts (Sequence[str]): The sequence of raw corpus prompts to be transformed.

        Returns:
            np.ndarray: A numpy array containing the vectorized prompts. Each row corresponds
                to the vector representation of a prompt.
        """
        # encode() expects a list; prompts may be any sequence (tuple, generator output, ...).
        return self.model.encode(list(prompts))

    @staticmethod
    def cosine_similarity(
        query_vector: np.ndarray, corpus_vectors: np.ndarray
    ) -> np.ndarray:
        """
        Calculate cosine similarity between a query vector and a set of corpus vectors.

        Args:
            query_vector (np.ndarray): A numpy array representing the vector of the query prompt.
            corpus_vectors (np.ndarray): A numpy array representing the vectors of the corpus
                prompts. Each row corresponds to the vector representation of a corpus prompt.

        Returns:
            np.ndarray: A numpy array containing the cosine similarity scores between the query
                vector and each of the corpus vectors. An all-zero query or corpus vector yields
                a similarity of 0.0 instead of NaN.
        """
        # Guard against division by zero: a zero-norm vector would otherwise
        # produce NaN similarities. Substituting norm 1 leaves the zero vector
        # unchanged, so its dot product (and similarity) is exactly 0.0.
        query_norm = np.linalg.norm(query_vector)
        normalized_query = query_vector / (query_norm if query_norm else 1.0)

        corpus_norms = np.linalg.norm(corpus_vectors, axis=1, keepdims=True)
        normalized_corpus = corpus_vectors / np.where(corpus_norms == 0.0, 1.0, corpus_norms)

        return np.dot(normalized_corpus, normalized_query.T).flatten()