Spaces:
Runtime error
Runtime error
from typing import List, Sequence, Tuple | |
import numpy as np | |
from vectorizer import Vectorizer | |
def cosine_similarity( | |
query_vector: np.ndarray, | |
corpus_vectors: np.ndarray | |
)-> np.ndarray: | |
"""Calculate cosine similarity between prompt vectors. | |
Args: | |
query_vector: Vectorized prompt query of shape (1, D). | |
corpus_vectors: Vectorized prompt corpus of shape (N, D). | |
Returns: The vector of shape (N,) with values in range [-1, 1] where 1 | |
is max similarity i.e., two vectors are the same. | |
""" | |
dot_product = np.dot( corpus_vectors, query_vector) | |
magnitude_A = np.linalg.norm(corpus_vectors, axis=1) | |
magnitude_B = np.linalg.norm(query_vector) | |
cosine_sim = dot_product / (magnitude_A * magnitude_B) | |
return np.around(cosine_sim, 4) | |
# return np.format_float_positional(cosine_sim, precision = 4) | |
class PromptSearchEngine: | |
def __init__(self, prompts: Sequence[str], model) -> None: | |
"""Initialize search engine by vectorizing prompt corpus. | |
Vectorized prompt corpus should be used to find the top n most | |
similar prompts w.r.t. user’s input prompt. | |
Args: | |
prompts: The sequence of raw prompts from the dataset. | |
""" | |
self.prompts = prompts | |
self.vectorizer = Vectorizer(model) | |
self.corpus_embeddings = self.vectorizer.transform(prompts) | |
def most_similar( | |
self, | |
query: str, | |
n: int = 5 | |
) -> List[Tuple[float, str]]: | |
"""Return top n most similar prompts from corpus. | |
Input query prompt should be vectorized with chosen Vectorizer. | |
After | |
that, use the cosine_similarity function to get the top n most | |
similar | |
prompts from the corpus. | |
Args: | |
query: The raw query prompt input from the user. | |
n: The number of similar prompts returned from the corpus. | |
Returns: | |
The list of top n most similar prompts from the corpus along | |
with similarity scores. Note that returned prompts are | |
verbatim. | |
""" | |
most_similar_prompts = [] | |
prompt_embedding = self.vectorizer.transform([query]).flatten() | |
corpus_embeddings = self.corpus_embeddings | |
result = cosine_similarity(prompt_embedding, corpus_embeddings) | |
for i in range(len(self.prompts)): | |
most_similar_prompts.append((result[i], self.prompts[i])) | |
prompt_score_sorted = sorted(most_similar_prompts, key=lambda x: x[0], reverse=True) | |
return prompt_score_sorted[0:n] | |
def display_prompts(self, prompts): | |
"""Display the list of prompts with their similarity scores.""" | |
if prompts: | |
for i, (score, prompt) in enumerate(prompts, 1): | |
print(f"{i}. {prompt} (Similarity: {score:.4f})") | |
else: | |
print("No prompts found.") | |
def stringify_prompts(self, prompts): | |
"""Save the list of prompts with their similarity scores.""" | |
strings = [] | |
if prompts: | |
for i, (score, prompt) in enumerate(prompts, 1): | |
strings.append(f"{i}. {prompt} (Similarity: {score:.4f})") | |
return strings | |
else: | |
return [] | |