import logging
import traceback
from dataclasses import dataclass, field

import numpy as np
import openai
import pandas as pd
from omegaconf import OmegaConf
from openai.embeddings_utils import cosine_similarity, get_embedding

from buster.docparser import EMBEDDING_MODEL

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def load_documents(path: str) -> pd.DataFrame:
    logger.info(f"loading embeddings from {path}...")
    df = pd.read_csv(path)
    # embeddings are stored as stringified lists in the csv; parse them back into numpy arrays
    df["embedding"] = df.embedding.apply(eval).apply(np.array)
    logger.info("embeddings loaded.")
    return df
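
# Illustration (hypothetical values, not from the source): the csv is expected
# to provide at least "text", "url", "name" and "embedding" columns, where
# "embedding" holds a stringified list of floats per row, e.g.
#
#   text,url,name,embedding
#   "Install buster with pip",https://example.com/install,Install,"[0.12, -0.04, 0.33]"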

class Chatbot:
    def __init__(self, cfg: OmegaConf):
        # TODO: right now, the cfg is being passed as an omegaconf, is this what we want?
        self.cfg = cfg
        self._init_documents()
        self._init_unk_embedding()

    def _init_documents(self):
        self.documents = load_documents(self.cfg.documents_csv)

    def _init_unk_embedding(self):
        logger.info("Generating UNK token...")
        unknown_prompt = self.cfg.unknown_prompt
        engine = self.cfg.embedding_model
        self.unk_embedding = get_embedding(
            unknown_prompt,
            engine=engine,
        )
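
    # note: self.unk_embedding is compared in format_response() against the
    # embedding of each GPT response; answers that score too close to this
    # "I don't know" embedding are returned without sources.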

    def rank_documents(
        self,
        documents: pd.DataFrame,
        query: str,
    ) -> pd.DataFrame:
        """
        Compare the query to the series of documents and return the best matching documents.
        """
        top_k = self.cfg.top_k
        thresh = self.cfg.thresh
        engine = self.cfg.embedding_model

        query_embedding = get_embedding(
            query,
            engine=engine,
        )
        documents["similarity"] = documents.embedding.apply(lambda x: cosine_similarity(x, query_embedding))

        # sort the matched documents by similarity score
        matched_documents = documents.sort_values("similarity", ascending=False)

        # limit search to top_k matched documents; top_k == -1 keeps all of them
        top_k = len(matched_documents) if top_k == -1 else top_k
        matched_documents = matched_documents.head(top_k)

        # log matched documents to the console
        logger.info(f"matched documents before thresh: {matched_documents}")

        # filter out matched documents below the similarity threshold
        if thresh:
            matched_documents = matched_documents[matched_documents.similarity > thresh]
            logger.info(f"matched documents after thresh: {matched_documents}")

        return matched_documents
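
    # e.g. with the ChatbotConfig defaults (top_k == 3, thresh == 0.7), at most
    # the 3 most similar documents are kept, and only those whose cosine
    # similarity to the query exceeds 0.7.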

    def prepare_prompt(self, question: str, candidates: pd.DataFrame) -> str:
        """
        Prepare the prompt with prompt engineering.
        """
        max_chars = self.cfg.max_chars
        text_before_prompt = self.cfg.text_before_prompt

        documents_list = candidates.text.to_list()
        documents_str = " ".join(documents_list)
        if len(documents_str) > max_chars:
            logger.info("truncating documents to fit...")
            documents_str = documents_str[0:max_chars]

        return documents_str + text_before_prompt + question
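
    # the assembled prompt is therefore laid out as:
    #   <retrieved documents, truncated to max_chars><text_before_prompt><user question>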

    def generate_response(self, prompt: str, matched_documents: pd.DataFrame) -> str:
        """
        Generate a response based on the retrieved documents.
        """
        if len(matched_documents) == 0:
            # no matching documents were retrieved, return early
            response_text = "I did not find any relevant documentation related to your question."
            return response_text

        logger.info("querying GPT...")
        # call the API to generate a response
        try:
            # copy the kwargs so we don't mutate the shared config
            completion_kwargs = dict(self.cfg.completion_kwargs)
            completion_kwargs["prompt"] = prompt
            response = openai.Completion.create(**completion_kwargs)

            # get the response text
            response_text = response["choices"][0]["text"]
            logger.info(f"GPT Response:\n{response_text}")
            return response_text
        except Exception:
            # log the error and return a generic response instead
            logger.error(traceback.format_exc())
            response_text = "Oops, something went wrong. Try again later!"
            return response_text

    def add_sources(self, response: str, matched_documents: pd.DataFrame):
        """
        Add sources from the matched documents to the response.
        """
        sep = self.cfg.separator  # e.g. "\n"
        link_format = self.cfg.link_format

        urls = matched_documents.url.to_list()
        names = matched_documents.name.to_list()
        similarities = matched_documents.similarity.to_list()

        response += f"{sep}{sep}Here are the sources I used to answer your question:\n"
        for url, name, similarity in zip(urls, names, similarities):
            if link_format == "markdown":
                response += f"{sep}[{name}]({url}){sep}"
            elif link_format == "slack":
                response += f"• <{url}|{name}>, score: {similarity:2.3f}{sep}"
            else:
                raise ValueError(f"{link_format} is not a valid link format.")

        return response
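
    # with link_format == "slack", each source line renders as, e.g.:
    #   • <https://example.com/install|Install>, score: 0.812
    # (url, name and score here are made up for illustration)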

    def format_response(self, response: str, matched_documents: pd.DataFrame) -> str:
        """
        Format the response by adding the sources if necessary, and a disclaimer.
        """
        sep = self.cfg.separator
        text_after_response = self.cfg.text_after_response

        if len(matched_documents) > 0:
            # we have matched documents, now check whether the answer is meaningful
            response_embedding = get_embedding(
                response,
                engine=EMBEDDING_MODEL,
            )
            score = cosine_similarity(response_embedding, self.unk_embedding)
            logger.info(f"UNK score: {score}")
            if score < 0.9:
                # likely that the answer is meaningful, add the top sources
                # (0.9 is a hardcoded heuristic cutoff for "unknown"-like answers)
                response = self.add_sources(response, matched_documents=matched_documents)

        response += f"{sep}{sep}{sep}{text_after_response}{sep}"

        return response

    def process_input(self, question: str) -> str:
        """
        Main function to process the input question and generate a formatted output.
        """
        logger.info(f"User Question:\n{question}")

        matched_documents = self.rank_documents(documents=self.documents, query=question)
        prompt = self.prepare_prompt(question, matched_documents)
        response = self.generate_response(prompt, matched_documents)
        formatted_output = self.format_response(response, matched_documents)

        return formatted_output

@dataclass
class ChatbotConfig:
    """Configuration object for a chatbot.

    documents_csv: path to the csv file containing the documents and their embeddings.
    embedding_model: OpenAI model to use to get embeddings.
    top_k: max number of documents to retrieve, ordered by cosine similarity.
    thresh: cosine similarity threshold for a document to be considered a match.
    max_chars: maximum number of characters the retrieved documents can be. Will truncate otherwise.
    completion_kwargs: kwargs for the openai.Completion.create() method.
    separator: the separator to use, can be either "\\n" or <p> depending on rendering.
    link_format: the type of format to render links with, e.g. slack or markdown.
    unknown_prompt: prompt used to generate the "I don't know" embedding to compare to.
    text_before_prompt: text to prompt GPT with before the user question, but after the documentation.
    text_after_response: generic disclaimer to append to the chatbot's reply.
    """

    documents_csv: str = "buster/data/document_embeddings.csv"
    embedding_model: str = "text-embedding-ada-002"
    top_k: int = 3
    thresh: float = 0.7
    max_chars: int = 3000

    completion_kwargs: dict = field(
        default_factory=lambda: {
            "engine": "text-davinci-003",
            "max_tokens": 200,
            "temperature": None,
            "top_p": None,
            "frequency_penalty": 1,
            "presence_penalty": 1,
        }
    )
    separator: str = "\n"
    link_format: str = "slack"
    unknown_prompt: str = "I don't know how to answer your question."
    text_before_prompt: str = "Answer the following question:\n"
    text_after_response: str = "I'm a chatbot, bleep bloop."
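
if __name__ == "__main__":
    # Minimal usage sketch (not part of the original file): assumes the
    # documents csv exists at the configured path and that an OpenAI API key
    # is set (e.g. via the OPENAI_API_KEY environment variable). The question
    # string is hypothetical.
    cfg = OmegaConf.create(vars(ChatbotConfig()))
    chatbot = Chatbot(cfg)
    print(chatbot.process_input("How do I install buster?"))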