Spaces:

jerpint
/

buster-dev

Runtime error

App Files Files Community

marondeau committed on Feb 24, 2023

Commit

60b4723

unverified ·

2 Parent(s): 17cfdef 5a55b5b

Merge pull request #51 from marondeau/formatter

Browse files

Files changed (6) hide show

buster/chatbot.py +40 -61
buster/formatter/__init__.py +6 -0
buster/formatter/base.py +60 -0
buster/formatter/html.py +41 -0
buster/formatter/markdown.py +28 -0
buster/formatter/slack.py +28 -0

buster/chatbot.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
 import os
 from dataclasses import dataclass, field
 import numpy as np
 import openai
@@ -9,6 +10,11 @@ import promptlayer
 from openai.embeddings_utils import cosine_similarity, get_embedding
 from buster.docparser import read_documents
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -149,53 +155,49 @@ class Chatbot:
         documents_str: str = self.prepare_documents(matched_documents, max_words=self.cfg.max_words)
         return text_before_documents + documents_str + text_before_prompt + question
-    def get_gpt_response(self, **completion_kwargs):
         # Call the API to generate a response
         logger.info(f"querying GPT...")
         try:
-            return openai.Completion.create(**completion_kwargs)
         except Exception as e:
             # log the error and return a generic response instead.
             logger.exception("Error connecting to OpenAI API. See traceback:")
-            response = {"choices": [{"text": "We're having trouble connecting to OpenAI right now... Try again soon!"}]}
-            return response
-    def generate_response(self, prompt: str, matched_documents: pd.DataFrame, unknown_prompt: str) -> str:
         """
         Generate a response based on the retrieved documents.
         """
         if len(matched_documents) == 0:
             # No matching documents were retrieved, return
-            return unknown_prompt
         logger.info(f"Prompt:  {prompt}")
         response = self.get_gpt_response(prompt=prompt, **self.cfg.completion_kwargs)
-        response_str = response["choices"][0]["text"]
-        logger.info(f"GPT Response:\n{response_str}")
-        return response_str
-    def add_sources(self, response: str, matched_documents: pd.DataFrame, sep: str, format: str):
-        """
-        Add sources fromt the matched documents to the response.
-        """
-        urls = matched_documents.url.to_list()
-        titles = matched_documents.title.to_list()
-        similarities = matched_documents.similarity.to_list()
-        response += f"{sep}{sep}📝 Here are the sources I used to answer your question:{sep}{sep}"
-        for url, title, similarity in zip(urls, titles, similarities):
-            if format == "markdown":
-                response += f"[🔗 {title}]({url}), relevance: {similarity:2.3f}{sep}"
-            elif format == "html":
-                response += f"<a href='{url}'>🔗 {title}</a>{sep}"
-            elif format == "slack":
-                response += f"<{url}|🔗 {title}>, relevance: {similarity:2.3f}{sep}"
             else:
-                raise ValueError(f"{format} is not a valid URL format.")
-        return response
     def check_response_relevance(
         self, response: str, engine: str, unk_embedding: np.array, unk_threshold: float
@@ -217,36 +219,16 @@ class Chatbot:
         # Likely that the answer is meaningful, add the top sources
         return score < unk_threshold
-    def format_response(self, response: str, matched_documents: pd.DataFrame, text_after_response: str) -> str:
-        """
-        Format the response by adding the sources if necessary, and a disclaimer prompt.
-        """
-        sep = self.cfg.separator
-        is_relevant = self.check_response_relevance(
-            response=response,
-            engine=self.cfg.embedding_model,
-            unk_embedding=self.unk_embedding,
-            unk_threshold=self.cfg.unknown_threshold,
-        )
-        if is_relevant:
-            # Passes our relevance detection mechanism that the answer is meaningful, add the top sources
-            response = self.add_sources(
-                response=response,
-                matched_documents=matched_documents,
-                sep=self.cfg.separator,
-                format=self.cfg.link_format,
-            )
-        response += f"{sep}{sep}{sep}{text_after_response}{sep}"
-        return response
-    def process_input(self, question: str) -> str:
         """
         Main function to process the input question and generate a formatted output.
         """
         logger.info(f"User Question:\n{question}")
         matched_documents = self.rank_documents(
@@ -262,9 +244,6 @@ class Chatbot:
             text_before_prompt=self.cfg.text_before_prompt,
             text_before_documents=self.cfg.text_before_documents,
         )
-        response = self.generate_response(prompt, matched_documents, self.cfg.unknown_prompt)
-        formatted_output = self.format_response(
-            response, matched_documents, text_after_response=self.cfg.text_after_response
-        )
-        return formatted_output

 import logging
 import os
 from dataclasses import dataclass, field
+from typing import Iterable
 import numpy as np
 import openai
 from openai.embeddings_utils import cosine_similarity, get_embedding
 from buster.docparser import read_documents
+from buster.formatter import Formatter, HTMLFormatter, MarkdownFormatter, SlackFormatter
+from buster.formatter.base import Response, Source
+FORMATTERS = {"text": Formatter, "slack": SlackFormatter, "html": HTMLFormatter, "markdown": MarkdownFormatter}
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
         documents_str: str = self.prepare_documents(matched_documents, max_words=self.cfg.max_words)
         return text_before_documents + documents_str + text_before_prompt + question
+    def get_gpt_response(self, **completion_kwargs) -> Response:
+        """
+        Request a completion from OpenAI and wrap the outcome in a `Response`.
+        `completion_kwargs` are forwarded unchanged to `openai.Completion.create`. If the call
+        raises, the traceback is logged and an error `Response` carrying a user-facing message
+        is returned; otherwise the text of the first choice is returned.
+        """
         # Call the API to generate a response
         logger.info(f"querying GPT...")
         try:
+            response = openai.Completion.create(**completion_kwargs)
         except Exception as e:
             # log the error and return a generic response instead.
             logger.exception("Error connecting to OpenAI API. See traceback:")
+            return Response("", True, "We're having trouble connecting to OpenAI right now... Try again soon!")
+        text = response["choices"][0]["text"]
+        return Response(text)
+    def generate_response(
+        self, prompt: str, matched_documents: pd.DataFrame, unknown_prompt: str
+    ) -> tuple[Response, Iterable[Source]]:
         """
         Generate a response based on the retrieved documents.
         """
         if len(matched_documents) == 0:
             # No matching documents were retrieved, return
+            sources = tuple()
+            return Response(unknown_prompt), sources
         logger.info(f"Prompt:  {prompt}")
         response = self.get_gpt_response(prompt=prompt, **self.cfg.completion_kwargs)
+        if response:
+            logger.info(f"GPT Response:\n{response.text}")
+            relevant = self.check_response_relevance(
+                response=response.text,
+                engine=self.cfg.embedding_model,
+                unk_embedding=self.unk_embedding,
+                unk_threshold=self.cfg.unknown_threshold,
+            )
+            if relevant:
+                sources = (
+                    Source(dct["name"], dct["url"], dct["similarity"])
+                    for dct in matched_documents.to_dict(orient="records")
+                )
             else:
+                sources = tuple()
+        return response, sources
     def check_response_relevance(
         self, response: str, engine: str, unk_embedding: np.array, unk_threshold: float
         # Likely that the answer is meaningful, add the top sources
         return score < unk_threshold
+    def process_input(self, question: str, formatter: Formatter = None) -> str:
         """
         Main function to process the input question and generate a formatted output.
         """
+        if formatter is None and self.cfg.link_format not in FORMATTERS:
+            raise ValueError(f"Unknown link format {self.cfg.link_format}")
+        elif formatter is None:
+            formatter = FORMATTERS[self.cfg.link_format]()
         logger.info(f"User Question:\n{question}")
         matched_documents = self.rank_documents(
             text_before_prompt=self.cfg.text_before_prompt,
             text_before_documents=self.cfg.text_before_documents,
         )
+        response, sources = self.generate_response(prompt, matched_documents, self.cfg.unknown_prompt)
+        return formatter(response, sources)

buster/formatter/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .base import Formatter
+from .html import HTMLFormatter
+from .markdown import MarkdownFormatter
+from .slack import SlackFormatter
+__all__ = [Formatter, HTMLFormatter, MarkdownFormatter, SlackFormatter]

buster/formatter/base.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from dataclasses import dataclass
+from typing import Iterable, NamedTuple
+# Should be from the `documents` module.
+class Source(NamedTuple):
+    """A document cited alongside an answer; formatter templates read these fields."""
+    name: str  # Label shown for the source.
+    url: str  # Link target used by the HTML, Markdown and Slack templates.
+    question_similarity: float  # Shown as "relevance" by the templates that print it.
+    # TODO Add answer similarity.
+    # answer_similarity: float
+# Should be from the `nlp` module.
+@dataclass(slots=True)
+class Response:
+    """Answer text, or an error, handed to a `Formatter`."""
+    text: str  # Answer text interpolated into the answer templates.
+    error: bool = False  # When true, the formatter renders an error instead of the answer.
+    error_msg: str | None = None  # Optional detail; a generic fallback is rendered when empty.
+@dataclass
+class Formatter:
+    source_template: str = "{source.name} (relevance: {source.question_similarity:2.3f})"
+    error_msg_template: str = "Something went wrong: {response.error_msg}"
+    error_fallback_template: str = "Something went very wrong."
+    sourced_answer_template: str = "{response.text}\n\nSources:\n{sources}\n\nBut what do I know, I'm a chatbot."
+    unsourced_answer_template: str = "{response.text}\n\nBut what do I know, I'm a chatbot."
+    def source_item(self, source: Source) -> str:
+        """Format a single source item."""
+        return self.source_template.format(source=source)
+    def sources_list(self, sources: Iterable[Source]) -> str | None:
+        """Format sources into a list."""
+        items = [self.source_item(source) for source in sources]
+        if not items:
+            return None  # No list needed.
+        return "\n".join(f"{ind}. {item}" for ind, item in enumerate(items, 1))
+    def error(self, response: Response) -> str:
+        """Format an error message."""
+        if response.error_msg:
+            return self.error_msg_template.format(response=response)
+        return self.error_fallback_template.format(response=response)
+    def answer(self, response: Response, sources: Iterable[Source]) -> str:
+        """Format an answer and its sources."""
+        sources_list = self.sources_list(sources)
+        if not sources_list:
+            return self.sourced_answer_template.format(response=response, sources=sources_list)
+        return self.unsourced_answer_template.format(response=response)
+    def __call__(self, response: Response, sources: Iterable[Source]) -> str:
+        """Format an answer and its sources, or an error message."""
+        if response.error:
+            return self.error(response)
+        return self.answer(response, sources)

buster/formatter/html.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import html
+from dataclasses import dataclass
+from typing import Iterable
+from buster.formatter.base import Formatter, Response, Source
+@dataclass
+class HTMLFormatter(Formatter):
+    """Format the answer in HTML."""
+    source_template: str = """<li><a href='{source.url}'>🔗 {source.name}</a></li>"""
+    error_msg_template: str = """<div class="error">Something went wrong:\n<p>{response.error_msg}</p></div>"""
+    error_fallback_template: str = """<div class="error">Something went very wrong.</div>"""
+    sourced_answer_template: str = (
+        """<div class="answer"><p>{response.text}</p></div>\n"""
+        """<div class="sources>📝 Here are the sources I used to answer your question:\n"""
+        """<ol>\n{sources}</ol></div>\n"""
+        """<div class="footer">I'm a chatbot, bleep bloop.</div>"""
+    )
+    unsourced_answer_template: str = (
+        """<div class="answer">{response.text}</div>\n<div class="footer">I'm a chatbot, bleep bloop.</div>"""
+    )
+    def sources_list(self, sources: Iterable[Source]) -> str | None:
+        """Format sources into a list."""
+        items = [self.source_item(source) for source in sources]
+        if not items:
+            return None  # No list needed.
+        return "\n".join(items)
+    def __call__(self, response: Response, sources: Iterable[Source]) -> str:
+        # Escape any html in the text.
+        response = Response(
+            html.escape(response.text) if response.text else response.text,
+            response.error,
+            html.escape(response.error_msg) if response.error_msg else response.error_msg,
+        )
+        sources = (Source(html.escape(source.name), source.url, source.question_similarity) for source in sources)
+        return super().__call__(response, sources)

buster/formatter/markdown.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from dataclasses import dataclass
+from typing import Iterable
+from buster.formatter.base import Formatter, Source
+@dataclass
+class MarkdownFormatter(Formatter):
+    """Format the answer in markdown."""
+    source_template: str = """[🔗 {source.name}]({source.url}), relevance: {source.question_similarity:2.3f}"""
+    error_msg_template: str = """Something went wrong:\n{response.error_msg}"""
+    error_fallback_template: str = """Something went very wrong."""
+    sourced_answer_template: str = (
+        """{response.text}\n\n"""
+        """📝 Here are the sources I used to answer your question:\n"""
+        """{sources}\n\n"""
+        """I'm a chatbot, bleep bloop."""
+    )
+    unsourced_answer_template: str = """{response.text}\n\nI'm a chatbot, bleep bloop."""
+    def sources_list(self, sources: Iterable[Source]) -> str | None:
+        """Format sources into a list."""
+        items = [self.source_item(source) for source in sources]
+        if not items:
+            return None  # No list needed.
+        return "\n".join(f"{ind}. {item}" for ind, item in enumerate(items, 1))

buster/formatter/slack.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from dataclasses import dataclass
+from typing import Iterable
+from buster.formatter.base import Formatter, Source
+@dataclass
+class SlackFormatter(Formatter):
+    """Format the answer for Slack."""
+    source_template: str = """<{source.url}|🔗 {source.name}>, relevance: {source.question_similarity:2.3f}"""
+    error_msg_template: str = """Something went wrong:\n{response.error_msg}"""
+    error_fallback_template: str = """Something went very wrong."""
+    sourced_answer_template: str = (
+        """{response.text}\n\n"""
+        """📝 Here are the sources I used to answer your question:\n"""
+        """{sources}\n\n"""
+        """I'm a chatbot, bleep bloop."""
+    )
+    unsourced_answer_template: str = """{response.text}\n\nI'm a chatbot, bleep bloop."""
+    def sources_list(self, sources: Iterable[Source]) -> str | None:
+        """Format sources into a list."""
+        items = [self.source_item(source) for source in sources]
+        if not items:
+            return None  # No list needed.
+        return "\n".join(items)