In [1]:
import os

# Paste a key here only for throwaway local runs and never commit it.
# When left empty, nothing is written to the environment, so an already
# exported OPENAI_API_KEY is kept and a later `load_dotenv()` can still supply
# the key from `.env` (python-dotenv does not override variables that already
# exist in the environment, even when they are empty).
OPENAI_API_KEY = ""
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [2]:
import os
from dotenv import load_dotenv
from literalai import LiteralClient

from trulens_eval import Select
from trulens_eval.tru_virtual import VirtualApp

# Describe the app under evaluation. The description is a free-form nested
# mapping that the VirtualApp wrapper is seeded with.
virtual_app = VirtualApp(
    {
        "llm": {"modelname": "AI Tutor - GPT3 - FAISS"},
        "template": "information about the template I used in my app",
        "debug": "all of these fields are completely optional",
    }
)
# Further attributes can be attached afterwards by addressing them with a
# selector path.
virtual_app[Select.RecordCalls.llm.maxtokens] = 1024

# Selectors for the two virtual components whose calls are recorded later.
retriever = Select.RecordCalls.retriever
synthesizer = Select.RecordCalls.synthesizer

virtual_app[retriever] = "retriever"
virtual_app[synthesizer] = "synthesizer"


# Load variables from a local `.env` file before creating the client
# (presumably where the Literal AI credentials live; confirm).
load_dotenv()

literal_client = LiteralClient()

# `first=1` requests a single thread from the Literal AI API.
threads = literal_client.api.get_threads(first=1).data
print(threads)

# Keep only the steps named "RAG", across all fetched threads.
rag_steps = [
    step for thread in threads for step in thread.steps if step.name == "RAG"
]

# Column-oriented log: one list per field, aligned by position.
save_log_dict = {
    "step_id": [step.id for step in rag_steps],
    "user_input": [step.input["question"] for step in rag_steps],
    "output": [step.output["answer"] for step in rag_steps],
    "context": [step.output["source_documents"] for step in rag_steps],
}

print(save_log_dict)

from trulens_eval.tru_virtual import VirtualRecord

# Selectors for the virtual calls recorded on each component. The method names
# (`get_context`, `generate`) are arbitrary labels chosen for readability.
# They are bound once, outside the loop, so they exist even when no RAG step
# was found.
context_call = retriever.get_context
generation = synthesizer.generate

# Build one VirtualRecord per logged RAG step.
records = []
for step_id, user_input, output, context in zip(
    save_log_dict["step_id"],
    save_log_dict["user_input"],
    save_log_dict["output"],
    save_log_dict["context"],
):
    print(f"Step ID: {step_id}")
    print(f"User Input: {user_input}")
    print(f"Output: {output}")
    print(f"Context: {context}")

    rec = VirtualRecord(
        main_input=f"{user_input}",
        main_output=f"{output}",
        calls={
            # The retriever takes the question and returns the retrieved
            # context. The feedback functions read `context_call.rets` as the
            # context, so it must hold the source documents, not the answer.
            # NOTE(review): the context is stringified as a single chunk; if
            # `source_documents` is a list, consider one entry per document.
            context_call: dict(args=[f"{user_input}"], rets=[f"{context}"]),
            # The synthesizer takes the context and returns the answer.
            generation: dict(
                args=[f"{context}"],
                rets=[f"{output}"],
            ),
        },
    )

    records.append(rec)


[{
    "createdAt": "2024-06-19T09:44:35.578Z",
    "id": "7a2c91be-4f47-4f60-8ded-537dd31386df",
    "metadata": {},
    "name": "TEST",
    "participant": {},
    "steps": [
        {
            "attachments": [],
            "endTime": "2024-06-19T09:44:48.221",
            "error": null,
            "generation": null,
            "id": "1ab30fe6-99cb-4327-bd7d-e3128b2c1a62",
            "input": {
                "question": "What is inductive bias in CNNs?"
            },
            "metadata": {},
            "name": "RAG",
            "output": {
                "answer": "Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.",


In [3]:
from trulens_eval.feedback.provider import OpenAI
from trulens_eval.feedback.feedback import Feedback

# Initialize provider class
provider = OpenAI()

# Select context to be used in feedback. We select the return values of the
# virtual `get_context` call in the virtual `retriever` component. Names are
# arbitrary except for `rets`. The selector is built from the top-level
# `retriever` selector so this cell does not rely on a name that is only bound
# inside the record-building loop.
context = retriever.get_context.rets[:]

# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
)

# Groundedness of the answer with respect to all context chunks collected
# together.
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(context.collect())
    .on_output()
)

# Question/answer relevance between overall question and answer.
f_qa_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

✅ In context_relevance_with_cot_reasons, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance_with_cot_reasons, input context will be set to __record__.app.retriever.get_context.rets[:] .
✅ In Groundedness, input source will be set to __record__.app.retriever.get_context.rets[:].collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [4]:
from trulens_eval.tru_virtual import TruVirtual

# Feedback functions to attach to every record ingested by the recorder.
evaluation_feedbacks = [f_context_relevance, f_groundedness, f_qa_relevance]

# Recorder for the virtual app. "deferred" mode presumably leaves feedback
# evaluation to a separately started evaluator rather than running it on
# ingestion (confirm against the TruLens docs).
virtual_recorder = TruVirtual(
    app_id="AI Tutor - GPT3 - FAISS",
    app=virtual_app,
    feedbacks=evaluation_feedbacks,
    feedback_mode="deferred",  # optional
)

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [5]:
# Hand every virtual record built from the Literal AI logs to the recorder.
# NOTE(review): re-running this cell presumably ingests the same records again.
for virtual_record in records:
    virtual_recorder.add_record(virtual_record)

In [6]:
from trulens_eval import Tru
# Workspace handle used for the dashboard and the evaluator.
tru = Tru()

# `force=True` stops a dashboard that is already running before starting a new
# one. The returned process handle is left as the last expression so the
# notebook displays it.
dashboard_process = tru.run_dashboard(force=True)
dashboard_process

Force stopping dashboard ...
Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://10.0.0.226:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [7]:
# Start the background evaluator that runs the pending feedback functions.
# The returned thread is left as the last expression so the notebook displays
# it.
evaluator_thread = tru.start_evaluator()
evaluator_thread

# tru.stop_evaluator() # stop if needed

Will keep max of 32 feedback(s) running.
Tasks are spread among max of 128 thread(s).
Will rerun running feedbacks after a minute.
Will rerun failed feedbacks after 5 minutes.


<Thread(Thread-8 (runloop), started daemon 13681946624)>

Feedback Status:  80%|########  | 84/105 [00:00<?, ?feedbacks/s, DONE=84, NONE=21]

Done Runs: 0runs [00:00, ?runs/s]

Waiting for Runs: 0runs [00:00, ?runs/s]