import os
import sys
from pathlib import Path
from operator import itemgetter
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
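
# Environment variables read by this app (illustrative .env sketch; the values
# shown are the in-code defaults used below):
#   OPENAI_API_KEY=sk-...                               # required by ChatOpenAI
#   VECTOR_STORAGE_PATH=./db/vectorstore_v3             # local Qdrant storage path
#   QDRANT_COLLECTION=thedataguy_documents              # existing collection name
#   EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l  # HuggingFace embedding model
#   LLM_MODEL=gpt-4o-mini                               # chat model name
#   TEMPERATURE=0                                       # sampling temperature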

import chainlit as cl
from langchain.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore

# Import utility functions from the notebook
def import_notebook_functions(notebook_path):
    """Import functions from a Jupyter notebook"""
    import nbformat
    from importlib.util import spec_from_loader, module_from_spec
    from IPython.core.interactiveshell import InteractiveShell
    
    # Create a module
    module_name = Path(notebook_path).stem
    spec = spec_from_loader(module_name, loader=None)
    module = module_from_spec(spec)
    sys.modules[module_name] = module
    
    # Read the notebook
    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)
    
    # Execute code cells
    shell = InteractiveShell.instance()
    for cell in nb.cells:
        if cell.cell_type == 'code':
            # Skip example code
            if 'if __name__ == "__main__":' in cell.source:
                continue
                
            code = shell.input_transformer_manager.transform_cell(cell.source)
            exec(code, module.__dict__)
    
    return module

# Try to import utility functions if available
try:
    utils = import_notebook_functions('utils_data_loading.ipynb')
    
    # Load vector store using the utility function
    vector_store = utils.load_vector_store(
        storage_path=os.environ.get("VECTOR_STORAGE_PATH", "./db/vectorstore_v3"),
        collection_name=os.environ.get("QDRANT_COLLECTION", "thedataguy_documents"),
        embedding_model=os.environ.get("EMBEDDING_MODEL", "Snowflake/snowflake-arctic-embed-l")
    )
    
    print("Successfully loaded vector store using utility functions")
    
except Exception as e:
    print(f"Could not load utility functions: {e}")
    print("Falling back to direct initialization")
    
    # Get vector storage path from .env file with fallback
    storage_path = Path(os.environ.get("VECTOR_STORAGE_PATH", "./db/vectorstore_v3"))
    
    # Load embedding model from environment variable with fallback
    embedding_model = os.environ.get("EMBEDDING_MODEL", "Snowflake/snowflake-arctic-embed-l")
    huggingface_embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    
    # Set up Qdrant vectorstore from existing collection
    collection_name = os.environ.get("QDRANT_COLLECTION", "thedataguy_documents")
    
    vector_store = QdrantVectorStore.from_existing_collection(
        path=storage_path,
        collection_name=collection_name,
        embedding=huggingface_embeddings,
    )
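    # Note: path= uses Qdrant's embedded local mode, which holds a file lock;
    # only one process can open the storage directory at a time.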


# Create a retriever
retriever = vector_store.as_retriever()
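# Note: as_retriever() defaults to similarity search returning k=4 documents;
# pass search_kwargs={"k": ...} to tune how much context reaches the prompt.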

# Model configuration from environment variables. ChatOpenAI itself is
# constructed in build_chain() below, after the API key is guaranteed to exist,
# because it raises at construction time if OPENAI_API_KEY is missing.
llm_model = os.environ.get("LLM_MODEL", "gpt-4o-mini")
temperature = float(os.environ.get("TEMPERATURE", "0"))

# Create RAG prompt template
rag_prompt_template = """\
You are a helpful assistant that answers questions based on the provided context.
Generate a concise answer to the question in markdown format and include a list of relevant links from the context.
Use those links to help the user navigate to more information.
You have access to the following information:

Context:
{context}

Question:
{question}

If the context is unrelated to the question, say "I don't know".
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

# Build the RAG chain: retrieve context for the question, then answer with the
# prompt + LLM while passing the retrieved documents through for source display.
def build_chain():
    llm = ChatOpenAI(model=llm_model, temperature=temperature)
    return (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | {"response": rag_prompt | llm, "context": itemgetter("context")}
    )
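
# Example usage (sketch): the chain expects {"question": ...} and returns
# {"response": <AIMessage>, "context": <list of retrieved Documents>}:
#   chain = build_chain()
#   result = chain.invoke({"question": "What does TheDataGuy write about?"})
#   print(result["response"].content)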


@cl.on_chat_start
async def setup_chain():
    # Check if the API key is already set
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # In a real app, you'd want to handle this more gracefully
        res = await cl.AskUserMessage(
            content="Please enter your OpenAI API Key:",
            timeout=60,
            raise_on_timeout=True,
        ).send()
        # AskUserMessage.send() returns a dict; recent Chainlit versions expose
        # the reply under the "output" key (older releases used "content").
        os.environ["OPENAI_API_KEY"] = res["output"]

    # Send a welcome message
    msg = cl.Message(content="Let's talk about [TheDataGuy](https://thedataguy.pro)'s blog posts, how can I help you?", author="System")
    await msg.send()
    
    # Build the chain now that the API key is available and store it per session
    cl.user_session.set("chain", build_chain())


@cl.on_message
async def on_message(message: cl.Message):
    # Get the chain from the user session
    chain = cl.user_session.get("chain")

    # Call the chain with the user message (async, so the event loop isn't blocked)
    response = await chain.ainvoke({"question": message.content})
    
    # Collect source links from the retrieved documents' metadata, deduplicated by URL
    sources = []
    seen_urls = set()
    for doc in response["context"]:
        url = doc.metadata.get("url")
        if url and url not in seen_urls:
            seen_urls.add(url)
            # Prefer the post_title metadata; otherwise derive a title from the URL slug
            title = doc.metadata.get("post_title", "")
            if not title:
                title = url.split("/")[-2].replace("-", " ").title()
            sources.append(f"- [{title}]({url})")

    # Append the sources as a markdown list, since the answer is already markdown
    content = response["response"].content
    if sources:
        content += "\n\n**Sources:**\n" + "\n".join(sources)

    # Send the response with its sources
    await cl.Message(content=content).send()
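
# To run locally (assuming this file is saved as app.py):
#   chainlit run app.py -w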