Spaces:

krstakis
/

prompt_search_engine

Sleeping

App Files Files Community

krstakis committed on Jul 22, 2024

Commit

b652e4e

1 Parent(s): f526df9

fixing everything.....

Browse files

Files changed (13) hide show

Dockerfile +76 -47
api/__pycache__/__init__.cpython-39.pyc +0 -0
api/__pycache__/service_manager.cpython-39.pyc +0 -0
api/__pycache__/web_server.cpython-39.pyc +0 -0
api/service_manager.py +4 -5
api/web_server.py +22 -4
core/initialization.py +12 -6
core/search_engine.py +23 -8
core/vectorizer.py +34 -7
requirements.txt +3 -1
ui/__init__.py +1 -0
ui/requirements.txt +2 -0
ui/streamlit_app.py +27 -0

Dockerfile CHANGED Viewed

@@ -1,63 +1,92 @@
-# First stage: Build and load the dataset
-FROM python:3.9 as core
-# Set the working directory
-WORKDIR /app
-# Copy and install core requirements
 COPY ./core/requirements.txt ./core/requirements.txt
-RUN pip install -r ./core/requirements.txt
-# Copy the core files
-COPY ./core ./core
-# Set the PYTHONPATH to include the /app directory
-ENV PYTHONPATH="/app"
-# Set the HF_HOME to a writable directory
-ENV HF_HOME="/app/cache"
-# Create the cache directory and set correct permissions
-RUN mkdir -p /app/cache && chmod -R 777 /app/cache
-# Run the initialization script to load and serialize the dataset
-RUN python ./core/initialization.py
-# Second stage: Set up the API
 FROM python:3.9
-# Set the working directory
-WORKDIR /app
-# Copy and install core and API requirements
-COPY ./core/requirements.txt ./core/requirements.txt
-COPY ./api/requirements.txt ./api/requirements.txt
-RUN pip install -r ./core/requirements.txt
-RUN pip install -r ./api/requirements.txt
-# Copy the API files
-COPY ./api ./api
-# Copy the core files to the second stage to ensure search_engine is available
 COPY ./core ./core
-# Copy the serialized engine from the first stage to the API directory
-COPY --from=core /app/core/engine.pickle /app/api/engine.pickle
-# Set the PYTHONPATH to include the /app directory
-ENV PYTHONPATH="/app"
-# Set the HF_HOME to a writable directory
-ENV HF_HOME="/app/cache"
-# Create the cache directory and set correct permissions
-RUN mkdir -p /app/cache && chmod -R 777 /app/cache
-# Expose the API port
 EXPOSE 7860
-# Run the service manager
 ENTRYPOINT ["python", "api/service_manager.py"]
 # FROM python:3.9 as core
 #
 # COPY ./core/requirements.txt ./requirements.txt

+# Stage 1 (install): build toolchain plus the Python dependencies. Packages are installed
+# with `pip install --user`, so they land under /root/.local and can be copied to later stages.
+FROM python:3.9 AS install
+# Refresh the package index and install in ONE layer so a cached, stale index is never
+# paired with a newer install step; remove the index afterwards to keep the layer small.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends build-essential gcc \
+    && rm -rf /var/lib/apt/lists/*
+COPY ./api/requirements.txt ./api/requirements.txt
+RUN pip install --user -r ./api/requirements.txt
 COPY ./core/requirements.txt ./core/requirements.txt
+RUN pip install --user -r ./core/requirements.txt
+##################################################################
+# Stage 2 (setup): run the core initialization script. The final stage copies
+# /engine.pickle out of this stage, so the script must leave that file behind.
+# NOTE(review): this commit removes the module-level `run()` call from
+# core/initialization.py and switches it to package-relative imports; confirm that
+# executing it as a plain script here still produces engine.pickle.
+FROM python:3.9 AS setup
+COPY --from=install /root/.local /root/.local
+COPY ./core .
+RUN python ./initialization.py
+##################################################################
+# Stage 3 (runtime): installed packages, application sources and the serialized engine.
 FROM python:3.9
+COPY --from=install /root/.local /root/.local
 COPY ./core ./core
+COPY ./api ./api
+COPY --from=setup /engine.pickle /engine.pickle
+# 7860 is the port passed to uvicorn.run() in api/service_manager.py.
 EXPOSE 7860
+# NOTE(review): api/service_manager.py now imports with `from .web_server import app`;
+# a relative import fails when the file is executed directly as a script. Confirm this
+# entrypoint still starts the server (e.g. consider `python -m api.service_manager`).
 ENTRYPOINT ["python", "api/service_manager.py"]
+# # First stage: Build and load the dataset
+# FROM python:3.9 as core
+#
+# # Set the working directory
+# WORKDIR /app
+#
+# # Copy and install core requirements
+# COPY ./core/requirements.txt ./core/requirements.txt
+# RUN pip install -r ./core/requirements.txt
+#
+# # Copy the core files
+# COPY ./core ./core
+#
+# # Set the PYTHONPATH to include the /app directory
+# ENV PYTHONPATH="/app"
+#
+# # Set the HF_HOME to a writable directory
+# ENV HF_HOME="/app/cache"
+#
+# # Create the cache directory and set correct permissions
+# RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+#
+# # Run the initialization script to load and serialize the dataset
+# RUN python ./core/initialization.py
+#
+# # Second stage: Set up the API
+# FROM python:3.9
+#
+# # Set the working directory
+# WORKDIR /app
+#
+# # Copy and install core and API requirements
+# COPY ./core/requirements.txt ./core/requirements.txt
+# COPY ./api/requirements.txt ./api/requirements.txt
+# RUN pip install -r ./core/requirements.txt
+# RUN pip install -r ./api/requirements.txt
+#
+# # Copy the API files
+# COPY ./api ./api
+#
+# # Copy the core files to the second stage to ensure search_engine is available
+# COPY ./core ./core
+#
+# # Copy the serialized engine from the first stage to the API directory
+# COPY --from=core /app/core/engine.pickle /app/api/engine.pickle
+#
+# # Set the PYTHONPATH to include the /app directory
+# ENV PYTHONPATH="/app"
+#
+# # Set the HF_HOME to a writable directory
+# ENV HF_HOME="/app/cache"
+#
+# # Create the cache directory and set correct permissions
+# RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+#
+# # Expose the API port
+# EXPOSE 7860
+#
+# # Run the service manager
+# ENTRYPOINT ["python", "api/service_manager.py"]
 # FROM python:3.9 as core
 #
 # COPY ./core/requirements.txt ./requirements.txt

api/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (175 Bytes). View file

api/__pycache__/service_manager.cpython-39.pyc ADDED Viewed

Binary file (414 Bytes). View file

api/__pycache__/web_server.cpython-39.pyc ADDED Viewed

Binary file (1.33 kB). View file

api/service_manager.py CHANGED Viewed

@@ -1,12 +1,11 @@
-from api.web_server import app
 import uvicorn
 def run():
     """
-    TODO
     """
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 import uvicorn
+from .web_server import app
 def run():
     """
+    Start the FastAPI web server using Uvicorn.
     """
+    uvicorn.run(app, host="0.0.0.0", port=7860)

api/web_server.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import dill
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from core.search_engine import PromptSearchEngine
 class Query(BaseModel):
@@ -11,7 +12,7 @@ class Query(BaseModel):
 app = FastAPI()
-with open('api/engine.pickle', 'rb') as file:
     serialized_engine = file.read()
 prompt_search_engine = dill.loads(serialized_engine)
@@ -20,7 +21,22 @@ prompt_search_engine = dill.loads(serialized_engine)
 @app.post("/search/")
 async def search(query: Query):
     """
-    TODO
     """
     try:
@@ -31,7 +47,9 @@ async def search(query: Query):
             raise ValueError("Prompt must be a string")
         results = prompt_search_engine.most_similar(query.prompt, query.n)
-        formatted_results = [{"score": float(score), "description": desc} for score, desc in results]
         return formatted_results

 import dill
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
+# from ..core.search_engine import PromptSearchEngine
 class Query(BaseModel):
 app = FastAPI()
+with open("./engine.pickle", "rb") as file:
     serialized_engine = file.read()
 prompt_search_engine = dill.loads(serialized_engine)
 @app.post("/search/")
 async def search(query: Query):
     """
+    Find the most similar prompts to a given query prompt using the pre-trained PromptSearchEngine.
+    This endpoint accepts a query prompt and returns a specified number of the most similar prompts
+    from the corpus. It performs the following steps:
+    1. Validates the input types.
+    2. Uses the pre-loaded PromptSearchEngine to find the most similar prompts.
+    3. Formats the results into a list of dictionaries containing the similarity score and prompt text.
+    Args:
+        query (Query): The query model containing the prompt text and the number of similar prompts to return.
+    Returns:
+        List[Dict[str, Union[float, str]]]: A list of dictionaries where each dictionary contains the similarity score and the corresponding prompt.
+    Raises:
+        HTTPException: If an error occurs during the processing of the query, an HTTP 500 error is raised with the error details.
     """
     try:
             raise ValueError("Prompt must be a string")
         results = prompt_search_engine.most_similar(query.prompt, query.n)
+        formatted_results = [
+            {"score": float(score), "description": desc} for score, desc in results
+        ]
         return formatted_results

core/initialization.py CHANGED Viewed

@@ -1,11 +1,19 @@
 import dill
-from core.search_engine import PromptSearchEngine
-from core.data.dataset import PromptDataset
 def run():
     """
-    TODO
     """
     prompt_dataset = PromptDataset("Gustavosta/Stable-Diffusion-Prompts")
@@ -15,7 +23,5 @@ def run():
     serialized_engine = dill.dumps(engine)
-    with open("core/engine.pickle", "wb") as file:
         file.write(serialized_engine)
-run()

 import dill
+from .data.dataset import PromptDataset
+from .search_engine import PromptSearchEngine
 def run():
     """
+    Initialize the PromptSearchEngine with prompts from the specified dataset,
+    serialize the engine, and save it to a file.
+    This function performs the following steps:
+    1. Loads a dataset of prompts using the PromptDataset class.
+    2. Initializes the PromptSearchEngine with the loaded prompts.
+    3. Serializes the PromptSearchEngine instance using dill.
+    4. Saves the serialized engine to a file named 'engine.pickle'.
     """
     prompt_dataset = PromptDataset("Gustavosta/Stable-Diffusion-Prompts")
     serialized_engine = dill.dumps(engine)
+    with open("engine.pickle", "wb") as file:
         file.write(serialized_engine)

core/search_engine.py CHANGED Viewed

@@ -1,35 +1,50 @@
 from typing import List, Sequence, Tuple
-import numpy as np
 import faiss
 from .vectorizer import Vectorizer
-class PromptSearchEngine(object):
     """
-    TODO
     """
     def __init__(self, prompts: Sequence[str]) -> None:
         """
-        TODO
         """
         self.vectorizer = Vectorizer()
         self.corpus_vectors = self.vectorizer.transform(prompts)
         self.corpus = prompts
-        self.corpus_vectors = self.corpus_vectors / np.linalg.norm(self.corpus_vectors, axis=1, keepdims=True)
         d = self.corpus_vectors.shape[1]
         self.index = faiss.IndexFlatIP(d)
-        self.index.add(self.corpus_vectors.astype('float32'))
     def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
         """
-        TODO
         """
-        query_vector = self.vectorizer.transform([query]).astype('float32')
         query_vector = query_vector / np.linalg.norm(query_vector)
         distances, indices = self.index.search(query_vector, n)

 from typing import List, Sequence, Tuple
 import faiss
+import numpy as np
 from .vectorizer import Vectorizer
+class PromptSearchEngine:
     """
+    The PromptSearchEngine is responsible for finding the most similar prompts to a given query
+    by leveraging vectorized representations of the prompts and a similarity search index.
     """
     def __init__(self, prompts: Sequence[str]) -> None:
         """
+        Initialize the PromptSearchEngine with a list of prompts.
+        Args:
+            prompts (Sequence[str]): The sequence of raw corpus prompts to be indexed for similarity search.
         """
         self.vectorizer = Vectorizer()
         self.corpus_vectors = self.vectorizer.transform(prompts)
         self.corpus = prompts
+        self.corpus_vectors = self.corpus_vectors / np.linalg.norm(
+            self.corpus_vectors, axis=1, keepdims=True
+        )
         d = self.corpus_vectors.shape[1]
         self.index = faiss.IndexFlatIP(d)
+        self.index.add(self.corpus_vectors.astype("float32"))
     def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
         """
+        Find the most similar prompts to a given query.
+        Args:
+            query (str): The query prompt to search for similar prompts.
+            n (int, optional): The number of similar prompts to retrieve. Defaults to 5.
+        Returns:
+            List[Tuple[float, str]]: A list of tuples containing the similarity score and the corresponding prompt.
         """
+        query_vector = self.vectorizer.transform([query]).astype("float32")
         query_vector = query_vector / np.linalg.norm(query_vector)
         distances, indices = self.index.search(query_vector, n)

core/vectorizer.py CHANGED Viewed

@@ -1,16 +1,23 @@
 from typing import Sequence
 import numpy as np
 from sentence_transformers import SentenceTransformer
-class Vectorizer(object):
     """
-    TODO
     """
-    def __init__(self, model_name: str = 'all-MiniLM-L6-v2') -> None:
         """
         Initialize the vectorizer with a pre-trained embedding model.
         """
         self.model = SentenceTransformer(model_name)
@@ -18,17 +25,37 @@ class Vectorizer(object):
     def transform(self, prompts: Sequence[str]) -> np.ndarray:
         """
         Transform texts into numerical vectors using the specified model.
         """
         return self.model.encode(list(prompts))
     @staticmethod
-    def cosine_similarity(query_vector: np.ndarray, corpus_vectors: np.ndarray) -> np.ndarray:
         """
-        Calculate cosine similarity between prompt vectors.
         """
         query_norm = query_vector / np.linalg.norm(query_vector)
-        corpus_norms = corpus_vectors / np.linalg.norm(corpus_vectors, axis=1, keepdims=True)
         return np.dot(corpus_norms, query_norm.T).flatten()

 from typing import Sequence
 import numpy as np
 from sentence_transformers import SentenceTransformer
+class Vectorizer:
     """
+    The Vectorizer's role is to transform textual prompts into numerical vectors that can be
+    compared in a high-dimensional space. This transformation allows the system to quantify the
+    similarity between different prompts effectively.
     """
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
         """
         Initialize the vectorizer with a pre-trained embedding model.
+        Args:
+            model_name (str): The pre-trained embedding model to use for transforming prompts.
+                This can be any model that provides a method to convert texts into numerical vectors.
         """
         self.model = SentenceTransformer(model_name)
     def transform(self, prompts: Sequence[str]) -> np.ndarray:
         """
         Transform texts into numerical vectors using the specified model.
+        Args:
+            prompts (Sequence[str]): The sequence of raw corpus prompts to be transformed.
+        Returns:
+            np.ndarray: A numpy array containing the vectorized prompts. Each row corresponds to the
+                        vector representation of a prompt.
         """
         return self.model.encode(list(prompts))
     @staticmethod
+    def cosine_similarity(
+        query_vector: np.ndarray, corpus_vectors: np.ndarray
+    ) -> np.ndarray:
         """
+        Calculate cosine similarity between a query vector and a set of corpus vectors.
+        Args:
+            query_vector (np.ndarray): A numpy array representing the vector of the query prompt.
+            corpus_vectors (np.ndarray): A numpy array representing the vectors of the corpus prompts.
+                                         Each row corresponds to the vector representation of a corpus prompt.
+        Returns:
+            np.ndarray: A numpy array containing the cosine similarity scores between the query vector and each
+                of the corpus vectors.
         """
         query_norm = query_vector / np.linalg.norm(query_vector)
+        corpus_norms = corpus_vectors / np.linalg.norm(
+            corpus_vectors, axis=1, keepdims=True
+        )
         return np.dot(corpus_norms, query_norm.T).flatten()

requirements.txt CHANGED Viewed

@@ -1,8 +1,10 @@
 datasets==2.20.0
 dill==0.3.8
 faiss_cpu==1.8.0.post1
 fastapi==0.111.1
 pydantic==2.8.2
 sentence_transformers==3.0.1
 uvicorn==0.30.3
-numpy==1.23.5

+Requests==2.32.3
 datasets==2.20.0
 dill==0.3.8
 faiss_cpu==1.8.0.post1
 fastapi==0.111.1
+numpy==1.23.5
 pydantic==2.8.2
 sentence_transformers==3.0.1
+streamlit==1.36.0
 uvicorn==0.30.3

ui/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # This file is intentionally left empty

ui/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Requests==2.32.3
2	+ streamlit==1.36.0

ui/streamlit_app.py ADDED Viewed

	@@ -0,0 +1,27 @@

+"""
+This Streamlit application interfaces with a FastAPI backend to search for prompts based on user input.
+The user can enter a prompt and specify the number of similar results they want to retrieve.
+The application then sends a request to the FastAPI endpoint and displays the results.
+"""
+import requests
+import streamlit as st
+# The FastAPI server is started by uvicorn on port 7860 (see api/service_manager.py and the
+# Dockerfile's EXPOSE), so the UI must target that port rather than the previous 8000.
+API_URL = "http://localhost:7860/search/"
+# Upper bound in seconds for one search request; requests applies no timeout by default,
+# so without this an unresponsive API would block the UI indefinitely.
+REQUEST_TIMEOUT = 30
+st.title("Prompt Search Engine")
+prompt = st.text_input("Enter a prompt")
+n = st.slider("Number of results", 1, 20, 5)
+if st.button("Search"):
+    try:
+        response = requests.post(
+            API_URL, json={"prompt": prompt, "n": n}, timeout=REQUEST_TIMEOUT
+        )
+    except requests.RequestException as exc:
+        # Connection refused, DNS failure or timeout: report it instead of a raw traceback.
+        st.error(f"Error: Could not reach the search API ({exc})")
+    else:
+        if response.status_code == 200:
+            # Each item is {"score": ..., "description": ...} as built by the /search/ endpoint.
+            for result in response.json():
+                score = result["score"]
+                result_prompt = result["description"]
+                st.write(f"Score: {score:.4f}, Prompt: {result_prompt}")
+        else:
+            st.error("Error: Could not retrieve results")