rohit committed on
Commit
91f974c
·
1 Parent(s): 9474afa

Deploy RAG Pipeline with FastAPI and ML capabilities

Browse files
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Create and switch to an unprivileged user (UID 1000) for everything below.
RUN useradd -m -u 1000 user
USER user
# Put the user's local bin directory on PATH (where `uvicorn` is expected to
# land after the pip install below — TODO confirm for this base image).
ENV PATH="/home/user/.local/bin:$PATH"

# app/main.py reports os.getenv("PORT") in its startup logs and in /health,
# defaulting to 8000 when unset. Export the port the server really binds to
# (see CMD) so those reports are accurate.
ENV PORT=7860

WORKDIR /app

# Copy requirements first so the dependency layer is cached independently of
# application code changes.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=user app/ app/
COPY --chown=user start.sh .

# Make start script executable (kept for manual/debug use; CMD below starts
# uvicorn directly).
RUN chmod +x start.sh

# Expose HF Space port
EXPOSE 7860

# Run the FastAPI application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (176 Bytes). View file
 
app/__pycache__/config.cpython-311.pyc ADDED
Binary file (5.34 kB). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (13.7 kB). View file
 
app/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (7.24 kB). View file
 
app/config.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Optional, List
2
+ from dataclasses import dataclass
3
+ from haystack.dataclasses import ChatMessage
4
+
5
@dataclass
class DatasetConfig:
    """Describes how one dataset is loaded and turned into prompt context."""

    # Dataset identifier passed to the dataset loader.
    name: str
    # Dataset split to read.
    split: str = "train"
    # Name of the dataset column used as the document content.
    content_field: str = "content"
    # Mapping of metadata key -> dataset column. ``None`` (the default) means
    # no metadata is extracted, so the annotation must admit ``None``.
    fields: Optional[Dict[str, str]] = None
    # Template text for the prompt built from the retrieved documents.
    prompt_template: Optional[str] = None
12
+
13
+ # Default configurations for different datasets
14
+ DATASET_CONFIGS = {
15
+ "awesome-chatgpt-prompts": DatasetConfig(
16
+ name="fka/awesome-chatgpt-prompts",
17
+ content_field="prompt",
18
+ fields={
19
+ "role": "act",
20
+ "prompt": "prompt"
21
+ },
22
+ prompt_template="""
23
+ Given the following context where each document represents a prompt for a specific role,
24
+ please answer the question while considering both the role and the prompt content.
25
+
26
+ Available Contexts:
27
+ {% for document in documents %}
28
+ {% if document.meta.role %}Role: {{ document.meta.role }}{% endif %}
29
+ Content: {{ document.content }}
30
+ ---
31
+ {% endfor %}
32
+
33
+ Question: {{question}}
34
+ Answer:
35
+ """
36
+ ),
37
+ "settings-dataset": DatasetConfig(
38
+ name="syntaxhacker/rag_pipeline",
39
+ content_field="context",
40
+ fields={
41
+ "question": "question",
42
+ "answer": "answer",
43
+ "context": "context"
44
+ },
45
+ prompt_template="""
46
+ Given the following context about software settings and configurations,
47
+ please answer the question accurately based on the provided information.
48
+
49
+ For each setting, provide a clear, step-by-step navigation path and include:
50
+ 1. The exact location (Origin Type > Tab > Section > Setting name)
51
+ 2. What the setting does
52
+ 3. Available options/values
53
+ 4. How to access and modify the setting
54
+ 5. Reference screenshots (if available)
55
+
56
+ Format your answer as:
57
+ "To [accomplish task], follow these steps:
58
+
59
+ Location: [Origin Type] > [Tab] > [Section] > [Setting name]
60
+ Purpose: [describe what the setting does]
61
+ Options: [list available values/options]
62
+ How to set: [describe interaction method: toggle/select/input]
63
+
64
+ Visual Guide:
65
+ [Include reference image links if available]
66
+
67
+ For more details, you can refer to the screenshots above showing the exact location and interface."
68
+
69
+ Available Contexts:
70
+ {% for document in documents %}
71
+ Setting Info: {{ document.content }}
72
+ Reference Answer: {{ document.meta.answer }}
73
+ ---
74
+ {% endfor %}
75
+
76
+ Question: {{question}}
77
+ Answer:
78
+ """
79
+ ),
80
+ "seven-wonders": DatasetConfig(
81
+ name="bilgeyucel/seven-wonders",
82
+ content_field="content",
83
+ fields={}, # No additional fields needed
84
+ prompt_template="""
85
+ Given the following information about the Seven Wonders, please answer the question.
86
+
87
+ Context:
88
+ {% for document in documents %}
89
+ {{ document.content }}
90
+ {% endfor %}
91
+
92
+ Question: {{question}}
93
+ Answer:
94
+ """
95
+ ),
96
+ "psychology-dataset": DatasetConfig(
97
+ name="jkhedri/psychology-dataset",
98
+ split="train",
99
+ content_field="question", # Assuming we want to use the question as the content
100
+ fields={
101
+ "response_j": "response_j", # Response from one model
102
+ "response_k": "response_k" # Response from another model
103
+ },
104
+ prompt_template="""
105
+ Given the following context where each document represents a psychological inquiry,
106
+ please answer the question based on the provided responses.
107
+
108
+ Available Contexts:
109
+ {% for document in documents %}
110
+ Question: {{ document.content }}
111
+ Response J: {{ document.meta.response_j }}
112
+ Response K: {{ document.meta.response_k }}
113
+ ---
114
+ {% endfor %}
115
+
116
+ Question: {{question}}
117
+ Answer:
118
+ """
119
+ ),
120
+ "developer-portfolio": DatasetConfig(
121
+ name="syntaxhacker/developer-portfolio-rag",
122
+ split="train",
123
+ content_field="answer",
124
+ fields={
125
+ "question": "question",
126
+ "answer": "answer",
127
+ "context": "context"
128
+ },
129
+ prompt_template="""
130
+ Given the following context about a software developer's skills, experience, and background,
131
+ please answer the question accurately based on the provided information.
132
+
133
+ For each query, provide detailed information about:
134
+ 1. Technical skills and programming languages
135
+ 2. Machine learning and AI experience
136
+ 3. Projects and professional experience
137
+ 4. Tools and frameworks used
138
+ 5. Personal interests and learning approach
139
+
140
+ Available Contexts:
141
+ {% for document in documents %}
142
+ Question: {{ document.meta.question }}
143
+ Answer: {{ document.content }}
144
+ Context: {{ document.meta.context }}
145
+ ---
146
+ {% endfor %}
147
+
148
+ Question: {{question}}
149
+ Answer:
150
+ """
151
+ ),
152
+ }
153
+
154
+ # Default configuration for embedding and LLM models
155
+ MODEL_CONFIG = {
156
+ "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
157
+ "llm_model": "gemini-2.0-flash-exp",
158
+ }
app/main.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ import os
4
+ import logging
5
+ import sys
6
+ from .config import DATASET_CONFIGS
7
+ # Lazy imports to avoid blocking startup
8
+ # from .pipeline import RAGPipeline # Will import when needed
9
+ # import umap # Will import when needed for visualization
10
+ # import plotly.express as px # Will import when needed for visualization
11
+ # import plotly.graph_objects as go # Will import when needed for visualization
12
+ # from plotly.subplots import make_subplots # Will import when needed for visualization
13
+ # import numpy as np # Will import when needed for visualization
14
+ # from sklearn.preprocessing import normalize # Will import when needed for visualization
15
+ # import pandas as pd # Will import when needed for visualization
16
+ import json
17
+
18
# Send all log records to stdout with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="RAG Pipeline API",
    description="Multi-dataset RAG API",
    version="1.0.0",
)

# Registry of loaded pipelines, keyed by dataset name. It starts empty and is
# filled in after the server has started.
pipelines = {}
google_api_key = os.environ.get("GOOGLE_API_KEY")

# Startup diagnostics.
logger.info("Starting RAG Pipeline API")
logger.info("Port from env: %s", os.environ.get("PORT", "Not set - will use 8000"))
logger.info("Google API Key present: %s", "Yes" if google_api_key else "No")
logger.info("Available datasets: %s", list(DATASET_CONFIGS))

# Datasets are deliberately not loaded at import time.
logger.info("RAG Pipeline API is ready to serve requests - datasets will load in background")
41
+
42
+ # Visualization function disabled to speed up startup
43
+ # def create_3d_visualization(pipeline):
44
+ # ... (commented out for faster startup)
45
+
46
class Question(BaseModel):
    """Request body accepted by the ``/answer`` endpoint."""

    # The question text forwarded to the selected pipeline.
    text: str
    # Key of the dataset pipeline to query.
    dataset: str = "developer-portfolio"
49
+
50
@app.post("/answer")
async def get_answer(question: Question):
    """Answer a question with the pipeline of the requested dataset.

    Args:
        question: Request body carrying the question text and dataset key.

    Returns:
        ``{"answer": ..., "dataset": ...}`` on success. While no pipeline has
        been loaded yet, a payload with ``"status": "datasets_loading"`` is
        returned instead.

    Raises:
        HTTPException: 400 if the requested dataset has no loaded pipeline,
            500 if the pipeline fails while answering.
    """
    # Check if any pipelines are loaded
    if not pipelines:
        return {
            "answer": "RAG Pipeline is running but datasets are still loading in the background. Please try again in a moment, or check /health for loading status.",
            "dataset": question.dataset,
            "status": "datasets_loading"
        }

    # Validate outside the try block: previously this 400 was caught by the
    # blanket ``except Exception`` below and reported to the client as a 500.
    if question.dataset not in pipelines:
        raise HTTPException(
            status_code=400,
            detail=f"Dataset '{question.dataset}' not available. Available datasets: {list(pipelines.keys())}",
        )

    selected_pipeline = pipelines[question.dataset]
    try:
        answer = selected_pipeline.answer_question(question.text)
    except HTTPException:
        # Never rewrap an HTTP error that already carries its own status.
        raise
    except Exception as e:
        logger.exception("Failed to answer question for dataset '%s'", question.dataset)
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"answer": answer, "dataset": question.dataset}
70
+
71
@app.get("/datasets")
async def list_datasets():
    """Return the names of the datasets whose pipelines are currently loaded."""
    loaded_names = [dataset_name for dataset_name in pipelines]
    return {"datasets": loaded_names}
75
+
76
async def load_datasets_background():
    """Load the dataset pipeline without blocking the event loop.

    Only the ``developer-portfolio`` dataset is loaded, to save memory. The
    import of the pipeline module and the pipeline construction are
    synchronous, so they run in a worker thread; running them directly inside
    this coroutine would stall every request until loading finished. The
    result is stored in the module-level ``pipelines`` registry. Failures are
    logged rather than raised, so the API keeps serving without datasets.
    """
    if not google_api_key:
        logger.warning("No Google API key provided - running in demo mode without datasets")
        return

    import asyncio

    # Only load developer-portfolio to save memory
    dataset_name = "developer-portfolio"

    def _build_pipeline():
        # Import RAGPipeline only when needed; done inside the worker thread
        # because the import itself is part of the blocking work.
        from .pipeline import RAGPipeline

        return RAGPipeline.from_preset(
            google_api_key=google_api_key,
            preset_name=dataset_name,
        )

    try:
        logger.info(f"Loading dataset: {dataset_name}")
        # The registry is only mutated here, back on the event-loop thread.
        pipelines[dataset_name] = await asyncio.to_thread(_build_pipeline)
        logger.info(f"Successfully loaded {dataset_name}")
    except Exception as e:
        logger.error(f"Failed to load {dataset_name}: {e}")
    logger.info(f"Background loading complete - {len(pipelines)} datasets loaded")
97
+
98
@app.on_event("startup")
async def startup_event():
    """Log startup and schedule dataset loading as a background task."""
    import asyncio

    logger.info("FastAPI application startup complete")
    logger.info(f"Server should be running on port: {os.getenv('PORT', '8000')}")

    # Start loading datasets in background. The event loop only keeps a weak
    # reference to tasks, so hold a strong one on the application state;
    # otherwise the task may be garbage-collected before it finishes.
    app.state.dataset_loader_task = asyncio.create_task(load_datasets_background())
106
+
107
@app.on_event("shutdown")
async def shutdown_event():
    """Record in the log that the application is shutting down."""
    logger.info("FastAPI application shutting down")
110
+
111
@app.get("/")
async def root():
    """Return basic service information and the loaded dataset names."""
    payload = {"status": "ok", "message": "RAG Pipeline API", "version": "1.0.0"}
    payload["datasets"] = list(pipelines)
    return payload
115
+
116
@app.get("/health")
async def health_check():
    """Report service health and dataset loading progress."""
    logger.info("Health check called")
    portfolio_ready = "developer-portfolio" in pipelines
    return {
        "status": "healthy",
        "datasets_loaded": len(pipelines),
        # Only loading developer-portfolio
        "total_datasets": 1,
        "loading_status": "complete" if portfolio_ready else "loading",
        "port": os.environ.get("PORT", "8000"),
    }
128
+
app/pipeline.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack import Document, Pipeline
2
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
3
+ from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
4
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
5
+ from haystack.components.builders import ChatPromptBuilder
6
+ from haystack_integrations.components.generators.google_ai import GoogleAIGeminiChatGenerator
7
+ from datasets import load_dataset
8
+ from haystack.dataclasses import ChatMessage
9
+ from typing import Optional, List, Union, Dict
10
+ from .config import DatasetConfig, DATASET_CONFIGS, MODEL_CONFIG
11
+
12
class RAGPipeline:
    """Retrieval-augmented generation pipeline over a single dataset.

    At construction time the documents are embedded and written to an
    in-memory document store. ``answer_question`` then embeds the question,
    retrieves documents, renders the dataset's prompt template and sends the
    prompt to the configured Gemini chat generator.
    """

    def __init__(
        self,
        google_api_key: str,
        dataset_config: Union[str, DatasetConfig],
        documents: Optional[List[Union[str, Document]]] = None,
        embedding_model: Optional[str] = None,
        llm_model: Optional[str] = None,
    ):
        """
        Initialize the RAG Pipeline.

        Args:
            google_api_key: API key for Google AI services. When empty, the
                generator falls back to the ``GOOGLE_API_KEY`` environment
                variable.
            dataset_config: Either a string key from DATASET_CONFIGS or a DatasetConfig object
            documents: Optional list of documents (plain strings or Document
                objects) to use instead of loading from a dataset
            embedding_model: Optional override for embedding model
            llm_model: Optional override for LLM model

        Raises:
            ValueError: If ``dataset_config`` is a string that is not a key of
                DATASET_CONFIGS.
        """
        # Function-scope imports so this class does not depend on changes to
        # the module's import block.
        import logging
        from haystack.utils import Secret

        log = logging.getLogger(__name__)

        # Load configuration
        if isinstance(dataset_config, str):
            if dataset_config not in DATASET_CONFIGS:
                raise ValueError(f"Dataset config '{dataset_config}' not found. Available configs: {list(DATASET_CONFIGS.keys())}")
            self.config = DATASET_CONFIGS[dataset_config]
        else:
            self.config = dataset_config

        # Load documents either from provided list or dataset
        if documents is not None:
            # The signature accepts plain strings, but the embedder only
            # handles Document objects, so wrap strings here.
            self.documents = [
                item if isinstance(item, Document) else Document(content=item)
                for item in documents
            ]
        else:
            self.documents = self._load_dataset_documents()

        # Preview the first documents at debug level rather than printing
        # dataset content to stdout unconditionally.
        for doc in self.documents[:10]:
            log.debug("Content: %s | Metadata: %s", doc.content, doc.meta)

        # Initialize components
        embedder_name = embedding_model or MODEL_CONFIG["embedding_model"]
        self.document_store = InMemoryDocumentStore()
        self.doc_embedder = SentenceTransformersDocumentEmbedder(model=embedder_name)
        self.text_embedder = SentenceTransformersTextEmbedder(model=embedder_name)
        self.retriever = InMemoryEmbeddingRetriever(self.document_store)

        # Warm up the document embedder; it is run directly during indexing
        # below, outside of any pipeline.
        self.doc_embedder.warm_up()

        # Initialize prompt template
        self.prompt_builder = ChatPromptBuilder(
            template=[ChatMessage.from_user(self.config.prompt_template)]
        )

        # Initialize the generator. Previously ``google_api_key`` was accepted
        # but never forwarded, so the explicit key was silently ignored.
        if google_api_key:
            api_key = Secret.from_token(google_api_key)
        else:
            api_key = Secret.from_env_var("GOOGLE_API_KEY")
        self.generator = GoogleAIGeminiChatGenerator(
            api_key=api_key,
            model=llm_model or MODEL_CONFIG["llm_model"],
        )

        # Index documents
        self._index_documents(self.documents)

        # Build pipeline
        self.pipeline = self._build_pipeline()

    @classmethod
    def from_preset(cls, google_api_key: str, preset_name: str):
        """
        Create a pipeline from a preset configuration.

        Args:
            google_api_key: API key for Google AI services
            preset_name: Name of the preset configuration to use
        """
        return cls(google_api_key=google_api_key, dataset_config=preset_name)

    def _load_dataset_documents(self):
        """Load the configured dataset split and convert each row to a Document."""
        dataset = load_dataset(self.config.name, split=self.config.split)
        field_map = self.config.fields or {}
        loaded = []
        for row in dataset:
            # Keep only the configured metadata fields present in this row.
            meta = {
                meta_key: row[dataset_field]
                for meta_key, dataset_field in field_map.items()
                if dataset_field in row
            }
            loaded.append(Document(content=row[self.config.content_field], meta=meta))
        return loaded

    def _index_documents(self, documents):
        """Embed ``documents`` and write them to the document store."""
        docs_with_embeddings = self.doc_embedder.run(documents)
        self.document_store.write_documents(docs_with_embeddings["documents"])

    def _build_pipeline(self):
        """Assemble the query pipeline: embedder -> retriever -> prompt -> LLM."""
        pipeline = Pipeline()
        pipeline.add_component("text_embedder", self.text_embedder)
        pipeline.add_component("retriever", self.retriever)
        pipeline.add_component("prompt_builder", self.prompt_builder)
        pipeline.add_component("llm", self.generator)

        # Connect components
        pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
        pipeline.connect("retriever", "prompt_builder")
        pipeline.connect("prompt_builder.prompt", "llm.messages")

        return pipeline

    def answer_question(self, question: str) -> str:
        """Run the RAG pipeline to answer a question.

        The question is given both to the text embedder and to the prompt
        builder; the text of the first LLM reply is returned.
        """
        result = self.pipeline.run({
            "text_embedder": {"text": question},
            "prompt_builder": {"question": question}
        })
        return result["llm"]["replies"][0].text
developer-portfolio-rag ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 761b11c411795efadbf92045b906c28125597810
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ haystack-ai==2.10.3
2
+ datasets==3.3.2
3
+ sentence-transformers==3.4.1
4
+ google-ai-haystack==5.1.0
5
+ fastapi==0.115.4
6
+ uvicorn==0.31.0
7
+ beautifulsoup4==4.12.0 # Stable HTML parsing
8
+ umap-learn==0.5.4
9
+ plotly==5.22.0
10
+ scikit-learn==1.4.1.post1
11
+ numpy>=1.20.0
12
+ pandas>=1.3.0
start.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
set -e  # Exit on any error

# Resolve the listening port once so the log line and the server agree.
port="${PORT:-8000}"

echo "=== RAG Pipeline Startup ==="
# Inside double quotes, single quotes in the default are printed literally,
# so the default is written without them.
echo "PORT environment variable: ${PORT:-not set}"
echo "Using port: ${port}"
if [ -n "$GOOGLE_API_KEY" ]; then
    echo "Google API Key present: Yes"
else
    echo "Google API Key present: No"
fi
echo "Starting uvicorn server..."
echo "=== End Startup Info ==="

# Debug the uvicorn command
echo "Current directory: $(pwd)"
echo "Python path: $PYTHONPATH"
echo "Contents of current directory:"
ls -la
echo "Contents of app directory:"
ls -la app/
echo "Testing Python import:"
# Best-effort check: a failed import is reported but does not abort startup.
python -c "import app.main; print('Import successful')" || echo "Import failed"
echo "Starting uvicorn..."
# exec replaces this shell, so uvicorn receives signals (e.g. SIGTERM) directly.
exec uvicorn app.main:app --host 0.0.0.0 --port "${port}" --log-level info