In [1]:
import os
import subprocess
import threading
import time

# Set environment variable we need to support dual-GPU on Cirrus
os.environ["NCCL_P2P_LEVEL"] = "NVL"
os.environ["VLLM_API_KEY"] = os.getenv("OPENAI_API_KEY") # set same key for simplicity

# https://huggingface.co/spaces/mteb/leaderboard 
def run_vllm_server():
    subprocess.run([
        "vllm", "serve", "Qwen/Qwen3-Embedding-4B",
        "--host", "0.0.0.0",
        "--port", "8000",
        "--tensor-parallel-size", "2",
        "--trust-remote-code",
        "--gpu-memory-utilization", "0.4",
        "--enforce-eager",
        "--served-model-name", "local",
        "--task", "embed" # Run in embed mode!  (default is 'generate')
    ])

# Start server in daemon thread
server_thread = threading.Thread(target=run_vllm_server, daemon=True)
server_thread.start()

## give server time to start up:
import time
# Pause execution for 100 seconds
time.sleep(200)

INFO 06-07 22:36:36 [__init__.py:243] Automatically detected platform cuda.
INFO 06-07 22:36:40 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 06-07 22:36:40 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 06-07 22:36:40 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 06-07 22:36:41 [api_server.py:1289] vLLM API server version 0.9.0.1
INFO 06-07 22:36:41 [cli_args.py:300] non-default args: {'host': '0.0.0.0', 'task': 'embed', 'trust_remote_code': True, 'enforce_eager': True, 'served_model_name': ['local'], 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.4}
INFO 06-07 22:36:51 [config.py:473] Found sentence-transformers modules configuration.
INFO 06-07 22:36:52 [config.py:493] Found pooling configuration.
INFO 06-07 22:36:52 [config.py:1875] Defaulting to use mp for distributed inference
INFO 06-07 22:36:52

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:02<00:02,  2.96s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:09<00:00,  4.92s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:09<00:00,  4.63s/it]



INFO 06-07 22:38:24 [default_loader.py:280] Loading weights took 9.43 seconds
[1;36m(VllmWorkerProcess pid=16602)[0;0m INFO 06-07 22:38:24 [default_loader.py:280] Loading weights took 9.14 seconds
INFO 06-07 22:38:24 [model_runner.py:1202] Model loading took 3.8162 GiB and 75.582441 seconds
[1;36m(VllmWorkerProcess pid=16602)[0;0m INFO 06-07 22:38:25 [model_runner.py:1202] Model loading took 3.8162 GiB and 75.610664 seconds
INFO 06-07 22:38:25 [api_server.py:1336] Starting vLLM API server on http://0.0.0.0:8000
INFO 06-07 22:38:25 [launcher.py:28] Available routes are:
INFO 06-07 22:38:25 [launcher.py:36] Route: /openapi.json, Methods: GET, HEAD
INFO 06-07 22:38:25 [launcher.py:36] Route: /docs, Methods: GET, HEAD
INFO 06-07 22:38:25 [launcher.py:36] Route: /docs/oauth2-redirect, Methods: GET, HEAD
INFO 06-07 22:38:25 [launcher.py:36] Route: /redoc, Methods: GET, HEAD
INFO 06-07 22:38:25 [launcher.py:36] Route: /health, Methods: GET
INFO 06-07 22:38:25 [launcher.py:36] Route: /load

INFO:     Started server process [16093]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


In [None]:
## wait for output above to print routes, ending with: 
## INFO:     Application startup complete.


In [2]:
## NOTE!  You must wait until the log above finishes and not just the cell.
## Connect to the local model
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(
                 model = "local", ## served model name
                 api_key = os.getenv("OPENAI_API_KEY"),
                 base_url = "http://localhost:8000/v1",
)

## test that the model can do embeddings
from langchain_core.vectorstores import InMemoryVectorStore
vectorstore = InMemoryVectorStore.from_texts(["test text"], embedding=embedding)

INFO 06-07 22:39:51 [logger.py:42] Received request embd-9059fd2aadf84d8288c45ca3ecc8cd3c-0: prompt: ' product down', params: PoolingParams(dimensions=None, additional_metadata=None), prompt_token_ids: [1985, 1495], prompt_embeds shape: None, lora_request: None, prompt_adapter_request: None.
INFO 06-07 22:39:51 [engine.py:316] Added request embd-9059fd2aadf84d8288c45ca3ecc8cd3c-0.
INFO 06-07 22:39:53 [metrics.py:486] Avg prompt throughput: 0.2 tokens/s, Avg generation throughput: 0.1 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
INFO:     127.0.0.1:36040 - "POST /v1/embeddings HTTP/1.1" 200 OK
INFO 06-07 22:40:03 [metrics.py:486] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.


In [22]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient(path = "hwc_qdrant.db")

client.create_collection(
    collection_name="demo_collection",
    vectors_config=VectorParams(size=2560, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection",
    embedding=embedding
)

INFO 06-07 23:16:55 [logger.py:42] Received request embd-60cc555da14743da90337ba88edfe7cf-0: prompt: ' Stainless">\r\n', params: PoolingParams(dimensions=None, additional_metadata=None), prompt_token_ids: [32490, 4424], prompt_embeds shape: None, lora_request: None, prompt_adapter_request: None.
INFO 06-07 23:16:55 [engine.py:316] Added request embd-60cc555da14743da90337ba88edfe7cf-0.
INFO:     127.0.0.1:53594 - "POST /v1/embeddings HTTP/1.1" 200 OK
INFO 06-07 23:17:05 [metrics.py:486] Avg prompt throughput: 0.1 tokens/s, Avg generation throughput: 0.1 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
INFO 06-07 23:17:15 [metrics.py:486] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.


In [28]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees Fahrenheit.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)

INFO 06-07 23:19:58 [logger.py:42] Received request embd-d18cf8a824104f36a07c970cfa3e999f-0: prompt: 'Iys landscape SL defends and-fired Philadelphia for Utils wh whose.', params: PoolingParams(dimensions=None, additional_metadata=None), prompt_token_ids: [40, 1047, 18414, 16797, 80960, 323, 71578, 19335, 369, 17954, 420, 6693, 13], prompt_embeds shape: None, lora_request: None, prompt_adapter_request: None.
INFO 06-07 23:19:58 [logger.py:42] Received request embd-d18cf8a824104f36a07c970cfa3e999f-1: prompt: 'eg observ.price for guitar is=! andshional,iz aik of  deep(long\\Category.', params: PoolingParams(dimensions=None, additional_metadata=None), prompt_token_ids: [791, 9282, 18057, 369, 16986, 374, 74649, 323, 927, 3914, 11, 449, 264, 1579, 315, 220, 5538, 12628, 69823, 13], prompt_embeds shape: None, lora_request: None, prompt_adapter_request: None.
INFO 06-07 23:19:58 [logger.py:42] Received request embd-d18cf8a824104f36a07c970cfa3e999f-2: prompt: '.setPropertyata dram j.Iiz��shar

['878ac59d-e276-4d29-b715-6b20cd0f76f8',
 '5e5c960e-7c02-446d-9ad4-ca930ec3b57a',
 '8f478874-a730-4e19-b5f2-ef3caeb96a9d',
 'ed8dd908-a4da-4b20-b3d3-b2778846298d',
 'cc487e15-f7f3-46b6-a0a4-d0be9955cdaa',
 '5f380da0-9b78-4051-beea-6aa98421df33',
 'd5fa20d1-cc94-4613-ac64-b5ba3830589c',
 'cb0cbe18-5118-45f6-ae9c-1452b43cb92e',
 '15181e21-fa5d-45d0-b97b-3e206d671101',
 '1e769b7d-d327-465e-9b5b-3274b159ede3']

INFO 06-07 23:20:08 [metrics.py:486] Avg prompt throughput: 12.1 tokens/s, Avg generation throughput: 0.8 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
INFO 06-07 23:20:19 [metrics.py:486] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.


In [3]:
import os
import requests
import zipfile
import pathlib
from langchain_community.document_loaders import PyPDFLoader

def download_and_unzip(url, output_dir):
    response = requests.get(url)
    zip_file_path = os.path.basename(url)
    with open(zip_file_path, 'wb') as f:
        f.write(response.content)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
    os.remove(zip_file_path)

def pdf_loader(path):
    all_documents = []
    docs_dir = pathlib.Path(path)
    for file in docs_dir.iterdir():
        loader = PyPDFLoader(file)
        documents = loader.load()
        all_documents.extend(documents)
    return all_documents


download_and_unzip("https://minio.carlboettiger.info/public-data/hwc.zip", 'hwc')
docs = pdf_loader('hwc/')

In [25]:
# Build a retrival agent
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [27]:
# slow part here, runs on remote GPU
vectorstore = vector_store.from_documents(documents=splits, embedding = embedding)
retriever = vectorstore.as_retriever()

ResponseHandlingException: [Errno 111] Connection refused

In [4]:
# Set up the Chat model from one of the NRP models
import os
api_key = os.getenv("OPENAI_API_KEY")

# see `curl -H "Authorization: Bearer $OPENAI_API_KEY" https://llm.nrp-nautilus.io/v1/models`
models = {"llama3": "llama3-sdsc", 
          "deepseek-small": "DeepSeek-R1-Distill-Qwen-32B",
          "deepseek": "deepseek-r1-qwen-qualcomm",
          "gemma3": "gemma3",
          "phi3": "phi3",
          "olmo": "olmo"
         }

from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model = models["gemma3"], 
                 api_key = api_key, 
                 base_url = "https://llm.nrp-nautilus.io",  
                 temperature=0)

# Embedding model from NRP usually times out.
#embedding = OpenAIEmbeddings(model = "embed-mistral", api_key = api_key, base_url = "https://llm.nrp-nautilus.io")


In [7]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use five sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [None]:
prompt = "I live in Tanzania and am having issues with lions breaking into my boma and preying on cattle. What interventions might work best for me?"
results = rag_chain.invoke({"input": prompt})
results

In [None]:
prompt = "What are the most cost-effective prevention methods for elephants raiding my crops?"

results = rag_chain.invoke({"input": prompt})
results

In [None]:
rag_chain.invoke({"input": 
                  "I have a small herd of goats and cattle and I am worried about jaguars preying on them. What preventative measures can I take?"
                 })

In [None]:
rag_chain.invoke({"input": "I am trying to prevent coyotes from eating the calves of my free-range cattle. What may work best?"})

In [None]:
rag_chain.invoke({"input": "We have major issues with deer raiding our large agricultural fields. Is there anything I can try to prevent this that won’t break the bank?"})

In [None]:
rag_chain.invoke({"input": "We live in a suburban area and bears sometimes come into our town to eat from our fruit trees and trash. What are the best ways for us to prevent this as a community? We don’t want to have to get rid of our fruit trees…"})

In [8]:
prompt = "What cattle husbandry strategies might be helpful to prevent conflict if we live in wolf country?"

rag_chain.invoke({"input": prompt})

INFO 06-07 22:45:49 [logger.py:42] Received request embd-651e9b5086e647569840aece45d970bc-0: prompt: 'yl descent.session anyInv os beAccording tocategory Ltd D public quick in StatefulWidget.\r\n?', params: PoolingParams(dimensions=None, additional_metadata=None), prompt_token_ids: [3923, 36952, 10177, 894, 15174, 2643, 387, 11190, 311, 5471, 12324, 422, 584, 3974, 304, 37642, 3224, 30], prompt_embeds shape: None, lora_request: None, prompt_adapter_request: None.
INFO 06-07 22:45:49 [engine.py:316] Added request embd-651e9b5086e647569840aece45d970bc-0.
INFO:     127.0.0.1:45182 - "POST /v1/embeddings HTTP/1.1" 200 OK
INFO 06-07 22:45:59 [metrics.py:486] Avg prompt throughput: 1.2 tokens/s, Avg generation throughput: 0.1 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.


{'input': 'What cattle husbandry strategies might be helpful to prevent conflict if we live in wolf country?',
 'context': [Document(id='b2071fd8-ba63-4f55-aaed-5469eabef499', metadata={'producer': 'PDF Architect 3', 'creator': 'PDF Architect 3', 'creationdate': '2017-01-25T14:50:41+00:00', 'author': 'V. Pimenta', 'moddate': '2017-01-25T14:52:31+00:00', 'source': 'hwc/Pimenta et al. 2017.pdf', 'total_pages': 20, 'page': 5, 'page_label': '6'}, page_content='4. Discussion\nOur study examined a human-wildlife conﬂict involving wolf preda-\ntion on cattle in northern Portugal, showing that the problem may be\nworsening due to increased predation levels, though only a minority\nof cattle farms was heavily affected. We found that predation was par-\nticularly high on cattle farms using a free-ranging husbandry system,\nbut also that predation problems within this system were largely con-\ncentrated on the few herds that were left unconﬁned at night in winter.\nIn contrast, we found that farm

INFO 06-07 22:46:09 [metrics.py:486] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.


In [None]:
## DRAFT exploring other embedding databases

import os
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import gc
import torch

# Option 1: FAISS (Facebook AI Similarity Search) - Most memory efficient
def create_faiss_vectorstore(splits, embedding, persist_directory="./faiss_db", batch_size=100):
    """
    Create FAISS vector store with batched processing to minimize GPU RAM usage
    """
    os.makedirs(persist_directory, exist_ok=True)
    
    # Process documents in batches to avoid GPU memory overflow
    vectorstore = None
    
    for i in range(0, len(splits), batch_size):
        batch = splits[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(splits) + batch_size - 1)//batch_size}")
        
        if vectorstore is None:
            # Create initial vectorstore with first batch
            vectorstore = FAISS.from_documents(
                documents=batch,
                embedding=embedding
            )
        else:
            # Add subsequent batches to existing vectorstore
            batch_vectorstore = FAISS.from_documents(
                documents=batch,
                embedding=embedding
            )
            vectorstore.merge_from(batch_vectorstore)
            
            # Clean up temporary vectorstore
            del batch_vectorstore
        
        # Force garbage collection and clear GPU cache if using CUDA
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    # Save to disk
    vectorstore.save_local(persist_directory)
    print(f"Vector store saved to {persist_directory}")
    
    return vectorstore

def load_faiss_vectorstore(embedding, persist_directory="./faiss_db"):
    """Load existing FAISS vector store from disk"""
    return FAISS.load_local(
        persist_directory,
        embedding,
        allow_dangerous_deserialization=True  # Only if you trust the source
    )

# Option 2: Chroma - Persistent SQLite-based storage
def create_chroma_vectorstore(splits, embedding, persist_directory="./chroma_db", batch_size=100):
    """
    Create Chroma vector store with batched processing
    """
    # Initialize Chroma with persistence
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding
    )
    
    # Add documents in batches
    for i in range(0, len(splits), batch_size):
        batch = splits[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(splits) + batch_size - 1)//batch_size}")
        
        vectorstore.add_documents(batch)
        
        # Force garbage collection and clear GPU cache
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    # Persist to disk
    vectorstore.persist()
    print(f"Vector store persisted to {persist_directory}")
    
    return vectorstore

def load_chroma_vectorstore(embedding, persist_directory="./chroma_db"):
    """Load existing Chroma vector store from disk"""
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding
    )

# Option 3: Qdrant - High-performance vector database
def create_qdrant_vectorstore(splits, embedding, collection_name="documents", 
                            path="./qdrant_db", batch_size=100):
    """
    Create Qdrant vector store with local file-based storage
    """
    # Initialize local Qdrant client
    client = QdrantClient(path=path)
    
    # Get embedding dimension (embed a sample text)
    sample_embedding = embedding.embed_query("sample text")
    embedding_dim = len(sample_embedding)
    
    # Create collection if it doesn't exist
    try:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE)
        )
    except Exception as e:
        print(f"Collection might already exist: {e}")
    
    # Create vectorstore
    vectorstore = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embedding
    )
    
    # Add documents in batches
    for i in range(0, len(splits), batch_size):
        batch = splits[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(splits) + batch_size - 1)//batch_size}")
        
        vectorstore.add_documents(batch)
        
        # Force garbage collection and clear GPU cache
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    print(f"Vector store created in {path}")
    return vectorstore

def load_qdrant_vectorstore(embedding, collection_name="documents", path="./qdrant_db"):
    """Load existing Qdrant vector store from disk"""
    client = QdrantClient(path=path)
    return Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embedding
    )


In [None]:

# Usage examples:

# Replace your original code with one of these options:

# Option 1: FAISS (Recommended for most use cases)
vectorstore = create_faiss_vectorstore(
    splits=splits, 
    embedding=embedding, 
    persist_directory="./my_faiss_db",
    batch_size=50  # Adjust based on your GPU memory
)

# To load later:
# vectorstore = load_faiss_vectorstore(embedding, "./my_faiss_db")

# Option 2: Chroma (Good for development and moderate scale)
# vectorstore = create_chroma_vectorstore(
#     splits=splits,
#     embedding=embedding,
#     persist_directory="./my_chroma_db",
#     batch_size=50
# )

# Option 3: Qdrant (Best for production and very large scale)
# vectorstore = create_qdrant_vectorstore(
#     splits=splits,
#     embedding=embedding,
#     collection_name="my_documents",
#     path="./my_qdrant_db",
#     batch_size=50
# )

# Memory optimization settings
def optimize_gpu_memory():
    """Additional GPU memory optimization"""
    if torch.cuda.is_available():
        # Set memory fraction if needed
        torch.cuda.set_per_process_memory_fraction(0.8)  # Use 80% of GPU memory
        
        # Enable memory mapping for large tensors
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

# Call before processing if you have GPU memory issues
# optimize_gpu_memory()