import pytest
import os
from unittest.mock import patch, MagicMock
# Make sure 'utils' is discoverable, or adjust the import path accordingly.
# This may require __init__.py files in 'utils' and 'tests' and a correct
# pythonpath; see the conftest.py sketch below the imports.
from utils.rag_utils import load_and_split_documents, get_embedding_model, EMBEDDING_MODEL_NAME
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
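
# One minimal way to make 'utils' importable without packaging (a sketch only,
# assuming this file lives in tests/ alongside a top-level utils/ package) is a
# tests/conftest.py that prepends the project root to sys.path:
#
#     # tests/conftest.py
#     import os
#     import sys
#     sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
#
# Alternatively, pytest's `pythonpath` ini option (pytest >= 7.0) or an editable
# install of the project achieves the same thing.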

# --- Tests for load_and_split_documents ---

def test_load_and_split_documents_no_directory(tmp_path):
    """Test behavior when the persona data directory does not exist."""
    persona_id = "non_existent_persona"
    result_docs = load_and_split_documents(persona_id, data_path=str(tmp_path))
    assert result_docs == []

def test_load_and_split_documents_no_txt_files(tmp_path):
    """Test behavior when directory exists but contains no .txt files."""
    persona_id = "empty_persona"
    persona_dir = tmp_path / persona_id
    persona_dir.mkdir()
    (persona_dir / "other_file.md").write_text("some markdown content")
    result_docs = load_and_split_documents(persona_id, data_path=str(tmp_path))
    assert result_docs == []

def test_load_and_split_documents_loads_and_splits_txt_files(tmp_path):
    """Test successful loading and splitting of .txt files."""
    persona_id = "test_persona"
    data_sources_path = tmp_path / "data_sources"  # Simulate the data_sources structure
    data_sources_path.mkdir()
    persona_dir = data_sources_path / persona_id
    persona_dir.mkdir()
    # Create dummy .txt files
    (persona_dir / "doc1.txt").write_text("This is the first document. It has some text.")
    (persona_dir / "doc2.txt").write_text("Another document here with more words to ensure splitting might occur if long enough.")

    # This test runs the real DirectoryLoader and RecursiveCharacterTextSplitter to
    # verify basic integration: documents are loaded from disk and passed through
    # the splitter. Testing the loader's internals is out of scope here; the next
    # test mocks both components to verify the exact parameters and call flow
    # without touching the filesystem.
    split_docs = load_and_split_documents(persona_id, data_path=str(data_sources_path))

    assert len(split_docs) > 0  # At least one chunk per document; more if a document is long enough to be split
    assert isinstance(split_docs[0], Document)
    # Check that content from the original documents is present (simplified check)
    content_doc1_present = any("first document" in doc.page_content for doc in split_docs)
    content_doc2_present = any("Another document" in doc.page_content for doc in split_docs)
    assert content_doc1_present or content_doc2_present  # At least one should be found since the files are small
    # A more robust test would mock text_splitter.split_documents and verify it is
    # called with the loaded docs; the parameter test below does exactly that.

def test_load_and_split_documents_uses_correct_loader_and_splitter_params():
    """Test that DirectoryLoader and RecursiveCharacterTextSplitter are called with expected parameters."""
    persona_id = "params_test_persona"
    data_path = "dummy_data_path"
    dummy_persona_path = os.path.join(data_path, persona_id)
    # Mock os.path.isdir to simulate directory existence
    with patch('os.path.isdir', return_value=True):
        # Mock DirectoryLoader
        mock_doc_instance = Document(page_content="Test content from loader.")
        mock_loader_instance = MagicMock()
        mock_loader_instance.load.return_value = [mock_doc_instance]  # Simulate loader returning one doc
        with patch('utils.rag_utils.DirectoryLoader', return_value=mock_loader_instance) as mock_directory_loader:
            # Mock RecursiveCharacterTextSplitter
            mock_splitter_instance = MagicMock()
            mock_splitter_instance.split_documents.return_value = [Document(page_content="Split chunk 1")]  # Simulate splitter returning one chunk
            with patch('utils.rag_utils.RecursiveCharacterTextSplitter', return_value=mock_splitter_instance) as mock_text_splitter:
                load_and_split_documents(persona_id, data_path=data_path)

                # Assert DirectoryLoader was called correctly
                mock_directory_loader.assert_called_once_with(
                    dummy_persona_path,
                    glob="**/*.txt",
                    loader_cls=UnstructuredFileLoader,
                    show_progress=True,
                    use_multithreading=True,
                    silent_errors=True
                )
                mock_loader_instance.load.assert_called_once()

                # Assert RecursiveCharacterTextSplitter was called correctly
                mock_text_splitter.assert_called_once_with(
                    chunk_size=1000,
                    chunk_overlap=150,
                    length_function=len,
                    is_separator_regex=False
                )
                mock_splitter_instance.split_documents.assert_called_once_with([mock_doc_instance])

# --- Tests for get_embedding_model ---

def test_get_embedding_model_default():
    """Test that get_embedding_model returns a HuggingFaceEmbeddings instance with the default model."""
    # Patching the HuggingFaceEmbeddings constructor to avoid actual model loading/download
    with patch('utils.rag_utils.HuggingFaceEmbeddings') as mock_hf_embeddings:
        mock_instance = MagicMock(spec=HuggingFaceEmbeddings)
        mock_hf_embeddings.return_value = mock_instance

        embedding_model = get_embedding_model()

        mock_hf_embeddings.assert_called_once_with(model_name=EMBEDDING_MODEL_NAME)
        assert embedding_model == mock_instance

def test_get_embedding_model_custom_name():
    """Test get_embedding_model with a custom model name."""
    custom_model = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    with patch('utils.rag_utils.HuggingFaceEmbeddings') as mock_hf_embeddings:
        mock_instance = MagicMock(spec=HuggingFaceEmbeddings)
        mock_hf_embeddings.return_value = mock_instance

        embedding_model = get_embedding_model(model_name=custom_model)

        mock_hf_embeddings.assert_called_once_with(model_name=custom_model)
        assert embedding_model == mock_instance
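
# For reference, a rough sketch of the interface these tests assume from
# utils/rag_utils.py (the real module may differ in defaults and details):
#
#     EMBEDDING_MODEL_NAME = "..."  # default sentence-transformers model id
#
#     def load_and_split_documents(persona_id, data_path=...):
#         # Returns [] when os.path.join(data_path, persona_id) is not a directory;
#         # otherwise loads "**/*.txt" via DirectoryLoader(loader_cls=UnstructuredFileLoader)
#         # and chunks the result with RecursiveCharacterTextSplitter(
#         #     chunk_size=1000, chunk_overlap=150, length_function=len,
#         #     is_separator_regex=False).
#
#     def get_embedding_model(model_name=EMBEDDING_MODEL_NAME):
#         # Returns HuggingFaceEmbeddings(model_name=model_name).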