import os
from unittest.mock import MagicMock, patch

import pytest
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

# 'utils' has to be importable when the tests run; this may require
# __init__.py files in 'utils' and 'tests' and a correctly set PYTHONPATH.
from utils.rag_utils import (
    EMBEDDING_MODEL_NAME,
    get_embedding_model,
    load_and_split_documents,
)


# --- Tests for load_and_split_documents ---

def test_load_and_split_documents_no_directory(tmp_path):
    """A persona whose data directory is missing yields an empty list."""
    # Nothing is created under tmp_path, so the persona folder cannot exist.
    docs = load_and_split_documents("non_existent_persona", data_path=str(tmp_path))
    assert docs == []


def test_load_and_split_documents_no_txt_files(tmp_path):
    """A persona directory holding only non-.txt files yields an empty list."""
    persona_id = "empty_persona"
    persona_folder = tmp_path / persona_id
    persona_folder.mkdir()
    # The only file present is markdown, so no .txt file is available.
    (persona_folder / "other_file.md").write_text("some markdown content")

    docs = load_and_split_documents(persona_id, data_path=str(tmp_path))
    assert docs == []


def test_load_and_split_documents_loads_and_splits_txt_files(tmp_path):
    """Real .txt files in a persona directory come back as Document chunks."""
    persona_id = "test_persona"
    # Mirror the on-disk layout: <data_sources>/<persona_id>/*.txt
    data_root = tmp_path / "data_sources"
    persona_folder = data_root / persona_id
    persona_folder.mkdir(parents=True)

    sample_files = {
        "doc1.txt": "This is the first document. It has some text.",
        "doc2.txt": "Another document here with more words to ensure splitting might occur if long enough.",
    }
    for filename, text in sample_files.items():
        (persona_folder / filename).write_text(text)

    # The loader and the splitter are deliberately NOT mocked here: this test
    # exercises the basic integration end to end and simply trusts
    # RecursiveCharacterTextSplitter to behave. Mocking loader.load() would be
    # the better choice for more complex scenarios or to avoid real file I/O.
    chunks = load_and_split_documents(persona_id, data_path=str(data_root))

    # At least one chunk per short document, possibly more if any was split.
    assert len(chunks) > 0
    assert isinstance(chunks[0], Document)

    def appears_in_chunks(snippet):
        """Return True when any returned chunk contains *snippet*."""
        return any(snippet in chunk.page_content for chunk in chunks)

    # Simplified content check: at least one source text must be present.
    # A more robust test would mock text_splitter.split_documents and verify
    # that it is called with the loaded documents.
    assert appears_in_chunks("first document") or appears_in_chunks("Another document")
def test_load_and_split_documents_uses_correct_loader_and_splitter_params():
    """DirectoryLoader and RecursiveCharacterTextSplitter receive the expected arguments."""
    persona_id = "params_test_persona"
    data_path = "dummy_data_path"
    expected_persona_dir = os.path.join(data_path, persona_id)

    # Stand-in loader whose load() hands back exactly one document.
    loaded_doc = Document(page_content="Test content from loader.")
    fake_loader = MagicMock()
    fake_loader.load.return_value = [loaded_doc]

    # Stand-in splitter whose split_documents() hands back exactly one chunk.
    fake_splitter = MagicMock()
    fake_splitter.split_documents.return_value = [Document(page_content="Split chunk 1")]

    # os.path.isdir is forced to True so the dummy directory looks real; the
    # two classes are patched where utils.rag_utils looks them up.
    with patch('os.path.isdir', return_value=True), \
            patch('utils.rag_utils.DirectoryLoader', return_value=fake_loader) as loader_cls, \
            patch('utils.rag_utils.RecursiveCharacterTextSplitter', return_value=fake_splitter) as splitter_cls:
        load_and_split_documents(persona_id, data_path=data_path)

        # The loader is built for the persona directory and loaded once.
        loader_cls.assert_called_once_with(
            expected_persona_dir,
            glob="**/*.txt",
            loader_cls=UnstructuredFileLoader,
            show_progress=True,
            use_multithreading=True,
            silent_errors=True,
        )
        fake_loader.load.assert_called_once()

        # The splitter is built with the expected chunking parameters and is
        # fed exactly the documents the loader produced.
        splitter_cls.assert_called_once_with(
            chunk_size=1000,
            chunk_overlap=150,
            length_function=len,
            is_separator_regex=False,
        )
        fake_splitter.split_documents.assert_called_once_with([loaded_doc])


# --- Tests for get_embedding_model ---

def _assert_embeddings_built_with(expected_model_name, **factory_kwargs):
    """Call get_embedding_model(**factory_kwargs) against a patched constructor.

    The HuggingFaceEmbeddings name inside utils.rag_utils is replaced with a
    mock so no real model is loaded or downloaded. Verifies the constructor is
    called once with ``model_name=expected_model_name`` and that its return
    value is what get_embedding_model hands back.
    """
    stub_embeddings = MagicMock(spec=HuggingFaceEmbeddings)
    with patch('utils.rag_utils.HuggingFaceEmbeddings', return_value=stub_embeddings) as embeddings_cls:
        built = get_embedding_model(**factory_kwargs)
        embeddings_cls.assert_called_once_with(model_name=expected_model_name)
        assert built == stub_embeddings


def test_get_embedding_model_default():
    """With no arguments, the embeddings are built with EMBEDDING_MODEL_NAME."""
    _assert_embeddings_built_with(EMBEDDING_MODEL_NAME)


def test_get_embedding_model_custom_name():
    """An explicit model_name is forwarded to the embeddings constructor."""
    custom_model = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    _assert_embeddings_built_with(custom_model, model_name=custom_model)