File size: 6,628 Bytes
31add3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pytest
import os
from unittest.mock import patch, MagicMock

# Make sure 'utils' is discoverable, or adjust path.
# This might require __init__.py in 'utils' and 'tests' and correct pythonpath.
from utils.rag_utils import load_and_split_documents, get_embedding_model, EMBEDDING_MODEL_NAME
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- Tests for load_and_split_documents ---

def test_load_and_split_documents_no_directory(tmp_path):
    """A missing persona directory should yield an empty document list."""
    missing_persona = "non_existent_persona"

    loaded = load_and_split_documents(missing_persona, data_path=str(tmp_path))

    assert loaded == []

def test_load_and_split_documents_no_txt_files(tmp_path):
    """A persona directory with no .txt files should yield an empty list."""
    persona = "empty_persona"
    target_dir = tmp_path / persona
    target_dir.mkdir()
    # Non-.txt files must be ignored by the loader's glob pattern.
    (target_dir / "other_file.md").write_text("some markdown content")

    loaded = load_and_split_documents(persona, data_path=str(tmp_path))

    assert loaded == []

def test_load_and_split_documents_loads_and_splits_txt_files(tmp_path):
    """Real .txt files under the persona directory are loaded and split into chunks.

    This is a light integration test: it exercises the actual directory loader
    and text splitter rather than mocking them. The parameter-verification test
    below covers the exact constructor arguments.
    """
    persona_id = "test_persona"
    data_sources_path = tmp_path / "data_sources"  # mirror the real data_sources layout
    data_sources_path.mkdir()
    persona_dir = data_sources_path / persona_id
    persona_dir.mkdir()

    # Two small source files, each well under the 1000-char chunk size,
    # so each should survive as at least one whole chunk.
    (persona_dir / "doc1.txt").write_text("This is the first document. It has some text.")
    (persona_dir / "doc2.txt").write_text(
        "Another document here with more words to ensure splitting might occur if long enough."
    )

    split_docs = load_and_split_documents(persona_id, data_path=str(data_sources_path))

    assert len(split_docs) > 0
    # Every chunk (not just the first) must be a Document.
    assert all(isinstance(doc, Document) for doc in split_docs)

    # Both source files were created, so content from BOTH must be present.
    # (The previous `or` assertion would pass even if one file was silently
    # dropped by the loader — exactly the failure this test should catch.)
    assert any("first document" in doc.page_content for doc in split_docs)
    assert any("Another document" in doc.page_content for doc in split_docs)


def test_load_and_split_documents_uses_correct_loader_and_splitter_params():
    """Verify the exact constructor arguments given to the loader and splitter.

    os.path.isdir is patched so no real directory is required; both the
    DirectoryLoader and RecursiveCharacterTextSplitter classes are replaced
    with mocks so no file I/O happens.
    """
    persona_id = "params_test_persona"
    data_path = "dummy_data_path"
    expected_persona_path = os.path.join(data_path, persona_id)

    # Canned loader: pretends to have loaded exactly one document.
    loaded_doc = Document(page_content="Test content from loader.")
    fake_loader = MagicMock()
    fake_loader.load.return_value = [loaded_doc]

    # Canned splitter: pretends to have produced exactly one chunk.
    fake_splitter = MagicMock()
    fake_splitter.split_documents.return_value = [Document(page_content="Split chunk 1")]

    with patch('os.path.isdir', return_value=True), \
         patch('utils.rag_utils.DirectoryLoader', return_value=fake_loader) as loader_cls_mock, \
         patch('utils.rag_utils.RecursiveCharacterTextSplitter', return_value=fake_splitter) as splitter_cls_mock:
        load_and_split_documents(persona_id, data_path=data_path)

    # The loader must target the persona subdirectory with the expected options.
    loader_cls_mock.assert_called_once_with(
        expected_persona_path,
        glob="**/*.txt",
        loader_cls=UnstructuredFileLoader,
        show_progress=True,
        use_multithreading=True,
        silent_errors=True
    )
    fake_loader.load.assert_called_once()

    # The splitter must be configured with the expected chunking parameters
    # and receive exactly the documents the loader produced.
    splitter_cls_mock.assert_called_once_with(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,
        is_separator_regex=False
    )
    fake_splitter.split_documents.assert_called_once_with([loaded_doc])


# --- Tests for get_embedding_model ---

def test_get_embedding_model_default():
    """Default call builds HuggingFaceEmbeddings with EMBEDDING_MODEL_NAME."""
    # Patch the constructor so no model is downloaded or loaded.
    with patch('utils.rag_utils.HuggingFaceEmbeddings') as hf_ctor:
        fake_embeddings = MagicMock(spec=HuggingFaceEmbeddings)
        hf_ctor.return_value = fake_embeddings

        result = get_embedding_model()

    hf_ctor.assert_called_once_with(model_name=EMBEDDING_MODEL_NAME)
    assert result == fake_embeddings

def test_get_embedding_model_custom_name():
    """An explicit model_name is forwarded verbatim to the constructor."""
    custom_model = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    # Patch the constructor so no model is downloaded or loaded.
    with patch('utils.rag_utils.HuggingFaceEmbeddings') as hf_ctor:
        fake_embeddings = MagicMock(spec=HuggingFaceEmbeddings)
        hf_ctor.return_value = fake_embeddings

        result = get_embedding_model(model_name=custom_model)

    hf_ctor.assert_called_once_with(model_name=custom_model)
    assert result == fake_embeddings