File size: 6,628 Bytes
31add3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pytest
import os
from unittest.mock import patch, MagicMock

# Make sure 'utils' is discoverable, or adjust path.
# This might require __init__.py in 'utils' and 'tests' and correct pythonpath.
from utils.rag_utils import load_and_split_documents, get_embedding_model, EMBEDDING_MODEL_NAME
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- Tests for load_and_split_documents ---

def test_load_and_split_documents_no_directory(tmp_path):
    """A missing persona directory should yield an empty document list."""
    missing_persona = "non_existent_persona"

    loaded = load_and_split_documents(missing_persona, data_path=str(tmp_path))

    assert loaded == []

def test_load_and_split_documents_no_txt_files(tmp_path):
    """A persona directory with no .txt files should yield an empty list."""
    persona = "empty_persona"
    target_dir = tmp_path / persona
    target_dir.mkdir()
    # Non-.txt files must be ignored by the loader's glob pattern.
    (target_dir / "other_file.md").write_text("some markdown content")

    loaded = load_and_split_documents(persona, data_path=str(tmp_path))

    assert loaded == []

def test_load_and_split_documents_loads_and_splits_txt_files(tmp_path):
    """Real .txt files under the persona directory are loaded and split into chunks.

    This is a light integration test: it exercises the actual directory loader
    and text splitter rather than mocking them. The parameter-verification test
    below covers the exact constructor arguments.
    """
    persona_id = "test_persona"
    data_sources_path = tmp_path / "data_sources"  # mirror the real data_sources layout
    data_sources_path.mkdir()
    persona_dir = data_sources_path / persona_id
    persona_dir.mkdir()

    # Two small source files, each well under the 1000-char chunk size,
    # so each should survive as at least one whole chunk.
    (persona_dir / "doc1.txt").write_text("This is the first document. It has some text.")
    (persona_dir / "doc2.txt").write_text(
        "Another document here with more words to ensure splitting might occur if long enough."
    )

    split_docs = load_and_split_documents(persona_id, data_path=str(data_sources_path))

    assert len(split_docs) > 0
    # Every chunk (not just the first) must be a Document.
    assert all(isinstance(doc, Document) for doc in split_docs)

    # Both source files were created, so content from BOTH must be present.
    # (The previous `or` assertion would pass even if one file was silently
    # dropped by the loader — exactly the failure this test should catch.)
    assert any("first document" in doc.page_content for doc in split_docs)
    assert any("Another document" in doc.page_content for doc in split_docs)


def test_load_and_split_documents_uses_correct_loader_and_splitter_params():
    """Verify the exact constructor arguments given to the loader and splitter.

    os.path.isdir is patched so no real directory is required; both the
    DirectoryLoader and RecursiveCharacterTextSplitter classes are replaced
    with mocks so no file I/O happens.
    """
    persona_id = "params_test_persona"
    data_path = "dummy_data_path"
    expected_persona_path = os.path.join(data_path, persona_id)

    # Canned loader: pretends to have loaded exactly one document.
    loaded_doc = Document(page_content="Test content from loader.")
    fake_loader = MagicMock()
    fake_loader.load.return_value = [loaded_doc]

    # Canned splitter: pretends to have produced exactly one chunk.
    fake_splitter = MagicMock()
    fake_splitter.split_documents.return_value = [Document(page_content="Split chunk 1")]

    with patch('os.path.isdir', return_value=True), \
         patch('utils.rag_utils.DirectoryLoader', return_value=fake_loader) as loader_cls_mock, \
         patch('utils.rag_utils.RecursiveCharacterTextSplitter', return_value=fake_splitter) as splitter_cls_mock:
        load_and_split_documents(persona_id, data_path=data_path)

    # The loader must target the persona subdirectory with the expected options.
    loader_cls_mock.assert_called_once_with(
        expected_persona_path,
        glob="**/*.txt",
        loader_cls=UnstructuredFileLoader,
        show_progress=True,
        use_multithreading=True,
        silent_errors=True
    )
    fake_loader.load.assert_called_once()

    # The splitter must be configured with the expected chunking parameters
    # and receive exactly the documents the loader produced.
    splitter_cls_mock.assert_called_once_with(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,
        is_separator_regex=False
    )
    fake_splitter.split_documents.assert_called_once_with([loaded_doc])


# --- Tests for get_embedding_model ---

def test_get_embedding_model_default():
    """Default call builds HuggingFaceEmbeddings with EMBEDDING_MODEL_NAME."""
    # Patch the constructor so no model is downloaded or loaded.
    with patch('utils.rag_utils.HuggingFaceEmbeddings') as hf_ctor:
        fake_embeddings = MagicMock(spec=HuggingFaceEmbeddings)
        hf_ctor.return_value = fake_embeddings

        result = get_embedding_model()

    hf_ctor.assert_called_once_with(model_name=EMBEDDING_MODEL_NAME)
    assert result == fake_embeddings

def test_get_embedding_model_custom_name():
    """An explicit model_name is forwarded verbatim to the constructor."""
    custom_model = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    # Patch the constructor so no model is downloaded or loaded.
    with patch('utils.rag_utils.HuggingFaceEmbeddings') as hf_ctor:
        fake_embeddings = MagicMock(spec=HuggingFaceEmbeddings)
        hf_ctor.return_value = fake_embeddings

        result = get_embedding_model(model_name=custom_model)

    hf_ctor.assert_called_once_with(model_name=custom_model)
    assert result == fake_embeddings