from unittest.mock import Mock, create_autospec import pytest import numpy as np from semantic_router.text import Conversation from semantic_router.schema import Message from semantic_router.splitters.consecutive_sim import ConsecutiveSimSplitter from semantic_router.splitters.cumulative_sim import CumulativeSimSplitter from semantic_router.encoders.base import BaseEncoder from semantic_router.encoders.cohere import CohereEncoder from semantic_router.splitters.base import BaseSplitter def test_consecutive_sim_splitter(): # Create a Mock object for the encoder mock_encoder = Mock() mock_encoder.return_value = np.array([[1, 0], [1, 0.1], [0, 1]]) cohere_encoder = CohereEncoder( name="", cohere_api_key="a", input_type="", ) # Instantiate the ConsecutiveSimSplitter with the mock encoder splitter = ConsecutiveSimSplitter(encoder=cohere_encoder, score_threshold=0.9) splitter.encoder = mock_encoder # Define some documents docs = ["doc1", "doc2", "doc3"] # Use the splitter to split the documents splits = splitter(docs) # Verify the splits assert len(splits) == 2, "Expected two splits based on the similarity threshold" assert splits[0].docs == [ "doc1", "doc2", ], "First split does not match expected documents" assert splits[1].docs == ["doc3"], "Second split does not match expected documents" def test_cumulative_sim_splitter(): # Mock the BaseEncoder mock_encoder = Mock() # Adjust the side_effect to simulate the encoder's behavior for cumulative document comparisons # This simplistic simulation assumes binary embeddings for demonstration purposes # Define a side_effect function for the mock encoder mock_encoder.side_effect = ( lambda x: [[0.5, 0]] if "doc1" in x or "doc1\ndoc2" in x or "doc2" in x else [[0, 0.5]] ) # Instantiate the CumulativeSimSplitter with the mock encoder cohere_encoder = CohereEncoder( name="", cohere_api_key="a", input_type="", ) splitter = CumulativeSimSplitter(encoder=cohere_encoder, score_threshold=0.9) splitter.encoder = mock_encoder # Define some documents docs = ["doc1", "doc2", "doc3", "doc4", "doc5"] # Use the splitter to split the documents splits = splitter(docs) # Verify the splits # The expected outcome needs to match the logic defined in your mock_encoder's side_effect assert len(splits) == 2, f"{len(splits)}" assert splits[0].docs == [ "doc1", "doc2", ], "First split does not match expected documents" assert splits[1].docs == [ "doc3", "doc4", "doc5", ], "Second split does not match expected documents" def test_split_by_topic_consecutive_similarity(): mock_encoder = Mock() mock_encoder.return_value = [[0.5, 0], [0, 0.5]] messages = [ Message(role="User", content="What is the latest news?"), Message(role="Bot", content="How is the weather today?"), ] conversation = Conversation(messages=messages) cohere_encoder = CohereEncoder( name="", cohere_api_key="a", input_type="", ) conversation.configure_splitter( encoder=cohere_encoder, threshold=0.5, split_method="consecutive_similarity" ) conversation.splitter.encoder = mock_encoder topics, new_topics = conversation.split_by_topic() assert len(new_topics) == 2 assert new_topics[0].docs == ["User: What is the latest news?"] assert new_topics[1].docs == ["Bot: How is the weather today?"] def test_split_by_topic_cumulative_similarity(): mock_encoder = Mock() mock_encoder.side_effect = ( lambda x: [[0.5, 0]] if "User: What is the latest news?" in x else [[0, 0.5]] ) messages = [ Message(role="User", content="What is the latest news?"), Message(role="Bot", content="How is the weather today?"), ] conversation = Conversation(messages=messages) cohere_encoder = CohereEncoder( name="", cohere_api_key="a", input_type="", ) conversation.configure_splitter( encoder=cohere_encoder, threshold=0.5, split_method="cumulative_similarity" ) conversation.splitter.encoder = mock_encoder topics, new_topics = conversation.split_by_topic() # Assertions may need to be adjusted based on the expected behavior of the cumulative similarity splitter assert len(new_topics) == 2 def test_split_by_topic_no_messages(): mock_encoder = create_autospec(BaseEncoder) conversation = Conversation() conversation.configure_splitter( encoder=mock_encoder, threshold=0.5, split_method="consecutive_similarity" ) topics, new_topics = conversation.split_by_topic() assert len(new_topics) == 0 assert len(topics) == 0 def test_split_by_topic_without_configuring_splitter(): conversation = Conversation(messages=[Message(role="User", content="Hello")]) with pytest.raises(ValueError): conversation.split_by_topic() def test_consecutive_similarity_splitter_single_doc(): mock_encoder = create_autospec(BaseEncoder) # Assuming any return value since it should not reach the point of using the encoder mock_encoder.return_value = np.array([[0.5, 0]]) splitter = ConsecutiveSimSplitter(encoder=mock_encoder, score_threshold=0.5) docs = ["doc1"] with pytest.raises(ValueError) as excinfo: splitter(docs) assert "at least two are required" in str(excinfo.value) def test_cumulative_similarity_splitter_single_doc(): mock_encoder = create_autospec(BaseEncoder) # Assuming any return value since it should not reach the point of using the encoder mock_encoder.return_value = np.array([[0.5, 0]]) splitter = CumulativeSimSplitter(encoder=mock_encoder, score_threshold=0.5) docs = ["doc1"] with pytest.raises(ValueError) as excinfo: splitter(docs) assert "at least two are required" in str(excinfo.value) @pytest.fixture def base_splitter_instance(): # Now MockEncoder includes default values for required fields mock_encoder = Mock(spec=BaseEncoder) mock_encoder.name = "mock_encoder" mock_encoder.score_threshold = 0.5 return BaseSplitter(name="test_splitter", encoder=mock_encoder, score_threshold=0.5) def test_base_splitter_call_not_implemented(base_splitter_instance): with pytest.raises(NotImplementedError): base_splitter_instance(["document"])