from unittest.mock import patch

from src.embeddingbuddy.data.processor import DataProcessor
from src.embeddingbuddy.models.field_mapper import FieldMapping


def _vector_content_mapping():
    """Build the minimal mapping shared by several tests below.

    Only the embedding field ("vector") and the text field ("content") are
    mapped; every other ``FieldMapping`` argument keeps its default.
    """
    return FieldMapping(embedding_field="vector", text_field="content")


class TestDataProcessorOpenSearch:
    """Tests for ``DataProcessor.process_opensearch_data``."""

    def test_process_opensearch_data_success(self):
        """Two complete hits yield two documents with all mapped fields set."""
        data_processor = DataProcessor()

        hits = [
            {
                "vector": [0.1, 0.2, 0.3],
                "content": "Test document 1",
                "doc_id": "doc1",
                "type": "news",
            },
            {
                "vector": [0.4, 0.5, 0.6],
                "content": "Test document 2",
                "doc_id": "doc2",
                "type": "blog",
            },
        ]
        mapping = FieldMapping(
            embedding_field="vector",
            text_field="content",
            id_field="doc_id",
            category_field="type",
        )

        result = data_processor.process_opensearch_data(hits, mapping)

        assert result.error is None
        assert len(result.documents) == 2
        assert result.embeddings.shape == (2, 3)

        # One row per expected document: (text, embedding, id, category).
        expected_rows = [
            ("Test document 1", [0.1, 0.2, 0.3], "doc1", "news"),
            ("Test document 2", [0.4, 0.5, 0.6], "doc2", "blog"),
        ]
        for position, row in enumerate(expected_rows):
            want_text, want_embedding, want_id, want_category = row
            document = result.documents[position]
            assert document.text == want_text
            assert document.embedding == want_embedding
            assert document.id == want_id
            assert document.category == want_category

    def test_process_opensearch_data_with_tags(self):
        """A mapped tags field is carried over onto the resulting document."""
        data_processor = DataProcessor()

        hits = [
            {
                "vector": [0.1, 0.2, 0.3],
                "content": "Test document with tags",
                "keywords": ["tag1", "tag2"],
            }
        ]
        mapping = FieldMapping(
            embedding_field="vector", text_field="content", tags_field="keywords"
        )

        result = data_processor.process_opensearch_data(hits, mapping)

        assert result.error is None
        assert len(result.documents) == 1
        tagged_document = result.documents[0]
        assert tagged_document.tags == ["tag1", "tag2"]

    def test_process_opensearch_data_invalid_documents(self):
        """When every hit lacks the text field, an error result is expected."""
        data_processor = DataProcessor()

        # The only hit has an embedding but no "content" key.
        hits = [{"vector": [0.1, 0.2, 0.3]}]
        mapping = _vector_content_mapping()

        result = data_processor.process_opensearch_data(hits, mapping)

        # Nothing usable remains, so the error must be set and say so.
        assert result.error is not None
        assert "No valid documents" in result.error
        assert len(result.documents) == 0

    def test_process_opensearch_data_partial_success(self):
        """Hits without the text field are dropped; the rest are kept in order."""
        data_processor = DataProcessor()

        complete_first = {"vector": [0.1, 0.2, 0.3], "content": "Valid document"}
        lacks_content = {"vector": [0.4, 0.5, 0.6]}  # expected to be skipped
        complete_second = {
            "vector": [0.7, 0.8, 0.9],
            "content": "Another valid document",
        }
        hits = [complete_first, lacks_content, complete_second]
        mapping = _vector_content_mapping()

        result = data_processor.process_opensearch_data(hits, mapping)

        assert result.error is None
        assert len(result.documents) == 2
        kept_first, kept_second = result.documents
        assert kept_first.text == "Valid document"
        assert kept_second.text == "Another valid document"

    @patch("src.embeddingbuddy.models.field_mapper.FieldMapper.transform_documents")
    def test_process_opensearch_data_transformation_error(self, mock_transform):
        """An exception raised by the field mapper surfaces in ``error``."""
        data_processor = DataProcessor()

        # Force the patched transform step to blow up.
        mock_transform.side_effect = Exception("Transformation failed")

        hits = [{"vector": [0.1], "content": "test"}]
        mapping = _vector_content_mapping()

        result = data_processor.process_opensearch_data(hits, mapping)

        assert result.error is not None
        assert "Transformation failed" in result.error
        assert len(result.documents) == 0

    def test_process_opensearch_data_empty_input(self):
        """An empty hit list is expected to produce the no-valid-documents error."""
        data_processor = DataProcessor()

        hits = []
        mapping = _vector_content_mapping()

        result = data_processor.process_opensearch_data(hits, mapping)

        assert result.error is not None
        assert "No valid documents" in result.error
        assert len(result.documents) == 0