# embedding-buddy/tests/test_data_processing.py

import pytest
import numpy as np
from src.embeddingbuddy.data.parser import NDJSONParser
from src.embeddingbuddy.data.processor import DataProcessor
from src.embeddingbuddy.models.schemas import Document


class TestNDJSONParser:
    """Checks on ``NDJSONParser.parse_text`` using single-line JSON inputs."""

    def test_parse_text_basic(self):
        """A record with id, text and embedding parses into one document."""
        line = '{"id": "test1", "text": "Hello world", "embedding": [0.1, 0.2, 0.3]}'
        parsed = NDJSONParser.parse_text(line)

        assert len(parsed) == 1

        doc = parsed[0]
        assert doc.id == "test1"
        assert doc.text == "Hello world"
        assert doc.embedding == [0.1, 0.2, 0.3]

    def test_parse_text_with_metadata(self):
        """The ``category`` and ``tags`` fields are exposed on the parsed document."""
        line = (
            '{"id": "test1", "text": "Hello", "embedding": [0.1, 0.2], '
            '"category": "greeting", "tags": ["test"]}'
        )
        parsed = NDJSONParser.parse_text(line)

        doc = parsed[0]
        assert doc.category == "greeting"
        assert doc.tags == ["test"]

    def test_parse_text_missing_id(self):
        """A record without an ``id`` key still parses and ends up with a non-None id."""
        line = '{"text": "Hello", "embedding": [0.1, 0.2]}'
        parsed = NDJSONParser.parse_text(line)

        assert len(parsed) == 1
        # The parser is expected to fill in an id itself (auto-generated).
        assert parsed[0].id is not None


class TestDataProcessor:
    """Checks on ``DataProcessor`` embedding extraction and data combination."""

    def test_extract_embeddings(self):
        """Two 2-element embeddings come back as a (2, 2) array, one row per document."""
        first = Document(id="1", text="test1", embedding=[0.1, 0.2])
        second = Document(id="2", text="test2", embedding=[0.3, 0.4])

        matrix = DataProcessor()._extract_embeddings([first, second])

        assert matrix.shape == (2, 2)
        # Rows are compared with a tolerance because they are floating-point values.
        for index, expected in enumerate(([0.1, 0.2], [0.3, 0.4])):
            assert np.allclose(matrix[index], expected)

    def test_combine_data(self):
        """Combining one document set and one prompt set keeps both lists and stacks embeddings."""
        from src.embeddingbuddy.models.schemas import ProcessedData

        doc = Document(id="1", text="doc", embedding=[0.1, 0.2])
        doc_data = ProcessedData(
            documents=[doc],
            embeddings=np.array([[0.1, 0.2]]),
        )

        prompt = Document(id="p1", text="prompt", embedding=[0.3, 0.4])
        prompt_data = ProcessedData(
            documents=[prompt],
            embeddings=np.array([[0.3, 0.4]]),
        )

        combined = DataProcessor().combine_data(doc_data, prompt_data)
        all_embeddings, docs_out, prompts_out = combined

        # One row per input record across both data sets.
        assert all_embeddings.shape == (2, 2)
        assert len(docs_out) == 1
        assert len(prompts_out) == 1
        assert docs_out[0].id == "1"
        assert prompts_out[0].id == "p1"


if __name__ == "__main__":
    # Propagate pytest's exit status so that running this file directly
    # reports test failures to the shell/CI instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__]))