reformat

2025-08-14 08:07:50 -07:00
parent 6a995635ac
commit 4867614474
5 changed files with 79 additions and 61 deletions
--- a/tests/test_bad_data.py
+++ b/tests/test_bad_data.py
@@ -1,4 +1,5 @@
 """Tests for handling bad/invalid data files."""
+
 import pytest
 import json
 import base64
@@ -16,16 +17,16 @@ class TestBadDataHandling:

    def _create_upload_contents(self, text_content: str) -> str:
        """Helper to create upload contents format."""
-        encoded = base64.b64encode(text_content.encode('utf-8')).decode('utf-8')
+        encoded = base64.b64encode(text_content.encode("utf-8")).decode("utf-8")
        return f"data:application/json;base64,{encoded}"

    def test_missing_embedding_field(self):
        """Test files missing required embedding field."""
        bad_content = '{"id": "doc_001", "text": "Sample text", "category": "test"}'
-        
+
        with pytest.raises(KeyError, match="embedding"):
            self.parser.parse_text(bad_content)
-        
+
        # Test processor error handling
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
@@ -34,11 +35,13 @@ class TestBadDataHandling:

    def test_missing_text_field(self):
        """Test files missing required text field."""
-        bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
-        
+        bad_content = (
+            '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
+        )
+
        with pytest.raises(KeyError, match="text"):
            self.parser.parse_text(bad_content)
-        
+
        # Test processor error handling
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
@@ -49,10 +52,10 @@ class TestBadDataHandling:
        """Test files with malformed JSON syntax."""
        # Missing closing brace
        bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2], "text": "test"'
-        
+
        with pytest.raises(json.JSONDecodeError):
            self.parser.parse_text(bad_content)
-        
+
        # Test processor error handling
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
@@ -70,7 +73,7 @@ class TestBadDataHandling:
            # Null embedding
            '{"id": "doc_004", "embedding": null, "text": "test"}',
        ]
-        
+
        for bad_content in test_cases:
            upload_contents = self._create_upload_contents(bad_content)
            result = self.processor.process_upload(upload_contents)
@@ -78,25 +81,27 @@ class TestBadDataHandling:

    def test_inconsistent_embedding_dimensions(self):
        """Test files with embeddings of different dimensions."""
-        bad_content = '''{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
-{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}'''
-        
+        bad_content = """{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
+{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}"""
+
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
-        
+
        # This might succeed parsing but fail in processing
        # The error depends on where dimension validation occurs
        if result.error is None:
            # If parsing succeeds, check that embeddings have inconsistent shapes
            assert len(result.documents) == 2
-            assert len(result.documents[0].embedding) != len(result.documents[1].embedding)
+            assert len(result.documents[0].embedding) != len(
+                result.documents[1].embedding
+            )

    def test_empty_lines_in_ndjson(self):
        """Test files with empty lines mixed in."""
-        content_with_empty_lines = '''{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
+        content_with_empty_lines = """{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
+
+{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}"""

-{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}'''
-        
        # This should work - empty lines should be skipped
        documents = self.parser.parse_text(content_with_empty_lines)
        assert len(documents) == 2
@@ -105,55 +110,62 @@ class TestBadDataHandling:

    def test_not_ndjson_format(self):
        """Test regular JSON array instead of NDJSON."""
-        json_array = '''[
+        json_array = """[
  {"id": "doc_001", "embedding": [0.1, 0.2], "text": "First"},
  {"id": "doc_002", "embedding": [0.3, 0.4], "text": "Second"}
-]'''
-        
+]"""
+
        with pytest.raises(json.JSONDecodeError):
            self.parser.parse_text(json_array)

    def test_binary_content_in_file(self):
        """Test files with binary content mixed in."""
        # Simulate binary content that can't be decoded
-        binary_content = b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
-        
+        binary_content = (
+            b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
+        )
+
        # This should result in an error when processing
-        encoded = base64.b64encode(binary_content).decode('utf-8')
+        encoded = base64.b64encode(binary_content).decode("utf-8")
        upload_contents = f"data:application/json;base64,{encoded}"
        result = self.processor.process_upload(upload_contents)
-        
+
        # Should either fail with UnicodeDecodeError or JSON parsing error
        assert result.error is not None

    def test_extremely_large_embeddings(self):
        """Test embeddings with very large dimensions."""
        large_embedding = [0.1] * 10000  # 10k dimensions
-        content = json.dumps({
-            "id": "doc_001", 
-            "embedding": large_embedding, 
-            "text": "Large embedding test"
-        })
-        
+        content = json.dumps(
+            {
+                "id": "doc_001",
+                "embedding": large_embedding,
+                "text": "Large embedding test",
+            }
+        )
+
        # This should work but might be slow
        upload_contents = self._create_upload_contents(content)
        result = self.processor.process_upload(upload_contents)
-        
+
        if result.error is None:
            assert len(result.documents) == 1
            assert len(result.documents[0].embedding) == 10000

    def test_special_characters_in_text(self):
        """Test handling of special characters and unicode."""
-        special_content = json.dumps({
-            "id": "doc_001",
-            "embedding": [0.1, 0.2],
-            "text": "Special chars: 🚀 ñoñó 中文 \n\t\""
-        }, ensure_ascii=False)
-        
+        special_content = json.dumps(
+            {
+                "id": "doc_001",
+                "embedding": [0.1, 0.2],
+                "text": 'Special chars: 🚀 ñoñó 中文 \n\t"',
+            },
+            ensure_ascii=False,
+        )
+
        upload_contents = self._create_upload_contents(special_content)
        result = self.processor.process_upload(upload_contents)
-        
+
        assert result.error is None
        assert len(result.documents) == 1
        assert "🚀" in result.documents[0].text
@@ -162,9 +174,9 @@ class TestBadDataHandling:
        """Test that processor returns proper error structure."""
        bad_content = '{"invalid": "json"'  # Missing closing brace
        upload_contents = self._create_upload_contents(bad_content)
-        
+
        result = self.processor.process_upload(upload_contents)
-        
+
        # Check error structure
        assert result.error is not None
        assert isinstance(result.error, str)
@@ -173,13 +185,13 @@ class TestBadDataHandling:

    def test_multiple_errors_in_file(self):
        """Test file with multiple different types of errors."""
-        multi_error_content = '''{"id": "doc_001", "text": "Missing embedding"}
+        multi_error_content = """{"id": "doc_001", "text": "Missing embedding"}
 {"id": "doc_002", "embedding": "wrong_type", "text": "Wrong embedding type"}
 {"id": "doc_003", "embedding": [0.1, 0.2], "text": "Valid line"}
-{"id": "doc_004", "embedding": [0.3, 0.4]'''  # Missing text and closing brace
-        
+{"id": "doc_004", "embedding": [0.3, 0.4]"""  # Missing text and closing brace
+
        upload_contents = self._create_upload_contents(multi_error_content)
        result = self.processor.process_upload(upload_contents)
-        
+
        # Should fail on first error encountered
-        assert result.error is not None
+        assert result.error is not None