diff --git a/src/embeddingbuddy/data/parser.py b/src/embeddingbuddy/data/parser.py index 3b8cc42..a1aa1fe 100644 --- a/src/embeddingbuddy/data/parser.py +++ b/src/embeddingbuddy/data/parser.py @@ -24,14 +24,10 @@ class NDJSONParser: documents.append(doc) except json.JSONDecodeError as e: raise json.JSONDecodeError( - f"Invalid JSON on line {line_num}: {e.msg}", - e.doc, - e.pos + f"Invalid JSON on line {line_num}: {e.msg}", e.doc, e.pos ) except KeyError as e: - raise KeyError( - f"Missing required field {e} on line {line_num}" - ) + raise KeyError(f"Missing required field {e} on line {line_num}") except (TypeError, ValueError) as e: raise ValueError( f"Invalid data format on line {line_num}: {str(e)}" @@ -52,15 +48,19 @@ class NDJSONParser: # Validate embedding format embedding = doc_dict["embedding"] if not isinstance(embedding, list): - raise ValueError(f"Embedding must be a list, got {type(embedding).__name__}") - + raise ValueError( + f"Embedding must be a list, got {type(embedding).__name__}" + ) + if not embedding: raise ValueError("Embedding cannot be empty") - + # Check that all embedding values are numbers for i, val in enumerate(embedding): if not isinstance(val, (int, float)) or val != val: # NaN check - raise ValueError(f"Embedding contains invalid value at index {i}: {val}") + raise ValueError( + f"Embedding contains invalid value at index {i}: {val}" + ) return Document( id=doc_dict["id"], diff --git a/src/embeddingbuddy/ui/callbacks/data_processing.py b/src/embeddingbuddy/ui/callbacks/data_processing.py index bd9edfd..e0491d3 100644 --- a/src/embeddingbuddy/ui/callbacks/data_processing.py +++ b/src/embeddingbuddy/ui/callbacks/data_processing.py @@ -25,7 +25,9 @@ class DataProcessingCallbacks: processed_data = self.processor.process_upload(contents, filename) if processed_data.error: - error_message = self._format_error_message(processed_data.error, filename) + error_message = self._format_error_message( + processed_data.error, filename + ) return ( {"error": processed_data.error}, error_message, @@ -80,14 +82,18 @@ class DataProcessingCallbacks: def _format_error_message(error: str, filename: str | None = None) -> str: """Format error message with helpful guidance for users.""" file_part = f" in file '{filename}'" if filename else "" - + # Check for common error patterns and provide helpful messages - if "embedding" in error.lower() and ("key" in error.lower() or "required field" in error.lower()): + if "embedding" in error.lower() and ( + "key" in error.lower() or "required field" in error.lower() + ): return ( f"❌ Missing 'embedding' field{file_part}. " "Each line must contain an 'embedding' field with a list of numbers." ) - elif "text" in error.lower() and ("key" in error.lower() or "required field" in error.lower()): + elif "text" in error.lower() and ( + "key" in error.lower() or "required field" in error.lower() + ): return ( f"❌ Missing 'text' field{file_part}. " "Each line must contain a 'text' field with the document content." diff --git a/src/embeddingbuddy/ui/components/upload.py b/src/embeddingbuddy/ui/components/upload.py index 8e4db92..b9271d7 100644 --- a/src/embeddingbuddy/ui/components/upload.py +++ b/src/embeddingbuddy/ui/components/upload.py @@ -62,4 +62,3 @@ class UploadComponent: color="danger", className="mb-3", ) - diff --git a/src/embeddingbuddy/ui/layout.py b/src/embeddingbuddy/ui/layout.py index cabfe55..71a0402 100644 --- a/src/embeddingbuddy/ui/layout.py +++ b/src/embeddingbuddy/ui/layout.py @@ -9,7 +9,8 @@ class AppLayout: def create_layout(self): return dbc.Container( - [self._create_header(), self._create_main_content()] + self._create_stores(), + [self._create_header(), self._create_main_content()] + + self._create_stores(), fluid=True, ) diff --git a/tests/test_bad_data.py b/tests/test_bad_data.py index 76c213c..0260c1c 100644 --- a/tests/test_bad_data.py +++ b/tests/test_bad_data.py @@ -1,4 +1,5 @@ """Tests for handling bad/invalid data files.""" + import pytest import json import base64 @@ -16,16 +17,16 @@ class TestBadDataHandling: def _create_upload_contents(self, text_content: str) -> str: """Helper to create upload contents format.""" - encoded = base64.b64encode(text_content.encode('utf-8')).decode('utf-8') + encoded = base64.b64encode(text_content.encode("utf-8")).decode("utf-8") return f"data:application/json;base64,{encoded}" def test_missing_embedding_field(self): """Test files missing required embedding field.""" bad_content = '{"id": "doc_001", "text": "Sample text", "category": "test"}' - + with pytest.raises(KeyError, match="embedding"): self.parser.parse_text(bad_content) - + # Test processor error handling upload_contents = self._create_upload_contents(bad_content) result = self.processor.process_upload(upload_contents) @@ -34,11 +35,13 @@ class TestBadDataHandling: def test_missing_text_field(self): """Test files missing required text field.""" - bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}' - + bad_content = ( + '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}' + ) + with pytest.raises(KeyError, match="text"): self.parser.parse_text(bad_content) - + # Test processor error handling upload_contents = self._create_upload_contents(bad_content) result = self.processor.process_upload(upload_contents) @@ -49,10 +52,10 @@ class TestBadDataHandling: """Test files with malformed JSON syntax.""" # Missing closing brace bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2], "text": "test"' - + with pytest.raises(json.JSONDecodeError): self.parser.parse_text(bad_content) - + # Test processor error handling upload_contents = self._create_upload_contents(bad_content) result = self.processor.process_upload(upload_contents) @@ -70,7 +73,7 @@ class TestBadDataHandling: # Null embedding '{"id": "doc_004", "embedding": null, "text": "test"}', ] - + for bad_content in test_cases: upload_contents = self._create_upload_contents(bad_content) result = self.processor.process_upload(upload_contents) @@ -78,25 +81,27 @@ class TestBadDataHandling: def test_inconsistent_embedding_dimensions(self): """Test files with embeddings of different dimensions.""" - bad_content = '''{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"} -{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}''' - + bad_content = """{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"} +{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}""" + upload_contents = self._create_upload_contents(bad_content) result = self.processor.process_upload(upload_contents) - + # This might succeed parsing but fail in processing # The error depends on where dimension validation occurs if result.error is None: # If parsing succeeds, check that embeddings have inconsistent shapes assert len(result.documents) == 2 - assert len(result.documents[0].embedding) != len(result.documents[1].embedding) + assert len(result.documents[0].embedding) != len( + result.documents[1].embedding + ) def test_empty_lines_in_ndjson(self): """Test files with empty lines mixed in.""" - content_with_empty_lines = '''{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"} + content_with_empty_lines = """{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"} + +{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}""" -{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}''' - # This should work - empty lines should be skipped documents = self.parser.parse_text(content_with_empty_lines) assert len(documents) == 2 @@ -105,55 +110,62 @@ class TestBadDataHandling: def test_not_ndjson_format(self): """Test regular JSON array instead of NDJSON.""" - json_array = '''[ + json_array = """[ {"id": "doc_001", "embedding": [0.1, 0.2], "text": "First"}, {"id": "doc_002", "embedding": [0.3, 0.4], "text": "Second"} -]''' - +]""" + with pytest.raises(json.JSONDecodeError): self.parser.parse_text(json_array) def test_binary_content_in_file(self): """Test files with binary content mixed in.""" # Simulate binary content that can't be decoded - binary_content = b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}' - + binary_content = ( + b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}' + ) + # This should result in an error when processing - encoded = base64.b64encode(binary_content).decode('utf-8') + encoded = base64.b64encode(binary_content).decode("utf-8") upload_contents = f"data:application/json;base64,{encoded}" result = self.processor.process_upload(upload_contents) - + # Should either fail with UnicodeDecodeError or JSON parsing error assert result.error is not None def test_extremely_large_embeddings(self): """Test embeddings with very large dimensions.""" large_embedding = [0.1] * 10000 # 10k dimensions - content = json.dumps({ - "id": "doc_001", - "embedding": large_embedding, - "text": "Large embedding test" - }) - + content = json.dumps( + { + "id": "doc_001", + "embedding": large_embedding, + "text": "Large embedding test", + } + ) + # This should work but might be slow upload_contents = self._create_upload_contents(content) result = self.processor.process_upload(upload_contents) - + if result.error is None: assert len(result.documents) == 1 assert len(result.documents[0].embedding) == 10000 def test_special_characters_in_text(self): """Test handling of special characters and unicode.""" - special_content = json.dumps({ - "id": "doc_001", - "embedding": [0.1, 0.2], - "text": "Special chars: 🚀 ñoñó 中文 \n\t\"" - }, ensure_ascii=False) - + special_content = json.dumps( + { + "id": "doc_001", + "embedding": [0.1, 0.2], + "text": 'Special chars: 🚀 ñoñó 中文 \n\t"', + }, + ensure_ascii=False, + ) + upload_contents = self._create_upload_contents(special_content) result = self.processor.process_upload(upload_contents) - + assert result.error is None assert len(result.documents) == 1 assert "🚀" in result.documents[0].text @@ -162,9 +174,9 @@ class TestBadDataHandling: """Test that processor returns proper error structure.""" bad_content = '{"invalid": "json"' # Missing closing brace upload_contents = self._create_upload_contents(bad_content) - + result = self.processor.process_upload(upload_contents) - + # Check error structure assert result.error is not None assert isinstance(result.error, str) @@ -173,13 +185,13 @@ class TestBadDataHandling: def test_multiple_errors_in_file(self): """Test file with multiple different types of errors.""" - multi_error_content = '''{"id": "doc_001", "text": "Missing embedding"} + multi_error_content = """{"id": "doc_001", "text": "Missing embedding"} {"id": "doc_002", "embedding": "wrong_type", "text": "Wrong embedding type"} {"id": "doc_003", "embedding": [0.1, 0.2], "text": "Valid line"} -{"id": "doc_004", "embedding": [0.3, 0.4]''' # Missing text and closing brace - +{"id": "doc_004", "embedding": [0.3, 0.4]""" # Missing text and closing brace + upload_contents = self._create_upload_contents(multi_error_content) result = self.processor.process_upload(upload_contents) - + # Should fail on first error encountered - assert result.error is not None \ No newline at end of file + assert result.error is not None