reformat

2025-08-14 08:07:50 -07:00
parent 6a995635ac
commit 4867614474
5 changed files with 79 additions and 61 deletions
--- a/src/embeddingbuddy/data/parser.py
+++ b/src/embeddingbuddy/data/parser.py
@@ -24,14 +24,10 @@ class NDJSONParser:
                    documents.append(doc)
                except json.JSONDecodeError as e:
                    raise json.JSONDecodeError(
-                        f"Invalid JSON on line {line_num}: {e.msg}",
+                        f"Invalid JSON on line {line_num}: {e.msg}", e.doc, e.pos
                        e.doc,
                        e.pos
                    )
                except KeyError as e:
-                    raise KeyError(
+                    raise KeyError(f"Missing required field {e} on line {line_num}")
                        f"Missing required field {e} on line {line_num}"
                    )
                except (TypeError, ValueError) as e:
                    raise ValueError(
                        f"Invalid data format on line {line_num}: {str(e)}"
@@ -52,7 +48,9 @@ class NDJSONParser:
        # Validate embedding format
        embedding = doc_dict["embedding"]
        if not isinstance(embedding, list):
-            raise ValueError(f"Embedding must be a list, got {type(embedding).__name__}")
+            raise ValueError(
                f"Embedding must be a list, got {type(embedding).__name__}"
            )
        if not embedding:
            raise ValueError("Embedding cannot be empty")
@@ -60,7 +58,9 @@ class NDJSONParser:
        # Check that all embedding values are numbers
        for i, val in enumerate(embedding):
            if not isinstance(val, (int, float)) or val != val:  # NaN check
-                raise ValueError(f"Embedding contains invalid value at index {i}: {val}")
+                raise ValueError(
                    f"Embedding contains invalid value at index {i}: {val}"
                )
        return Document(
            id=doc_dict["id"],
--- a/src/embeddingbuddy/ui/callbacks/data_processing.py
+++ b/src/embeddingbuddy/ui/callbacks/data_processing.py
@@ -25,7 +25,9 @@ class DataProcessingCallbacks:
            processed_data = self.processor.process_upload(contents, filename)
            if processed_data.error:
-                error_message = self._format_error_message(processed_data.error, filename)
+                error_message = self._format_error_message(
                    processed_data.error, filename
                )
                return (
                    {"error": processed_data.error},
                    error_message,
@@ -82,12 +84,16 @@ class DataProcessingCallbacks:
        file_part = f" in file '{filename}'" if filename else ""
        # Check for common error patterns and provide helpful messages
-        if "embedding" in error.lower() and ("key" in error.lower() or "required field" in error.lower()):
+        if "embedding" in error.lower() and (
            "key" in error.lower() or "required field" in error.lower()
        ):
            return (
                f"❌ Missing 'embedding' field{file_part}. "
                "Each line must contain an 'embedding' field with a list of numbers."
            )
-        elif "text" in error.lower() and ("key" in error.lower() or "required field" in error.lower()):
+        elif "text" in error.lower() and (
            "key" in error.lower() or "required field" in error.lower()
        ):
            return (
                f"❌ Missing 'text' field{file_part}. "
                "Each line must contain a 'text' field with the document content."
--- a/src/embeddingbuddy/ui/components/upload.py
+++ b/src/embeddingbuddy/ui/components/upload.py
@@ -62,4 +62,3 @@ class UploadComponent:
            color="danger",
            className="mb-3",
        )
--- a/src/embeddingbuddy/ui/layout.py
+++ b/src/embeddingbuddy/ui/layout.py
@@ -9,7 +9,8 @@ class AppLayout:
    def create_layout(self):
        return dbc.Container(
-            [self._create_header(), self._create_main_content()] + self._create_stores(),
+            [self._create_header(), self._create_main_content()]
            + self._create_stores(),
            fluid=True,
        )
--- a/tests/test_bad_data.py
+++ b/tests/test_bad_data.py
@@ -1,4 +1,5 @@
 """Tests for handling bad/invalid data files."""
 import pytest
 import json
 import base64
@@ -16,7 +17,7 @@ class TestBadDataHandling:
    def _create_upload_contents(self, text_content: str) -> str:
        """Helper to create upload contents format."""
-        encoded = base64.b64encode(text_content.encode('utf-8')).decode('utf-8')
+        encoded = base64.b64encode(text_content.encode("utf-8")).decode("utf-8")
        return f"data:application/json;base64,{encoded}"
    def test_missing_embedding_field(self):
@@ -34,7 +35,9 @@ class TestBadDataHandling:
    def test_missing_text_field(self):
        """Test files missing required text field."""
-        bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
+        bad_content = (
            '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
        )
        with pytest.raises(KeyError, match="text"):
            self.parser.parse_text(bad_content)
@@ -78,8 +81,8 @@ class TestBadDataHandling:
    def test_inconsistent_embedding_dimensions(self):
        """Test files with embeddings of different dimensions."""
-        bad_content = '''{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
+        bad_content = """{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
-{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}'''
+{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}"""
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
@@ -89,13 +92,15 @@ class TestBadDataHandling:
        if result.error is None:
            # If parsing succeeds, check that embeddings have inconsistent shapes
            assert len(result.documents) == 2
-            assert len(result.documents[0].embedding) != len(result.documents[1].embedding)
+            assert len(result.documents[0].embedding) != len(
                result.documents[1].embedding
            )
    def test_empty_lines_in_ndjson(self):
        """Test files with empty lines mixed in."""
-        content_with_empty_lines = '''{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
+        content_with_empty_lines = """{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
-{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}'''
+{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}"""
        # This should work - empty lines should be skipped
        documents = self.parser.parse_text(content_with_empty_lines)
@@ -105,10 +110,10 @@ class TestBadDataHandling:
    def test_not_ndjson_format(self):
        """Test regular JSON array instead of NDJSON."""
-        json_array = '''[
+        json_array = """[
  {"id": "doc_001", "embedding": [0.1, 0.2], "text": "First"},
  {"id": "doc_002", "embedding": [0.3, 0.4], "text": "Second"}
-]'''
+]"""
        with pytest.raises(json.JSONDecodeError):
            self.parser.parse_text(json_array)
@@ -116,10 +121,12 @@ class TestBadDataHandling:
    def test_binary_content_in_file(self):
        """Test files with binary content mixed in."""
        # Simulate binary content that can't be decoded
-        binary_content = b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
+        binary_content = (
            b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
        )
        # This should result in an error when processing
-        encoded = base64.b64encode(binary_content).decode('utf-8')
+        encoded = base64.b64encode(binary_content).decode("utf-8")
        upload_contents = f"data:application/json;base64,{encoded}"
        result = self.processor.process_upload(upload_contents)
@@ -129,11 +136,13 @@ class TestBadDataHandling:
    def test_extremely_large_embeddings(self):
        """Test embeddings with very large dimensions."""
        large_embedding = [0.1] * 10000  # 10k dimensions
-        content = json.dumps({
+        content = json.dumps(
-            "id": "doc_001", 
+            {
-            "embedding": large_embedding, 
+                "id": "doc_001",
-            "text": "Large embedding test"
+                "embedding": large_embedding,
-        })
+                "text": "Large embedding test",
            }
        )
        # This should work but might be slow
        upload_contents = self._create_upload_contents(content)
@@ -145,11 +154,14 @@ class TestBadDataHandling:
    def test_special_characters_in_text(self):
        """Test handling of special characters and unicode."""
-        special_content = json.dumps({
+        special_content = json.dumps(
-            "id": "doc_001",
+            {
-            "embedding": [0.1, 0.2],
+                "id": "doc_001",
-            "text": "Special chars: 🚀 ñoñó 中文 \n\t\""
+                "embedding": [0.1, 0.2],
-        }, ensure_ascii=False)
+                "text": 'Special chars: 🚀 ñoñó 中文 \n\t"',
            },
            ensure_ascii=False,
        )
        upload_contents = self._create_upload_contents(special_content)
        result = self.processor.process_upload(upload_contents)
@@ -173,10 +185,10 @@ class TestBadDataHandling:
    def test_multiple_errors_in_file(self):
        """Test file with multiple different types of errors."""
-        multi_error_content = '''{"id": "doc_001", "text": "Missing embedding"}
+        multi_error_content = """{"id": "doc_001", "text": "Missing embedding"}
 {"id": "doc_002", "embedding": "wrong_type", "text": "Wrong embedding type"}
 {"id": "doc_003", "embedding": [0.1, 0.2], "text": "Valid line"}
-{"id": "doc_004", "embedding": [0.3, 0.4]'''  # Missing text and closing brace
+{"id": "doc_004", "embedding": [0.3, 0.4]"""  # Missing text and closing brace
        upload_contents = self._create_upload_contents(multi_error_content)
        result = self.processor.process_upload(upload_contents)