reformat
	
		
			
	
		
	
	
		
	
		
			All checks were successful
		
		
	
	
		
			
				
	
				Security Scan / dependency-check (pull_request) Successful in 35s
				
			
		
			
				
	
				Security Scan / security (pull_request) Successful in 39s
				
			
		
			
				
	
				Test Suite / lint (pull_request) Successful in 30s
				
			
		
			
				
	
				Test Suite / test (3.11) (pull_request) Successful in 1m26s
				
			
		
			
				
	
				Test Suite / build (pull_request) Successful in 37s
				
			
		
		
	
	
				
					
				
			
		
			All checks were successful
		
		
	
	Security Scan / dependency-check (pull_request) Successful in 35s
				
			Security Scan / security (pull_request) Successful in 39s
				
			Test Suite / lint (pull_request) Successful in 30s
				
			Test Suite / test (3.11) (pull_request) Successful in 1m26s
				
			Test Suite / build (pull_request) Successful in 37s
				
			This commit is contained in:
		@@ -24,14 +24,10 @@ class NDJSONParser:
 | 
			
		||||
                    documents.append(doc)
 | 
			
		||||
                except json.JSONDecodeError as e:
 | 
			
		||||
                    raise json.JSONDecodeError(
 | 
			
		||||
                        f"Invalid JSON on line {line_num}: {e.msg}",
 | 
			
		||||
                        e.doc,
 | 
			
		||||
                        e.pos
 | 
			
		||||
                        f"Invalid JSON on line {line_num}: {e.msg}", e.doc, e.pos
 | 
			
		||||
                    )
 | 
			
		||||
                except KeyError as e:
 | 
			
		||||
                    raise KeyError(
 | 
			
		||||
                        f"Missing required field {e} on line {line_num}"
 | 
			
		||||
                    )
 | 
			
		||||
                    raise KeyError(f"Missing required field {e} on line {line_num}")
 | 
			
		||||
                except (TypeError, ValueError) as e:
 | 
			
		||||
                    raise ValueError(
 | 
			
		||||
                        f"Invalid data format on line {line_num}: {str(e)}"
 | 
			
		||||
@@ -52,15 +48,19 @@ class NDJSONParser:
 | 
			
		||||
        # Validate embedding format
 | 
			
		||||
        embedding = doc_dict["embedding"]
 | 
			
		||||
        if not isinstance(embedding, list):
 | 
			
		||||
            raise ValueError(f"Embedding must be a list, got {type(embedding).__name__}")
 | 
			
		||||
        
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                f"Embedding must be a list, got {type(embedding).__name__}"
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        if not embedding:
 | 
			
		||||
            raise ValueError("Embedding cannot be empty")
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Check that all embedding values are numbers
 | 
			
		||||
        for i, val in enumerate(embedding):
 | 
			
		||||
            if not isinstance(val, (int, float)) or val != val:  # NaN check
 | 
			
		||||
                raise ValueError(f"Embedding contains invalid value at index {i}: {val}")
 | 
			
		||||
                raise ValueError(
 | 
			
		||||
                    f"Embedding contains invalid value at index {i}: {val}"
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
        return Document(
 | 
			
		||||
            id=doc_dict["id"],
 | 
			
		||||
 
 | 
			
		||||
@@ -25,7 +25,9 @@ class DataProcessingCallbacks:
 | 
			
		||||
            processed_data = self.processor.process_upload(contents, filename)
 | 
			
		||||
 | 
			
		||||
            if processed_data.error:
 | 
			
		||||
                error_message = self._format_error_message(processed_data.error, filename)
 | 
			
		||||
                error_message = self._format_error_message(
 | 
			
		||||
                    processed_data.error, filename
 | 
			
		||||
                )
 | 
			
		||||
                return (
 | 
			
		||||
                    {"error": processed_data.error},
 | 
			
		||||
                    error_message,
 | 
			
		||||
@@ -80,14 +82,18 @@ class DataProcessingCallbacks:
 | 
			
		||||
    def _format_error_message(error: str, filename: str | None = None) -> str:
 | 
			
		||||
        """Format error message with helpful guidance for users."""
 | 
			
		||||
        file_part = f" in file '{filename}'" if filename else ""
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Check for common error patterns and provide helpful messages
 | 
			
		||||
        if "embedding" in error.lower() and ("key" in error.lower() or "required field" in error.lower()):
 | 
			
		||||
        if "embedding" in error.lower() and (
 | 
			
		||||
            "key" in error.lower() or "required field" in error.lower()
 | 
			
		||||
        ):
 | 
			
		||||
            return (
 | 
			
		||||
                f"❌ Missing 'embedding' field{file_part}. "
 | 
			
		||||
                "Each line must contain an 'embedding' field with a list of numbers."
 | 
			
		||||
            )
 | 
			
		||||
        elif "text" in error.lower() and ("key" in error.lower() or "required field" in error.lower()):
 | 
			
		||||
        elif "text" in error.lower() and (
 | 
			
		||||
            "key" in error.lower() or "required field" in error.lower()
 | 
			
		||||
        ):
 | 
			
		||||
            return (
 | 
			
		||||
                f"❌ Missing 'text' field{file_part}. "
 | 
			
		||||
                "Each line must contain a 'text' field with the document content."
 | 
			
		||||
 
 | 
			
		||||
@@ -62,4 +62,3 @@ class UploadComponent:
 | 
			
		||||
            color="danger",
 | 
			
		||||
            className="mb-3",
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -9,7 +9,8 @@ class AppLayout:
 | 
			
		||||
 | 
			
		||||
    def create_layout(self):
 | 
			
		||||
        return dbc.Container(
 | 
			
		||||
            [self._create_header(), self._create_main_content()] + self._create_stores(),
 | 
			
		||||
            [self._create_header(), self._create_main_content()]
 | 
			
		||||
            + self._create_stores(),
 | 
			
		||||
            fluid=True,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,5 @@
 | 
			
		||||
"""Tests for handling bad/invalid data files."""
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
import json
 | 
			
		||||
import base64
 | 
			
		||||
@@ -16,16 +17,16 @@ class TestBadDataHandling:
 | 
			
		||||
 | 
			
		||||
    def _create_upload_contents(self, text_content: str) -> str:
 | 
			
		||||
        """Helper to create upload contents format."""
 | 
			
		||||
        encoded = base64.b64encode(text_content.encode('utf-8')).decode('utf-8')
 | 
			
		||||
        encoded = base64.b64encode(text_content.encode("utf-8")).decode("utf-8")
 | 
			
		||||
        return f"data:application/json;base64,{encoded}"
 | 
			
		||||
 | 
			
		||||
    def test_missing_embedding_field(self):
 | 
			
		||||
        """Test files missing required embedding field."""
 | 
			
		||||
        bad_content = '{"id": "doc_001", "text": "Sample text", "category": "test"}'
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        with pytest.raises(KeyError, match="embedding"):
 | 
			
		||||
            self.parser.parse_text(bad_content)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Test processor error handling
 | 
			
		||||
        upload_contents = self._create_upload_contents(bad_content)
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
@@ -34,11 +35,13 @@ class TestBadDataHandling:
 | 
			
		||||
 | 
			
		||||
    def test_missing_text_field(self):
 | 
			
		||||
        """Test files missing required text field."""
 | 
			
		||||
        bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
 | 
			
		||||
        
 | 
			
		||||
        bad_content = (
 | 
			
		||||
            '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        with pytest.raises(KeyError, match="text"):
 | 
			
		||||
            self.parser.parse_text(bad_content)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Test processor error handling
 | 
			
		||||
        upload_contents = self._create_upload_contents(bad_content)
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
@@ -49,10 +52,10 @@ class TestBadDataHandling:
 | 
			
		||||
        """Test files with malformed JSON syntax."""
 | 
			
		||||
        # Missing closing brace
 | 
			
		||||
        bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2], "text": "test"'
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        with pytest.raises(json.JSONDecodeError):
 | 
			
		||||
            self.parser.parse_text(bad_content)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Test processor error handling
 | 
			
		||||
        upload_contents = self._create_upload_contents(bad_content)
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
@@ -70,7 +73,7 @@ class TestBadDataHandling:
 | 
			
		||||
            # Null embedding
 | 
			
		||||
            '{"id": "doc_004", "embedding": null, "text": "test"}',
 | 
			
		||||
        ]
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        for bad_content in test_cases:
 | 
			
		||||
            upload_contents = self._create_upload_contents(bad_content)
 | 
			
		||||
            result = self.processor.process_upload(upload_contents)
 | 
			
		||||
@@ -78,25 +81,27 @@ class TestBadDataHandling:
 | 
			
		||||
 | 
			
		||||
    def test_inconsistent_embedding_dimensions(self):
 | 
			
		||||
        """Test files with embeddings of different dimensions."""
 | 
			
		||||
        bad_content = '''{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
 | 
			
		||||
{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}'''
 | 
			
		||||
        
 | 
			
		||||
        bad_content = """{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
 | 
			
		||||
{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}"""
 | 
			
		||||
 | 
			
		||||
        upload_contents = self._create_upload_contents(bad_content)
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # This might succeed parsing but fail in processing
 | 
			
		||||
        # The error depends on where dimension validation occurs
 | 
			
		||||
        if result.error is None:
 | 
			
		||||
            # If parsing succeeds, check that embeddings have inconsistent shapes
 | 
			
		||||
            assert len(result.documents) == 2
 | 
			
		||||
            assert len(result.documents[0].embedding) != len(result.documents[1].embedding)
 | 
			
		||||
            assert len(result.documents[0].embedding) != len(
 | 
			
		||||
                result.documents[1].embedding
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def test_empty_lines_in_ndjson(self):
 | 
			
		||||
        """Test files with empty lines mixed in."""
 | 
			
		||||
        content_with_empty_lines = '''{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
 | 
			
		||||
        content_with_empty_lines = """{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
 | 
			
		||||
 | 
			
		||||
{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}"""
 | 
			
		||||
 | 
			
		||||
{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}'''
 | 
			
		||||
        
 | 
			
		||||
        # This should work - empty lines should be skipped
 | 
			
		||||
        documents = self.parser.parse_text(content_with_empty_lines)
 | 
			
		||||
        assert len(documents) == 2
 | 
			
		||||
@@ -105,55 +110,62 @@ class TestBadDataHandling:
 | 
			
		||||
 | 
			
		||||
    def test_not_ndjson_format(self):
 | 
			
		||||
        """Test regular JSON array instead of NDJSON."""
 | 
			
		||||
        json_array = '''[
 | 
			
		||||
        json_array = """[
 | 
			
		||||
  {"id": "doc_001", "embedding": [0.1, 0.2], "text": "First"},
 | 
			
		||||
  {"id": "doc_002", "embedding": [0.3, 0.4], "text": "Second"}
 | 
			
		||||
]'''
 | 
			
		||||
        
 | 
			
		||||
]"""
 | 
			
		||||
 | 
			
		||||
        with pytest.raises(json.JSONDecodeError):
 | 
			
		||||
            self.parser.parse_text(json_array)
 | 
			
		||||
 | 
			
		||||
    def test_binary_content_in_file(self):
 | 
			
		||||
        """Test files with binary content mixed in."""
 | 
			
		||||
        # Simulate binary content that can't be decoded
 | 
			
		||||
        binary_content = b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
 | 
			
		||||
        
 | 
			
		||||
        binary_content = (
 | 
			
		||||
            b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # This should result in an error when processing
 | 
			
		||||
        encoded = base64.b64encode(binary_content).decode('utf-8')
 | 
			
		||||
        encoded = base64.b64encode(binary_content).decode("utf-8")
 | 
			
		||||
        upload_contents = f"data:application/json;base64,{encoded}"
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Should either fail with UnicodeDecodeError or JSON parsing error
 | 
			
		||||
        assert result.error is not None
 | 
			
		||||
 | 
			
		||||
    def test_extremely_large_embeddings(self):
 | 
			
		||||
        """Test embeddings with very large dimensions."""
 | 
			
		||||
        large_embedding = [0.1] * 10000  # 10k dimensions
 | 
			
		||||
        content = json.dumps({
 | 
			
		||||
            "id": "doc_001", 
 | 
			
		||||
            "embedding": large_embedding, 
 | 
			
		||||
            "text": "Large embedding test"
 | 
			
		||||
        })
 | 
			
		||||
        
 | 
			
		||||
        content = json.dumps(
 | 
			
		||||
            {
 | 
			
		||||
                "id": "doc_001",
 | 
			
		||||
                "embedding": large_embedding,
 | 
			
		||||
                "text": "Large embedding test",
 | 
			
		||||
            }
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # This should work but might be slow
 | 
			
		||||
        upload_contents = self._create_upload_contents(content)
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        if result.error is None:
 | 
			
		||||
            assert len(result.documents) == 1
 | 
			
		||||
            assert len(result.documents[0].embedding) == 10000
 | 
			
		||||
 | 
			
		||||
    def test_special_characters_in_text(self):
 | 
			
		||||
        """Test handling of special characters and unicode."""
 | 
			
		||||
        special_content = json.dumps({
 | 
			
		||||
            "id": "doc_001",
 | 
			
		||||
            "embedding": [0.1, 0.2],
 | 
			
		||||
            "text": "Special chars: 🚀 ñoñó 中文 \n\t\""
 | 
			
		||||
        }, ensure_ascii=False)
 | 
			
		||||
        
 | 
			
		||||
        special_content = json.dumps(
 | 
			
		||||
            {
 | 
			
		||||
                "id": "doc_001",
 | 
			
		||||
                "embedding": [0.1, 0.2],
 | 
			
		||||
                "text": 'Special chars: 🚀 ñoñó 中文 \n\t"',
 | 
			
		||||
            },
 | 
			
		||||
            ensure_ascii=False,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        upload_contents = self._create_upload_contents(special_content)
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        assert result.error is None
 | 
			
		||||
        assert len(result.documents) == 1
 | 
			
		||||
        assert "🚀" in result.documents[0].text
 | 
			
		||||
@@ -162,9 +174,9 @@ class TestBadDataHandling:
 | 
			
		||||
        """Test that processor returns proper error structure."""
 | 
			
		||||
        bad_content = '{"invalid": "json"'  # Missing closing brace
 | 
			
		||||
        upload_contents = self._create_upload_contents(bad_content)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Check error structure
 | 
			
		||||
        assert result.error is not None
 | 
			
		||||
        assert isinstance(result.error, str)
 | 
			
		||||
@@ -173,13 +185,13 @@ class TestBadDataHandling:
 | 
			
		||||
 | 
			
		||||
    def test_multiple_errors_in_file(self):
 | 
			
		||||
        """Test file with multiple different types of errors."""
 | 
			
		||||
        multi_error_content = '''{"id": "doc_001", "text": "Missing embedding"}
 | 
			
		||||
        multi_error_content = """{"id": "doc_001", "text": "Missing embedding"}
 | 
			
		||||
{"id": "doc_002", "embedding": "wrong_type", "text": "Wrong embedding type"}
 | 
			
		||||
{"id": "doc_003", "embedding": [0.1, 0.2], "text": "Valid line"}
 | 
			
		||||
{"id": "doc_004", "embedding": [0.3, 0.4]'''  # Missing text and closing brace
 | 
			
		||||
        
 | 
			
		||||
{"id": "doc_004", "embedding": [0.3, 0.4]"""  # Missing text and closing brace
 | 
			
		||||
 | 
			
		||||
        upload_contents = self._create_upload_contents(multi_error_content)
 | 
			
		||||
        result = self.processor.process_upload(upload_contents)
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        # Should fail on first error encountered
 | 
			
		||||
        assert result.error is not None
 | 
			
		||||
        assert result.error is not None
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user