reformat
All checks were successful
Security Scan / dependency-check (pull_request) Successful in 35s
Security Scan / security (pull_request) Successful in 39s
Test Suite / lint (pull_request) Successful in 30s
Test Suite / test (3.11) (pull_request) Successful in 1m26s
Test Suite / build (pull_request) Successful in 37s
All checks were successful
Security Scan / dependency-check (pull_request) Successful in 35s
Security Scan / security (pull_request) Successful in 39s
Test Suite / lint (pull_request) Successful in 30s
Test Suite / test (3.11) (pull_request) Successful in 1m26s
Test Suite / build (pull_request) Successful in 37s
This commit is contained in:
@@ -24,14 +24,10 @@ class NDJSONParser:
|
|||||||
documents.append(doc)
|
documents.append(doc)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
raise json.JSONDecodeError(
|
raise json.JSONDecodeError(
|
||||||
f"Invalid JSON on line {line_num}: {e.msg}",
|
f"Invalid JSON on line {line_num}: {e.msg}", e.doc, e.pos
|
||||||
e.doc,
|
|
||||||
e.pos
|
|
||||||
)
|
)
|
||||||
except KeyError as e:
|
except KeyError as e:
|
||||||
raise KeyError(
|
raise KeyError(f"Missing required field {e} on line {line_num}")
|
||||||
f"Missing required field {e} on line {line_num}"
|
|
||||||
)
|
|
||||||
except (TypeError, ValueError) as e:
|
except (TypeError, ValueError) as e:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Invalid data format on line {line_num}: {str(e)}"
|
f"Invalid data format on line {line_num}: {str(e)}"
|
||||||
@@ -52,7 +48,9 @@ class NDJSONParser:
|
|||||||
# Validate embedding format
|
# Validate embedding format
|
||||||
embedding = doc_dict["embedding"]
|
embedding = doc_dict["embedding"]
|
||||||
if not isinstance(embedding, list):
|
if not isinstance(embedding, list):
|
||||||
raise ValueError(f"Embedding must be a list, got {type(embedding).__name__}")
|
raise ValueError(
|
||||||
|
f"Embedding must be a list, got {type(embedding).__name__}"
|
||||||
|
)
|
||||||
|
|
||||||
if not embedding:
|
if not embedding:
|
||||||
raise ValueError("Embedding cannot be empty")
|
raise ValueError("Embedding cannot be empty")
|
||||||
@@ -60,7 +58,9 @@ class NDJSONParser:
|
|||||||
# Check that all embedding values are numbers
|
# Check that all embedding values are numbers
|
||||||
for i, val in enumerate(embedding):
|
for i, val in enumerate(embedding):
|
||||||
if not isinstance(val, (int, float)) or val != val: # NaN check
|
if not isinstance(val, (int, float)) or val != val: # NaN check
|
||||||
raise ValueError(f"Embedding contains invalid value at index {i}: {val}")
|
raise ValueError(
|
||||||
|
f"Embedding contains invalid value at index {i}: {val}"
|
||||||
|
)
|
||||||
|
|
||||||
return Document(
|
return Document(
|
||||||
id=doc_dict["id"],
|
id=doc_dict["id"],
|
||||||
|
@@ -25,7 +25,9 @@ class DataProcessingCallbacks:
|
|||||||
processed_data = self.processor.process_upload(contents, filename)
|
processed_data = self.processor.process_upload(contents, filename)
|
||||||
|
|
||||||
if processed_data.error:
|
if processed_data.error:
|
||||||
error_message = self._format_error_message(processed_data.error, filename)
|
error_message = self._format_error_message(
|
||||||
|
processed_data.error, filename
|
||||||
|
)
|
||||||
return (
|
return (
|
||||||
{"error": processed_data.error},
|
{"error": processed_data.error},
|
||||||
error_message,
|
error_message,
|
||||||
@@ -82,12 +84,16 @@ class DataProcessingCallbacks:
|
|||||||
file_part = f" in file '{filename}'" if filename else ""
|
file_part = f" in file '{filename}'" if filename else ""
|
||||||
|
|
||||||
# Check for common error patterns and provide helpful messages
|
# Check for common error patterns and provide helpful messages
|
||||||
if "embedding" in error.lower() and ("key" in error.lower() or "required field" in error.lower()):
|
if "embedding" in error.lower() and (
|
||||||
|
"key" in error.lower() or "required field" in error.lower()
|
||||||
|
):
|
||||||
return (
|
return (
|
||||||
f"❌ Missing 'embedding' field{file_part}. "
|
f"❌ Missing 'embedding' field{file_part}. "
|
||||||
"Each line must contain an 'embedding' field with a list of numbers."
|
"Each line must contain an 'embedding' field with a list of numbers."
|
||||||
)
|
)
|
||||||
elif "text" in error.lower() and ("key" in error.lower() or "required field" in error.lower()):
|
elif "text" in error.lower() and (
|
||||||
|
"key" in error.lower() or "required field" in error.lower()
|
||||||
|
):
|
||||||
return (
|
return (
|
||||||
f"❌ Missing 'text' field{file_part}. "
|
f"❌ Missing 'text' field{file_part}. "
|
||||||
"Each line must contain a 'text' field with the document content."
|
"Each line must contain a 'text' field with the document content."
|
||||||
|
@@ -62,4 +62,3 @@ class UploadComponent:
|
|||||||
color="danger",
|
color="danger",
|
||||||
className="mb-3",
|
className="mb-3",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -9,7 +9,8 @@ class AppLayout:
|
|||||||
|
|
||||||
def create_layout(self):
|
def create_layout(self):
|
||||||
return dbc.Container(
|
return dbc.Container(
|
||||||
[self._create_header(), self._create_main_content()] + self._create_stores(),
|
[self._create_header(), self._create_main_content()]
|
||||||
|
+ self._create_stores(),
|
||||||
fluid=True,
|
fluid=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
"""Tests for handling bad/invalid data files."""
|
"""Tests for handling bad/invalid data files."""
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import json
|
import json
|
||||||
import base64
|
import base64
|
||||||
@@ -16,7 +17,7 @@ class TestBadDataHandling:
|
|||||||
|
|
||||||
def _create_upload_contents(self, text_content: str) -> str:
|
def _create_upload_contents(self, text_content: str) -> str:
|
||||||
"""Helper to create upload contents format."""
|
"""Helper to create upload contents format."""
|
||||||
encoded = base64.b64encode(text_content.encode('utf-8')).decode('utf-8')
|
encoded = base64.b64encode(text_content.encode("utf-8")).decode("utf-8")
|
||||||
return f"data:application/json;base64,{encoded}"
|
return f"data:application/json;base64,{encoded}"
|
||||||
|
|
||||||
def test_missing_embedding_field(self):
|
def test_missing_embedding_field(self):
|
||||||
@@ -34,7 +35,9 @@ class TestBadDataHandling:
|
|||||||
|
|
||||||
def test_missing_text_field(self):
|
def test_missing_text_field(self):
|
||||||
"""Test files missing required text field."""
|
"""Test files missing required text field."""
|
||||||
bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
|
bad_content = (
|
||||||
|
'{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
|
||||||
|
)
|
||||||
|
|
||||||
with pytest.raises(KeyError, match="text"):
|
with pytest.raises(KeyError, match="text"):
|
||||||
self.parser.parse_text(bad_content)
|
self.parser.parse_text(bad_content)
|
||||||
@@ -78,8 +81,8 @@ class TestBadDataHandling:
|
|||||||
|
|
||||||
def test_inconsistent_embedding_dimensions(self):
|
def test_inconsistent_embedding_dimensions(self):
|
||||||
"""Test files with embeddings of different dimensions."""
|
"""Test files with embeddings of different dimensions."""
|
||||||
bad_content = '''{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
|
bad_content = """{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
|
||||||
{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}'''
|
{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}"""
|
||||||
|
|
||||||
upload_contents = self._create_upload_contents(bad_content)
|
upload_contents = self._create_upload_contents(bad_content)
|
||||||
result = self.processor.process_upload(upload_contents)
|
result = self.processor.process_upload(upload_contents)
|
||||||
@@ -89,13 +92,15 @@ class TestBadDataHandling:
|
|||||||
if result.error is None:
|
if result.error is None:
|
||||||
# If parsing succeeds, check that embeddings have inconsistent shapes
|
# If parsing succeeds, check that embeddings have inconsistent shapes
|
||||||
assert len(result.documents) == 2
|
assert len(result.documents) == 2
|
||||||
assert len(result.documents[0].embedding) != len(result.documents[1].embedding)
|
assert len(result.documents[0].embedding) != len(
|
||||||
|
result.documents[1].embedding
|
||||||
|
)
|
||||||
|
|
||||||
def test_empty_lines_in_ndjson(self):
|
def test_empty_lines_in_ndjson(self):
|
||||||
"""Test files with empty lines mixed in."""
|
"""Test files with empty lines mixed in."""
|
||||||
content_with_empty_lines = '''{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
|
content_with_empty_lines = """{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}
|
||||||
|
|
||||||
{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}'''
|
{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}"""
|
||||||
|
|
||||||
# This should work - empty lines should be skipped
|
# This should work - empty lines should be skipped
|
||||||
documents = self.parser.parse_text(content_with_empty_lines)
|
documents = self.parser.parse_text(content_with_empty_lines)
|
||||||
@@ -105,10 +110,10 @@ class TestBadDataHandling:
|
|||||||
|
|
||||||
def test_not_ndjson_format(self):
|
def test_not_ndjson_format(self):
|
||||||
"""Test regular JSON array instead of NDJSON."""
|
"""Test regular JSON array instead of NDJSON."""
|
||||||
json_array = '''[
|
json_array = """[
|
||||||
{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First"},
|
{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First"},
|
||||||
{"id": "doc_002", "embedding": [0.3, 0.4], "text": "Second"}
|
{"id": "doc_002", "embedding": [0.3, 0.4], "text": "Second"}
|
||||||
]'''
|
]"""
|
||||||
|
|
||||||
with pytest.raises(json.JSONDecodeError):
|
with pytest.raises(json.JSONDecodeError):
|
||||||
self.parser.parse_text(json_array)
|
self.parser.parse_text(json_array)
|
||||||
@@ -116,10 +121,12 @@ class TestBadDataHandling:
|
|||||||
def test_binary_content_in_file(self):
|
def test_binary_content_in_file(self):
|
||||||
"""Test files with binary content mixed in."""
|
"""Test files with binary content mixed in."""
|
||||||
# Simulate binary content that can't be decoded
|
# Simulate binary content that can't be decoded
|
||||||
binary_content = b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
|
binary_content = (
|
||||||
|
b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
|
||||||
|
)
|
||||||
|
|
||||||
# This should result in an error when processing
|
# This should result in an error when processing
|
||||||
encoded = base64.b64encode(binary_content).decode('utf-8')
|
encoded = base64.b64encode(binary_content).decode("utf-8")
|
||||||
upload_contents = f"data:application/json;base64,{encoded}"
|
upload_contents = f"data:application/json;base64,{encoded}"
|
||||||
result = self.processor.process_upload(upload_contents)
|
result = self.processor.process_upload(upload_contents)
|
||||||
|
|
||||||
@@ -129,11 +136,13 @@ class TestBadDataHandling:
|
|||||||
def test_extremely_large_embeddings(self):
|
def test_extremely_large_embeddings(self):
|
||||||
"""Test embeddings with very large dimensions."""
|
"""Test embeddings with very large dimensions."""
|
||||||
large_embedding = [0.1] * 10000 # 10k dimensions
|
large_embedding = [0.1] * 10000 # 10k dimensions
|
||||||
content = json.dumps({
|
content = json.dumps(
|
||||||
"id": "doc_001",
|
{
|
||||||
"embedding": large_embedding,
|
"id": "doc_001",
|
||||||
"text": "Large embedding test"
|
"embedding": large_embedding,
|
||||||
})
|
"text": "Large embedding test",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# This should work but might be slow
|
# This should work but might be slow
|
||||||
upload_contents = self._create_upload_contents(content)
|
upload_contents = self._create_upload_contents(content)
|
||||||
@@ -145,11 +154,14 @@ class TestBadDataHandling:
|
|||||||
|
|
||||||
def test_special_characters_in_text(self):
|
def test_special_characters_in_text(self):
|
||||||
"""Test handling of special characters and unicode."""
|
"""Test handling of special characters and unicode."""
|
||||||
special_content = json.dumps({
|
special_content = json.dumps(
|
||||||
"id": "doc_001",
|
{
|
||||||
"embedding": [0.1, 0.2],
|
"id": "doc_001",
|
||||||
"text": "Special chars: 🚀 ñoñó 中文 \n\t\""
|
"embedding": [0.1, 0.2],
|
||||||
}, ensure_ascii=False)
|
"text": 'Special chars: 🚀 ñoñó 中文 \n\t"',
|
||||||
|
},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
|
||||||
upload_contents = self._create_upload_contents(special_content)
|
upload_contents = self._create_upload_contents(special_content)
|
||||||
result = self.processor.process_upload(upload_contents)
|
result = self.processor.process_upload(upload_contents)
|
||||||
@@ -173,10 +185,10 @@ class TestBadDataHandling:
|
|||||||
|
|
||||||
def test_multiple_errors_in_file(self):
|
def test_multiple_errors_in_file(self):
|
||||||
"""Test file with multiple different types of errors."""
|
"""Test file with multiple different types of errors."""
|
||||||
multi_error_content = '''{"id": "doc_001", "text": "Missing embedding"}
|
multi_error_content = """{"id": "doc_001", "text": "Missing embedding"}
|
||||||
{"id": "doc_002", "embedding": "wrong_type", "text": "Wrong embedding type"}
|
{"id": "doc_002", "embedding": "wrong_type", "text": "Wrong embedding type"}
|
||||||
{"id": "doc_003", "embedding": [0.1, 0.2], "text": "Valid line"}
|
{"id": "doc_003", "embedding": [0.1, 0.2], "text": "Valid line"}
|
||||||
{"id": "doc_004", "embedding": [0.3, 0.4]''' # Missing text and closing brace
|
{"id": "doc_004", "embedding": [0.3, 0.4]""" # Missing text and closing brace
|
||||||
|
|
||||||
upload_contents = self._create_upload_contents(multi_error_content)
|
upload_contents = self._create_upload_contents(multi_error_content)
|
||||||
result = self.processor.process_upload(upload_contents)
|
result = self.processor.process_upload(upload_contents)
|
||||||
|
Reference in New Issue
Block a user