fixed refactored code
Some checks failed
Security Scan / dependency-check (pull_request) Successful in 38s
Security Scan / security (pull_request) Successful in 41s
Test Suite / lint (pull_request) Failing after 28s
Test Suite / test (3.11) (pull_request) Successful in 1m27s
Test Suite / build (pull_request) Has been skipped

This commit is contained in:
2025-08-14 07:55:40 -07:00
parent 1ec7e2c38c
commit 7b81c20a26
18 changed files with 386 additions and 537 deletions

View File

@@ -16,11 +16,26 @@ class NDJSONParser:
@staticmethod
def parse_text(text_content: str) -> List[Document]:
    """Parse newline-delimited JSON text into a list of documents.

    Every non-blank line is decoded with ``json.loads`` and converted with
    ``NDJSONParser._dict_to_document``. Blank and whitespace-only lines are
    skipped. Failures are re-raised with the 1-based line number of the
    offending line, counted against ``text_content`` exactly as passed in.

    Args:
        text_content: NDJSON text, one JSON value per line.

    Returns:
        The converted documents, in input order.

    Raises:
        json.JSONDecodeError: A line is not valid JSON.
        KeyError: Converting a line raised ``KeyError``.
        ValueError: Decoding or converting a line raised ``TypeError`` or
            ``ValueError``.
    """
    documents = []
    # Split the text as given (no leading strip) so the reported line
    # numbers match the caller's input even when it starts with blank lines.
    for line_num, raw_line in enumerate(text_content.split("\n"), 1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            doc_dict = json.loads(line)
            documents.append(NDJSONParser._dict_to_document(doc_dict))
        except json.JSONDecodeError as e:
            # JSONDecodeError subclasses ValueError, so it must be handled
            # before the generic ValueError clause below.
            raise json.JSONDecodeError(
                f"Invalid JSON on line {line_num}: {e.msg}",
                e.doc,
                e.pos,
            ) from e
        except KeyError as e:
            raise KeyError(
                f"Missing required field {e} on line {line_num}"
            ) from e
        except (TypeError, ValueError) as e:
            raise ValueError(
                f"Invalid data format on line {line_num}: {str(e)}"
            ) from e
    return documents
@staticmethod
@@ -28,10 +43,29 @@ class NDJSONParser:
if "id" not in doc_dict:
doc_dict["id"] = str(uuid.uuid4())
# Validate required fields
if "text" not in doc_dict:
raise KeyError("'text'")
if "embedding" not in doc_dict:
raise KeyError("'embedding'")
# Validate embedding format
embedding = doc_dict["embedding"]
if not isinstance(embedding, list):
raise ValueError(f"Embedding must be a list, got {type(embedding).__name__}")
if not embedding:
raise ValueError("Embedding cannot be empty")
# Check that all embedding values are numbers
for i, val in enumerate(embedding):
if not isinstance(val, (int, float)) or val != val: # NaN check
raise ValueError(f"Embedding contains invalid value at index {i}: {val}")
return Document(
id=doc_dict["id"],
text=doc_dict["text"],
embedding=doc_dict["embedding"],
embedding=embedding,
category=doc_dict.get("category"),
subcategory=doc_dict.get("subcategory"),
tags=doc_dict.get("tags"),