Files
embedding-buddy/src/embeddingbuddy/data/parser.py
Austin Godber 4867614474
All checks were successful
Security Scan / dependency-check (pull_request) Successful in 35s
Security Scan / security (pull_request) Successful in 39s
Test Suite / lint (pull_request) Successful in 30s
Test Suite / test (3.11) (pull_request) Successful in 1m26s
Test Suite / build (pull_request) Successful in 37s
reformat
2025-08-14 08:07:50 -07:00

73 lines
2.5 KiB
Python

import json
import uuid
import base64
from typing import List
from ..models.schemas import Document
class NDJSONParser:
@staticmethod
def parse_upload_contents(contents: str) -> List[Document]:
content_type, content_string = contents.split(",")
decoded = base64.b64decode(content_string)
text_content = decoded.decode("utf-8")
return NDJSONParser.parse_text(text_content)
@staticmethod
def parse_text(text_content: str) -> List[Document]:
documents = []
for line_num, line in enumerate(text_content.strip().split("\n"), 1):
if line.strip():
try:
doc_dict = json.loads(line)
doc = NDJSONParser._dict_to_document(doc_dict)
documents.append(doc)
except json.JSONDecodeError as e:
raise json.JSONDecodeError(
f"Invalid JSON on line {line_num}: {e.msg}", e.doc, e.pos
)
except KeyError as e:
raise KeyError(f"Missing required field {e} on line {line_num}")
except (TypeError, ValueError) as e:
raise ValueError(
f"Invalid data format on line {line_num}: {str(e)}"
)
return documents
@staticmethod
def _dict_to_document(doc_dict: dict) -> Document:
if "id" not in doc_dict:
doc_dict["id"] = str(uuid.uuid4())
# Validate required fields
if "text" not in doc_dict:
raise KeyError("'text'")
if "embedding" not in doc_dict:
raise KeyError("'embedding'")
# Validate embedding format
embedding = doc_dict["embedding"]
if not isinstance(embedding, list):
raise ValueError(
f"Embedding must be a list, got {type(embedding).__name__}"
)
if not embedding:
raise ValueError("Embedding cannot be empty")
# Check that all embedding values are numbers
for i, val in enumerate(embedding):
if not isinstance(val, (int, float)) or val != val: # NaN check
raise ValueError(
f"Embedding contains invalid value at index {i}: {val}"
)
return Document(
id=doc_dict["id"],
text=doc_dict["text"],
embedding=embedding,
category=doc_dict.get("category"),
subcategory=doc_dict.get("subcategory"),
tags=doc_dict.get("tags"),
)