add in browser embedding generation
Some checks failed
Security Scan / security (pull_request) Successful in 44s
Security Scan / dependency-check (pull_request) Successful in 49s
Test Suite / lint (pull_request) Failing after 40s
Test Suite / test (3.11) (pull_request) Successful in 1m39s
Test Suite / build (pull_request) Has been skipped

This commit is contained in:
2025-09-06 07:16:30 -07:00
parent 14abc446b7
commit cdaaffd735
13 changed files with 1582 additions and 3 deletions

View File

@@ -63,6 +63,90 @@ class DataProcessor:
except Exception as e:
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
def process_client_embeddings(self, embeddings_data: dict) -> ProcessedData:
"""Process embeddings data received from client-side JavaScript."""
try:
if "error" in embeddings_data:
return ProcessedData(
documents=[],
embeddings=np.array([]),
error=embeddings_data["error"],
)
# Extract documents and embeddings from client data
documents_data = embeddings_data.get("documents", [])
embeddings_list = embeddings_data.get("embeddings", [])
if not documents_data or not embeddings_list:
return ProcessedData(
documents=[],
embeddings=np.array([]),
error="No documents or embeddings in client data",
)
if len(documents_data) != len(embeddings_list):
return ProcessedData(
documents=[],
embeddings=np.array([]),
error="Mismatch between number of documents and embeddings",
)
# Convert embeddings to numpy array first
try:
embeddings = np.array(embeddings_list)
if embeddings.ndim != 2:
return ProcessedData(
documents=[],
embeddings=np.array([]),
error="Invalid embedding dimensions",
)
except Exception as e:
return ProcessedData(
documents=[],
embeddings=np.array([]),
error=f"Error processing embeddings: {str(e)}",
)
# Convert to Document objects with embeddings
documents = []
for i, doc_data in enumerate(documents_data):
try:
# Skip if we don't have a corresponding embedding
if i >= len(embeddings):
continue
# Ensure required fields are present
if "id" not in doc_data or not doc_data["id"]:
doc_data["id"] = f"text_input_{i}"
if "text" not in doc_data or not doc_data["text"].strip():
continue # Skip documents without text
# Add the embedding to doc_data
doc_data["embedding"] = embeddings[i].tolist()
doc = Document(**doc_data)
documents.append(doc)
except Exception:
# Skip invalid documents but continue processing
continue
if not documents:
return ProcessedData(
documents=[],
embeddings=np.array([]),
error="No valid documents found in client data",
)
# Only keep embeddings for valid documents
valid_embeddings = embeddings[: len(documents)]
return ProcessedData(documents=documents, embeddings=valid_embeddings)
except Exception as e:
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
if not documents:
return np.array([])