this will load data from Opensearch.

it doesn't have prompts as well
This commit is contained in:
2025-08-14 13:49:46 -07:00
parent a2adc8b958
commit 9cf2f0e6fa
16 changed files with 1694 additions and 7 deletions

View File

@@ -1,6 +1,7 @@
import numpy as np
from typing import List, Optional, Tuple
from ..models.schemas import Document, ProcessedData
from ..models.field_mapper import FieldMapper
from .parser import NDJSONParser
@@ -26,6 +27,42 @@ class DataProcessor:
except Exception as e:
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
def process_opensearch_data(
self, raw_documents: List[dict], field_mapping
) -> ProcessedData:
"""Process raw OpenSearch documents using field mapping."""
try:
# Transform documents using field mapping
transformed_docs = FieldMapper.transform_documents(
raw_documents, field_mapping
)
# Parse transformed documents
documents = []
for doc_dict in transformed_docs:
try:
# Ensure required fields are present with defaults if needed
if "id" not in doc_dict or not doc_dict["id"]:
doc_dict["id"] = f"doc_{len(documents)}"
doc = Document(**doc_dict)
documents.append(doc)
except Exception:
continue # Skip invalid documents
if not documents:
return ProcessedData(
documents=[],
embeddings=np.array([]),
error="No valid documents after transformation",
)
embeddings = self._extract_embeddings(documents)
return ProcessedData(documents=documents, embeddings=embeddings)
except Exception as e:
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
if not documents:
return np.array([])