This will load data from OpenSearch.

It doesn't have prompts either.
2025-08-14 13:49:46 -07:00
parent a2adc8b958
commit 9cf2f0e6fa
16 changed files with 1694 additions and 7 deletions
--- a/src/embeddingbuddy/data/processor.py
+++ b/src/embeddingbuddy/data/processor.py
@@ -1,6 +1,7 @@
 import numpy as np
 from typing import List, Optional, Tuple
 from ..models.schemas import Document, ProcessedData
+from ..models.field_mapper import FieldMapper
 from .parser import NDJSONParser


@@ -26,6 +27,42 @@ class DataProcessor:
        except Exception as e:
            return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))

+    def process_opensearch_data(
+        self, raw_documents: List[dict], field_mapping
+    ) -> ProcessedData:
+        """Build a ProcessedData result from raw OpenSearch hits.
+
+        The raw documents are first passed through
+        ``FieldMapper.transform_documents`` together with ``field_mapping``;
+        each resulting dict is then turned into a ``Document``. Entries that
+        cannot be converted are dropped silently. An entry with a missing or
+        falsy ``"id"`` gets a generated one of the form ``doc_<n>``, where
+        ``<n>`` is the number of documents accepted so far.
+
+        Any failure is reported through the ``error`` field of the returned
+        ProcessedData (with empty documents and embeddings) instead of being
+        raised to the caller.
+        """
+        try:
+            # Apply the caller-supplied mapping to the raw hits.
+            mapped_docs = FieldMapper.transform_documents(
+                raw_documents, field_mapping
+            )
+
+            parsed = []
+            for mapped in mapped_docs:
+                try:
+                    # Fall back to a positional id when none is usable.
+                    has_id = "id" in mapped and mapped["id"]
+                    if not has_id:
+                        mapped["id"] = f"doc_{len(parsed)}"
+
+                    parsed.append(Document(**mapped))
+                except Exception:
+                    # Best effort: an entry that fails to build is skipped.
+                    continue
+
+            if parsed:
+                vectors = self._extract_embeddings(parsed)
+                return ProcessedData(documents=parsed, embeddings=vectors)
+
+            # Nothing survived the conversion step.
+            return ProcessedData(
+                documents=[],
+                embeddings=np.array([]),
+                error="No valid documents after transformation",
+            )
+
+        except Exception as exc:
+            return ProcessedData(
+                documents=[], embeddings=np.array([]), error=str(exc)
+            )
+
+
    def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
        if not documents:
            return np.array([])