add ci workflows (#1)
All checks were successful
Security Scan / security (push) Successful in 30s
Security Scan / dependency-check (push) Successful in 25s
Test Suite / test (3.11) (push) Successful in 1m16s
Test Suite / lint (push) Successful in 20s
Test Suite / build (push) Successful in 35s

Reviewed-on: #1
2025-08-13 21:03:42 -07:00
parent 809dbeb783
commit 1ec7e2c38c
24 changed files with 2069 additions and 532 deletions

@@ -1,39 +1,38 @@
 import json
 import uuid
 import base64
-from typing import List, Union
-from ..models.schemas import Document, ProcessedData
+from typing import List
+from ..models.schemas import Document
 
 
 class NDJSONParser:
     @staticmethod
     def parse_upload_contents(contents: str) -> List[Document]:
-        content_type, content_string = contents.split(',')
+        content_type, content_string = contents.split(",")
         decoded = base64.b64decode(content_string)
-        text_content = decoded.decode('utf-8')
+        text_content = decoded.decode("utf-8")
         return NDJSONParser.parse_text(text_content)
 
     @staticmethod
     def parse_text(text_content: str) -> List[Document]:
         documents = []
-        for line in text_content.strip().split('\n'):
+        for line in text_content.strip().split("\n"):
             if line.strip():
                 doc_dict = json.loads(line)
                 doc = NDJSONParser._dict_to_document(doc_dict)
                 documents.append(doc)
         return documents
 
     @staticmethod
     def _dict_to_document(doc_dict: dict) -> Document:
-        if 'id' not in doc_dict:
-            doc_dict['id'] = str(uuid.uuid4())
+        if "id" not in doc_dict:
+            doc_dict["id"] = str(uuid.uuid4())
         return Document(
-            id=doc_dict['id'],
-            text=doc_dict['text'],
-            embedding=doc_dict['embedding'],
-            category=doc_dict.get('category'),
-            subcategory=doc_dict.get('subcategory'),
-            tags=doc_dict.get('tags')
-        )
+            id=doc_dict["id"],
+            text=doc_dict["text"],
+            embedding=doc_dict["embedding"],
+            category=doc_dict.get("category"),
+            subcategory=doc_dict.get("subcategory"),
+            tags=doc_dict.get("tags"),
+        )
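
For reference, a minimal, self-contained sketch of the upload path this parser implements. The Document dataclass below is a hypothetical stand-in for ..models.schemas.Document, and the data-URL framing of contents is an assumption inferred from the split(",") call:

import base64
import json
import uuid
from dataclasses import dataclass
from typing import List, Optional


# Hypothetical stand-in for ..models.schemas.Document, only for this sketch.
@dataclass
class Document:
    id: str
    text: str
    embedding: List[float]
    category: Optional[str] = None
    subcategory: Optional[str] = None
    tags: Optional[List[str]] = None


# One JSON object per line; "id" is optional and gets a uuid4 when missing.
ndjson = (
    '{"text": "first doc", "embedding": [0.1, 0.2]}\n'
    '{"id": "doc-2", "text": "second doc", "embedding": [0.3, 0.4]}\n'
)

# Uploads are assumed to arrive as "<header>,<base64 payload>", which is why
# parse_upload_contents splits on the comma before base64-decoding.
contents = "data:application/x-ndjson;base64," + base64.b64encode(
    ndjson.encode("utf-8")
).decode("ascii")

content_type, content_string = contents.split(",")
text = base64.b64decode(content_string).decode("utf-8")

documents = []
for line in text.strip().split("\n"):
    if line.strip():
        record = json.loads(line)
        record.setdefault("id", str(uuid.uuid4()))
        documents.append(Document(**record))

print([d.id for d in documents])  # first id is generated, second stays "doc-2"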

@@ -5,18 +5,19 @@ from .parser import NDJSONParser
 class DataProcessor:
     def __init__(self):
         self.parser = NDJSONParser()
 
-    def process_upload(self, contents: str, filename: Optional[str] = None) -> ProcessedData:
+    def process_upload(
+        self, contents: str, filename: Optional[str] = None
+    ) -> ProcessedData:
         try:
             documents = self.parser.parse_upload_contents(contents)
             embeddings = self._extract_embeddings(documents)
             return ProcessedData(documents=documents, embeddings=embeddings)
         except Exception as e:
             return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
 
     def process_text(self, text_content: str) -> ProcessedData:
         try:
             documents = self.parser.parse_text(text_content)
@@ -24,31 +25,35 @@ class DataProcessor:
             return ProcessedData(documents=documents, embeddings=embeddings)
         except Exception as e:
             return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
 
     def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
         if not documents:
             return np.array([])
         return np.array([doc.embedding for doc in documents])
 
-    def combine_data(self, doc_data: ProcessedData, prompt_data: Optional[ProcessedData] = None) -> Tuple[np.ndarray, List[Document], Optional[List[Document]]]:
+    def combine_data(
+        self, doc_data: ProcessedData, prompt_data: Optional[ProcessedData] = None
+    ) -> Tuple[np.ndarray, List[Document], Optional[List[Document]]]:
         if not doc_data or doc_data.error:
             raise ValueError("Invalid document data")
         all_embeddings = doc_data.embeddings
         documents = doc_data.documents
         prompts = None
         if prompt_data and not prompt_data.error and prompt_data.documents:
             all_embeddings = np.vstack([doc_data.embeddings, prompt_data.embeddings])
             prompts = prompt_data.documents
         return all_embeddings, documents, prompts
 
-    def split_reduced_data(self, reduced_embeddings: np.ndarray, n_documents: int, n_prompts: int = 0) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+    def split_reduced_data(
+        self, reduced_embeddings: np.ndarray, n_documents: int, n_prompts: int = 0
+    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
         doc_reduced = reduced_embeddings[:n_documents]
         prompt_reduced = None
         if n_prompts > 0:
-            prompt_reduced = reduced_embeddings[n_documents:n_documents + n_prompts]
-        return doc_reduced, prompt_reduced
+            prompt_reduced = reduced_embeddings[n_documents : n_documents + n_prompts]
+        return doc_reduced, prompt_reduced
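
And a short numpy sketch of the stack-reduce-split round trip that combine_data and split_reduced_data perform. Shapes are arbitrary, and the reducer is a placeholder, not the project's actual one:

import numpy as np

# Hypothetical shapes: 3 documents and 2 prompts, 4-dim embeddings each.
doc_embeddings = np.random.rand(3, 4)
prompt_embeddings = np.random.rand(2, 4)

# combine_data: stack prompts under documents so one reduction covers both.
all_embeddings = np.vstack([doc_embeddings, prompt_embeddings])
assert all_embeddings.shape == (5, 4)

# Placeholder reducer; the real pipeline would apply PCA/UMAP/etc. here.
reduced = all_embeddings[:, :2]

# split_reduced_data: slice the stacked result back apart by row counts.
n_documents, n_prompts = 3, 2
doc_reduced = reduced[:n_documents]
prompt_reduced = (
    reduced[n_documents : n_documents + n_prompts] if n_prompts > 0 else None
)
assert doc_reduced.shape == (3, 2) and prompt_reduced.shape == (2, 2)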