add ci workflows (#1)
All checks were successful
Security Scan / security (push) Successful in 30s
Security Scan / dependency-check (push) Successful in 25s
Test Suite / test (3.11) (push) Successful in 1m16s
Test Suite / lint (push) Successful in 20s
Test Suite / build (push) Successful in 35s

Reviewed-on: #1
2025-08-13 21:03:42 -07:00
parent 809dbeb783
commit 1ec7e2c38c
24 changed files with 2069 additions and 532 deletions

@@ -1,39 +1,38 @@
 import json
 import uuid
 import base64
-from typing import List, Union
-from ..models.schemas import Document, ProcessedData
+from typing import List
+from ..models.schemas import Document
 
 
 class NDJSONParser:
     @staticmethod
     def parse_upload_contents(contents: str) -> List[Document]:
-        content_type, content_string = contents.split(',')
+        content_type, content_string = contents.split(",")
         decoded = base64.b64decode(content_string)
-        text_content = decoded.decode('utf-8')
+        text_content = decoded.decode("utf-8")
         return NDJSONParser.parse_text(text_content)
 
     @staticmethod
     def parse_text(text_content: str) -> List[Document]:
         documents = []
-        for line in text_content.strip().split('\n'):
+        for line in text_content.strip().split("\n"):
             if line.strip():
                 doc_dict = json.loads(line)
                 doc = NDJSONParser._dict_to_document(doc_dict)
                 documents.append(doc)
         return documents
 
     @staticmethod
     def _dict_to_document(doc_dict: dict) -> Document:
-        if 'id' not in doc_dict:
-            doc_dict['id'] = str(uuid.uuid4())
+        if "id" not in doc_dict:
+            doc_dict["id"] = str(uuid.uuid4())
         return Document(
-            id=doc_dict['id'],
-            text=doc_dict['text'],
-            embedding=doc_dict['embedding'],
-            category=doc_dict.get('category'),
-            subcategory=doc_dict.get('subcategory'),
-            tags=doc_dict.get('tags')
-        )
+            id=doc_dict["id"],
+            text=doc_dict["text"],
+            embedding=doc_dict["embedding"],
+            category=doc_dict.get("category"),
+            subcategory=doc_dict.get("subcategory"),
+            tags=doc_dict.get("tags"),
+        )
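
For reference, a minimal, self-contained sketch of the upload path this parser implements. The Document dataclass below is a hypothetical stand-in for ..models.schemas.Document, and the data-URL framing of contents is an assumption inferred from the split(",") call:

import base64
import json
import uuid
from dataclasses import dataclass
from typing import List, Optional


# Hypothetical stand-in for ..models.schemas.Document, only for this sketch.
@dataclass
class Document:
    id: str
    text: str
    embedding: List[float]
    category: Optional[str] = None
    subcategory: Optional[str] = None
    tags: Optional[List[str]] = None


# One JSON object per line; "id" is optional and gets a uuid4 when missing.
ndjson = (
    '{"text": "first doc", "embedding": [0.1, 0.2]}\n'
    '{"id": "doc-2", "text": "second doc", "embedding": [0.3, 0.4]}\n'
)

# Uploads are assumed to arrive as "<header>,<base64 payload>", which is why
# parse_upload_contents splits on the comma before base64-decoding.
contents = "data:application/x-ndjson;base64," + base64.b64encode(
    ndjson.encode("utf-8")
).decode("ascii")

content_type, content_string = contents.split(",")
text = base64.b64decode(content_string).decode("utf-8")

documents = []
for line in text.strip().split("\n"):
    if line.strip():
        record = json.loads(line)
        record.setdefault("id", str(uuid.uuid4()))
        documents.append(Document(**record))

print([d.id for d in documents])  # first id is generated, second stays "doc-2"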

@@ -5,18 +5,19 @@ from .parser import NDJSONParser
 class DataProcessor:
     def __init__(self):
         self.parser = NDJSONParser()
 
-    def process_upload(self, contents: str, filename: Optional[str] = None) -> ProcessedData:
+    def process_upload(
+        self, contents: str, filename: Optional[str] = None
+    ) -> ProcessedData:
         try:
             documents = self.parser.parse_upload_contents(contents)
             embeddings = self._extract_embeddings(documents)
             return ProcessedData(documents=documents, embeddings=embeddings)
         except Exception as e:
             return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
 
     def process_text(self, text_content: str) -> ProcessedData:
         try:
             documents = self.parser.parse_text(text_content)
@@ -24,31 +25,35 @@ class DataProcessor:
             return ProcessedData(documents=documents, embeddings=embeddings)
         except Exception as e:
             return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
 
     def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
         if not documents:
             return np.array([])
         return np.array([doc.embedding for doc in documents])
 
-    def combine_data(self, doc_data: ProcessedData, prompt_data: Optional[ProcessedData] = None) -> Tuple[np.ndarray, List[Document], Optional[List[Document]]]:
+    def combine_data(
+        self, doc_data: ProcessedData, prompt_data: Optional[ProcessedData] = None
+    ) -> Tuple[np.ndarray, List[Document], Optional[List[Document]]]:
         if not doc_data or doc_data.error:
             raise ValueError("Invalid document data")
         all_embeddings = doc_data.embeddings
         documents = doc_data.documents
         prompts = None
         if prompt_data and not prompt_data.error and prompt_data.documents:
             all_embeddings = np.vstack([doc_data.embeddings, prompt_data.embeddings])
             prompts = prompt_data.documents
         return all_embeddings, documents, prompts
 
-    def split_reduced_data(self, reduced_embeddings: np.ndarray, n_documents: int, n_prompts: int = 0) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+    def split_reduced_data(
+        self, reduced_embeddings: np.ndarray, n_documents: int, n_prompts: int = 0
+    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
         doc_reduced = reduced_embeddings[:n_documents]
         prompt_reduced = None
         if n_prompts > 0:
-            prompt_reduced = reduced_embeddings[n_documents:n_documents + n_prompts]
-        return doc_reduced, prompt_reduced
+            prompt_reduced = reduced_embeddings[n_documents : n_documents + n_prompts]
+        return doc_reduced, prompt_reduced
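
And a short numpy sketch of the stack-reduce-split round trip that combine_data and split_reduced_data perform. Shapes are arbitrary, and the reducer is a placeholder, not the project's actual one:

import numpy as np

# Hypothetical shapes: 3 documents and 2 prompts, 4-dim embeddings each.
doc_embeddings = np.random.rand(3, 4)
prompt_embeddings = np.random.rand(2, 4)

# combine_data: stack prompts under documents so one reduction covers both.
all_embeddings = np.vstack([doc_embeddings, prompt_embeddings])
assert all_embeddings.shape == (5, 4)

# Placeholder reducer; the real pipeline would apply PCA/UMAP/etc. here.
reduced = all_embeddings[:, :2]

# split_reduced_data: slice the stacked result back apart by row counts.
n_documents, n_prompts = 3, 2
doc_reduced = reduced[:n_documents]
prompt_reduced = (
    reduced[n_documents : n_documents + n_prompts] if n_prompts > 0 else None
)
assert doc_reduced.shape == (3, 2) and prompt_reduced.shape == (2, 2)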