refactor and add tests, v0.2.0
This commit is contained in:
0
src/embeddingbuddy/data/__init__.py
Normal file
0
src/embeddingbuddy/data/__init__.py
Normal file
39
src/embeddingbuddy/data/parser.py
Normal file
39
src/embeddingbuddy/data/parser.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import json
|
||||
import uuid
|
||||
import base64
|
||||
from typing import List, Union
|
||||
from ..models.schemas import Document, ProcessedData
|
||||
|
||||
|
||||
class NDJSONParser:
    """Parse newline-delimited JSON (NDJSON) into Document objects."""

    @staticmethod
    def parse_upload_contents(contents: str) -> List[Document]:
        """Parse a browser-upload payload of the form "<content-type>,<base64-data>".

        Args:
            contents: Data-URI style string (e.g. from a dcc.Upload
                component); everything after the first comma is the
                base64-encoded file body.

        Returns:
            One Document per non-empty NDJSON line in the decoded file.
        """
        # Split on the FIRST comma only: the base64 payload never contains
        # commas, but the content-type prefix may, and an unbounded split
        # would then raise "too many values to unpack".
        _content_type, content_string = contents.split(',', 1)
        decoded = base64.b64decode(content_string)
        text_content = decoded.decode('utf-8')
        return NDJSONParser.parse_text(text_content)

    @staticmethod
    def parse_text(text_content: str) -> List[Document]:
        """Parse raw NDJSON text: one JSON object per non-empty line.

        Blank lines are skipped.

        Raises:
            json.JSONDecodeError: If a line is not valid JSON.
            KeyError: If a line lacks a required 'text' or 'embedding' key.
        """
        return [
            NDJSONParser._dict_to_document(json.loads(line))
            for line in text_content.strip().split('\n')
            if line.strip()
        ]

    @staticmethod
    def _dict_to_document(doc_dict: dict) -> Document:
        """Build a Document from a parsed JSON dict.

        A random UUID is generated when 'id' is absent. Unlike the naive
        implementation, the caller's dict is NOT mutated.

        Raises:
            KeyError: If 'text' or 'embedding' is missing.
        """
        doc_id = doc_dict['id'] if 'id' in doc_dict else str(uuid.uuid4())
        return Document(
            id=doc_id,
            text=doc_dict['text'],
            embedding=doc_dict['embedding'],
            category=doc_dict.get('category'),
            subcategory=doc_dict.get('subcategory'),
            tags=doc_dict.get('tags')
        )
|
54
src/embeddingbuddy/data/processor.py
Normal file
54
src/embeddingbuddy/data/processor.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
from typing import List, Optional, Tuple
|
||||
from ..models.schemas import Document, ProcessedData
|
||||
from .parser import NDJSONParser
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
|
||||
def __init__(self):
|
||||
self.parser = NDJSONParser()
|
||||
|
||||
def process_upload(self, contents: str, filename: Optional[str] = None) -> ProcessedData:
|
||||
try:
|
||||
documents = self.parser.parse_upload_contents(contents)
|
||||
embeddings = self._extract_embeddings(documents)
|
||||
return ProcessedData(documents=documents, embeddings=embeddings)
|
||||
except Exception as e:
|
||||
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
|
||||
|
||||
def process_text(self, text_content: str) -> ProcessedData:
|
||||
try:
|
||||
documents = self.parser.parse_text(text_content)
|
||||
embeddings = self._extract_embeddings(documents)
|
||||
return ProcessedData(documents=documents, embeddings=embeddings)
|
||||
except Exception as e:
|
||||
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
|
||||
|
||||
def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
|
||||
if not documents:
|
||||
return np.array([])
|
||||
return np.array([doc.embedding for doc in documents])
|
||||
|
||||
def combine_data(self, doc_data: ProcessedData, prompt_data: Optional[ProcessedData] = None) -> Tuple[np.ndarray, List[Document], Optional[List[Document]]]:
|
||||
if not doc_data or doc_data.error:
|
||||
raise ValueError("Invalid document data")
|
||||
|
||||
all_embeddings = doc_data.embeddings
|
||||
documents = doc_data.documents
|
||||
prompts = None
|
||||
|
||||
if prompt_data and not prompt_data.error and prompt_data.documents:
|
||||
all_embeddings = np.vstack([doc_data.embeddings, prompt_data.embeddings])
|
||||
prompts = prompt_data.documents
|
||||
|
||||
return all_embeddings, documents, prompts
|
||||
|
||||
def split_reduced_data(self, reduced_embeddings: np.ndarray, n_documents: int, n_prompts: int = 0) -> Tuple[np.ndarray, Optional[np.ndarray]]:
|
||||
doc_reduced = reduced_embeddings[:n_documents]
|
||||
prompt_reduced = None
|
||||
|
||||
if n_prompts > 0:
|
||||
prompt_reduced = reduced_embeddings[n_documents:n_documents + n_prompts]
|
||||
|
||||
return doc_reduced, prompt_reduced
|
Reference in New Issue
Block a user