refactor and add tests, v0.2.0

This commit is contained in:
2025-08-13 20:07:40 -07:00
parent 76be59254c
commit 809dbeb783
32 changed files with 1401 additions and 32 deletions

View File

View File

@@ -0,0 +1,95 @@
from abc import ABC, abstractmethod
import numpy as np
from typing import Optional, Tuple
from sklearn.decomposition import PCA
import umap
from openTSNE import TSNE
from .schemas import ReducedData
class DimensionalityReducer(ABC):
    """Abstract interface shared by the dimensionality-reduction strategies.

    The base class only stores the common settings; concrete subclasses
    supply ``fit_transform`` and ``get_method_name``.
    """

    def __init__(self, n_components: int = 3, random_state: int = 42):
        """Store the settings common to every reducer.

        Args:
            n_components: Number of output dimensions requested.
            random_state: Seed value kept on the instance for subclasses.
        """
        # Starts empty; subclasses assign their backend object to it.
        self._reducer = None
        self.random_state = random_state
        self.n_components = n_components

    @abstractmethod
    def get_method_name(self) -> str:
        """Return the name of the reduction method."""

    @abstractmethod
    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        """Reduce ``embeddings`` and return the result as ``ReducedData``."""
class PCAReducer(DimensionalityReducer):
    """Reducer backed by scikit-learn's ``PCA``."""

    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        """Fit PCA on ``embeddings`` and return the projection.

        Args:
            embeddings: Data passed unchanged to ``PCA.fit_transform``.

        Returns:
            A ``ReducedData`` carrying the projected data, the fitted model's
            ``explained_variance_ratio_``, the method name and ``n_components``.
        """
        # Forward the seed, as the t-SNE and UMAP reducers in this module do.
        # scikit-learn documents ``random_state`` as used by the "arpack" and
        # "randomized" SVD solvers, so without it repeated runs may differ
        # whenever one of those solvers is selected.
        self._reducer = PCA(
            n_components=self.n_components,
            random_state=self.random_state,
        )
        reduced = self._reducer.fit_transform(embeddings)
        variance_explained = self._reducer.explained_variance_ratio_
        return ReducedData(
            reduced_embeddings=reduced,
            variance_explained=variance_explained,
            method=self.get_method_name(),
            n_components=self.n_components,
        )

    def get_method_name(self) -> str:
        """Return the name of this reduction method."""
        return "PCA"
class TSNEReducer(DimensionalityReducer):
    """Reducer backed by openTSNE's ``TSNE``."""

    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        """Fit t-SNE on ``embeddings`` and return the resulting embedding.

        Args:
            embeddings: Data passed unchanged to ``TSNE.fit``.

        Returns:
            A ``ReducedData`` carrying the value returned by ``TSNE.fit``,
            ``variance_explained=None``, the method name and ``n_components``.
        """
        tsne_options = {
            "n_components": self.n_components,
            "random_state": self.random_state,
        }
        if self.n_components > 2:
            # NOTE(review): openTSNE's default "auto" gradient method switches
            # to the FFT interpolation scheme on large inputs, and that scheme
            # is, per the openTSNE docs, only implemented for 1-D/2-D
            # embeddings. Pin Barnes-Hut so the 3-component default of this
            # class keeps working regardless of input size - confirm against
            # the installed openTSNE version.
            tsne_options["negative_gradient_method"] = "bh"
        self._reducer = TSNE(**tsne_options)
        reduced = self._reducer.fit(embeddings)
        return ReducedData(
            reduced_embeddings=reduced,
            variance_explained=None,
            method=self.get_method_name(),
            n_components=self.n_components,
        )

    def get_method_name(self) -> str:
        """Return the name of this reduction method."""
        return "t-SNE"
class UMAPReducer(DimensionalityReducer):
    """Reducer backed by ``umap.UMAP``."""

    def get_method_name(self) -> str:
        """Return the name of this reduction method."""
        return "UMAP"

    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        """Fit UMAP on ``embeddings`` and return the projection.

        Args:
            embeddings: Data passed unchanged to ``UMAP.fit_transform``.

        Returns:
            A ``ReducedData`` carrying the projected data,
            ``variance_explained=None``, the method name and ``n_components``.
        """
        projector = umap.UMAP(
            n_components=self.n_components,
            random_state=self.random_state,
        )
        # Keep the backend on the instance before fitting, so it is available
        # even if the fit itself raises.
        self._reducer = projector
        coords = projector.fit_transform(embeddings)
        label = self.get_method_name()
        return ReducedData(
            reduced_embeddings=coords,
            variance_explained=None,
            method=label,
            n_components=self.n_components,
        )
class ReducerFactory:
    """Create ``DimensionalityReducer`` instances from a method name."""

    @staticmethod
    def _normalize_method(method: str) -> str:
        """Return the canonical lookup key for a user-supplied method name.

        Surrounding whitespace, letter case, hyphens and underscores are
        ignored, so "t-SNE" (the name ``TSNEReducer.get_method_name`` reports)
        maps to "tsne".
        """
        return method.strip().lower().replace("-", "").replace("_", "")

    @staticmethod
    def create_reducer(method: str, n_components: int = 3, random_state: int = 42) -> DimensionalityReducer:
        """Instantiate the reducer registered under ``method``.

        Args:
            method: Method name; matched case-insensitively, ignoring
                surrounding whitespace, hyphens and underscores.
            n_components: Forwarded to the reducer constructor.
            random_state: Forwarded to the reducer constructor.

        Returns:
            A new ``PCAReducer``, ``TSNEReducer`` or ``UMAPReducer``.

        Raises:
            ValueError: If ``method`` is not a string or names no known method.
        """
        # Non-strings used to fail with AttributeError on ``.lower()``; report
        # them through the same ValueError as any other unknown method.
        if not isinstance(method, str):
            raise ValueError(f"Unknown reduction method: {method}")

        key = ReducerFactory._normalize_method(method)
        if key not in ReducerFactory.get_available_methods():
            raise ValueError(f"Unknown reduction method: {method}")

        # Keys must stay in sync with get_available_methods(). The table is
        # built only after validation so that rejecting a name never touches
        # the reducer classes.
        reducer_classes = {
            "pca": PCAReducer,
            "tsne": TSNEReducer,
            "umap": UMAPReducer,
        }
        return reducer_classes[key](n_components=n_components, random_state=random_state)

    @staticmethod
    def get_available_methods() -> list:
        """Return the canonical method names accepted by ``create_reducer``."""
        return ['pca', 'tsne', 'umap']

View File

@@ -0,0 +1,58 @@
from typing import List, Optional, Any, Dict
from dataclasses import dataclass
import numpy as np
@dataclass
class Document:
    """A text record together with its embedding and optional labels."""

    id: str
    text: str
    embedding: List[float]
    category: Optional[str] = None
    subcategory: Optional[str] = None
    tags: Optional[List[str]] = None

    def __post_init__(self):
        """Replace omitted optional fields with their fallback values."""
        # A fresh list per instance, so documents never share a tags list.
        self.tags = [] if self.tags is None else self.tags
        for label_field in ("category", "subcategory"):
            if getattr(self, label_field) is None:
                setattr(self, label_field, "Unknown")
@dataclass
class ProcessedData:
    """Documents bundled with their embeddings and an optional error message."""

    documents: List[Document]
    embeddings: np.ndarray
    error: Optional[str] = None

    def __post_init__(self):
        """Convert ``embeddings`` to an ndarray unless it is None or already one."""
        raw = self.embeddings
        if raw is None or isinstance(raw, np.ndarray):
            return
        self.embeddings = np.array(raw)
@dataclass
class ReducedData:
    """Output of a dimensionality reduction plus descriptive metadata."""

    reduced_embeddings: np.ndarray
    variance_explained: Optional[np.ndarray] = None
    method: str = "unknown"
    n_components: int = 2

    def __post_init__(self):
        """Convert ``reduced_embeddings`` to an ndarray if it is not one yet."""
        if isinstance(self.reduced_embeddings, np.ndarray):
            return
        self.reduced_embeddings = np.array(self.reduced_embeddings)
@dataclass
class PlotData:
    """Documents and coordinates to plot, with optional prompt counterparts."""

    documents: List[Document]
    coordinates: np.ndarray
    prompts: Optional[List[Document]] = None
    prompt_coordinates: Optional[np.ndarray] = None

    def __post_init__(self):
        """Convert both coordinate fields to ndarrays where needed."""
        coords = self.coordinates
        if not isinstance(coords, np.ndarray):
            self.coordinates = np.array(coords)

        # The prompt coordinates are optional: None is left untouched.
        prompt_coords = self.prompt_coordinates
        if prompt_coords is None or isinstance(prompt_coords, np.ndarray):
            return
        self.prompt_coordinates = np.array(prompt_coords)