refactor and add tests, v0.2.0

2025-08-13 20:07:40 -07:00
parent 76be59254c
commit 809dbeb783
32 changed files with 1401 additions and 32 deletions
--- a/src/embeddingbuddy/models/reducers.py
+++ b/src/embeddingbuddy/models/reducers.py
@@ -0,0 +1,95 @@
+from abc import ABC, abstractmethod
+import numpy as np
+from typing import Optional, Tuple
+from sklearn.decomposition import PCA
+import umap
+from openTSNE import TSNE
+from .schemas import ReducedData
+
+
+class DimensionalityReducer(ABC):
+    
+    def __init__(self, n_components: int = 3, random_state: int = 42):
+        self.n_components = n_components
+        self.random_state = random_state
+        self._reducer = None
+    
+    @abstractmethod
+    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
+        pass
+    
+    @abstractmethod
+    def get_method_name(self) -> str:
+        pass
+
+
+class PCAReducer(DimensionalityReducer):
+    
+    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
+        self._reducer = PCA(n_components=self.n_components)
+        reduced = self._reducer.fit_transform(embeddings)
+        variance_explained = self._reducer.explained_variance_ratio_
+        
+        return ReducedData(
+            reduced_embeddings=reduced,
+            variance_explained=variance_explained,
+            method=self.get_method_name(),
+            n_components=self.n_components
+        )
+    
+    def get_method_name(self) -> str:
+        return "PCA"
+
+
+class TSNEReducer(DimensionalityReducer):
+    
+    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
+        self._reducer = TSNE(n_components=self.n_components, random_state=self.random_state)
+        reduced = self._reducer.fit(embeddings)
+        
+        return ReducedData(
+            reduced_embeddings=reduced,
+            variance_explained=None,
+            method=self.get_method_name(),
+            n_components=self.n_components
+        )
+    
+    def get_method_name(self) -> str:
+        return "t-SNE"
+
+
+class UMAPReducer(DimensionalityReducer):
+    
+    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
+        self._reducer = umap.UMAP(n_components=self.n_components, random_state=self.random_state)
+        reduced = self._reducer.fit_transform(embeddings)
+        
+        return ReducedData(
+            reduced_embeddings=reduced,
+            variance_explained=None,
+            method=self.get_method_name(),
+            n_components=self.n_components
+        )
+    
+    def get_method_name(self) -> str:
+        return "UMAP"
+
+
+class ReducerFactory:
+    
+    @staticmethod
+    def create_reducer(method: str, n_components: int = 3, random_state: int = 42) -> DimensionalityReducer:
+        method_lower = method.lower()
+        
+        if method_lower == 'pca':
+            return PCAReducer(n_components=n_components, random_state=random_state)
+        elif method_lower == 'tsne':
+            return TSNEReducer(n_components=n_components, random_state=random_state)
+        elif method_lower == 'umap':
+            return UMAPReducer(n_components=n_components, random_state=random_state)
+        else:
+            raise ValueError(f"Unknown reduction method: {method}")
+    
+    @staticmethod
+    def get_available_methods() -> list:
+        return ['pca', 'tsne', 'umap']