add in browser embedding generation

2025-09-06 07:16:30 -07:00
parent 14abc446b7
commit cdaaffd735
13 changed files with 1582 additions and 3 deletions
--- a/assets/embeddings.js
+++ b/assets/embeddings.js
@@ -0,0 +1,278 @@
+// Text input embedding generation using Transformers.js
+// This module runs entirely in the browser for privacy and performance
+
+// Global flag to track initialization
+window.transformersLoading = false;
+window.transformersLoaded = false;
+
+class TransformersEmbedder {
+    constructor() {
+        this.extractor = null;
+        this.currentModel = null;
+        this.modelCache = new Map();
+        this.isLoading = false;
+    }
+    
+    async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') {
+        try {
+            if (this.modelCache.has(modelName)) {
+                this.extractor = this.modelCache.get(modelName);
+                this.currentModel = modelName;
+                return { success: true, model: modelName };
+            }
+            
+            if (this.isLoading) {
+                return { success: false, error: 'Model loading already in progress' };
+            }
+            
+            this.isLoading = true;
+            
+            // Use globally loaded Transformers.js pipeline
+            if (!window.transformers) {
+                if (!window.transformersPipeline) {
+                    // Wait for the pipeline to load
+                    let attempts = 0;
+                    while (!window.transformersPipeline && attempts < 50) { // Wait up to 5 seconds
+                        await new Promise(resolve => setTimeout(resolve, 100));
+                        attempts++;
+                    }
+                    if (!window.transformersPipeline) {
+                        throw new Error('Transformers.js pipeline not available. Please refresh the page.');
+                    }
+                }
+                window.transformers = { pipeline: window.transformersPipeline };
+                window.transformersLoaded = true;
+                console.log('✅ Using globally loaded Transformers.js pipeline');
+            }
+            
+            // Show loading progress to user
+            if (window.updateModelLoadingProgress) {
+                window.updateModelLoadingProgress(0, `Loading ${modelName}...`);
+            }
+            
+            this.extractor = await window.transformers.pipeline('feature-extraction', modelName, {
+                progress_callback: (data) => {
+                    if (window.updateModelLoadingProgress && data.progress !== undefined) {
+                        const progress = Math.round(data.progress);
+                        window.updateModelLoadingProgress(progress, data.status || 'Loading...');
+                    }
+                }
+            });
+            
+            this.modelCache.set(modelName, this.extractor);
+            this.currentModel = modelName;
+            this.isLoading = false;
+            
+            if (window.updateModelLoadingProgress) {
+                window.updateModelLoadingProgress(100, 'Model loaded successfully');
+            }
+            
+            return { success: true, model: modelName };
+        } catch (error) {
+            this.isLoading = false;
+            console.error('Model initialization error:', error);
+            return { success: false, error: error.message };
+        }
+    }
+    
+    async generateEmbeddings(texts, options = {}) {
+        if (!this.extractor) {
+            throw new Error('Model not initialized. Call initializeModel() first.');
+        }
+        
+        if (!texts || texts.length === 0) {
+            throw new Error('No texts provided for embedding generation.');
+        }
+        
+        const embeddings = [];
+        const defaultOptions = { 
+            pooling: 'mean', 
+            normalize: true,
+            ...options 
+        };
+        
+        // Process in batches to avoid memory issues
+        const batchSize = options.batchSize || 8;
+        
+        try {
+            for (let i = 0; i < texts.length; i += batchSize) {
+                const batch = texts.slice(i, i + batchSize);
+                
+                const batchResults = await Promise.all(
+                    batch.map(text => {
+                        if (!text || text.trim().length === 0) {
+                            throw new Error('Empty text found in batch');
+                        }
+                        return this.extractor(text.trim(), defaultOptions);
+                    })
+                );
+                
+                // Convert tensor output to arrays
+                batchResults.forEach((result, idx) => {
+                    if (result && result.data) {
+                        embeddings.push(Array.from(result.data));
+                    } else {
+                        throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
+                    }
+                });
+                
+                // Update progress
+                const progress = Math.min(100, ((i + batch.length) / texts.length) * 100);
+                if (window.updateEmbeddingProgress) {
+                    window.updateEmbeddingProgress(progress, `Processing ${i + batch.length}/${texts.length} texts`);
+                }
+            }
+            
+            if (window.updateEmbeddingProgress) {
+                window.updateEmbeddingProgress(100, `Generated ${embeddings.length} embeddings successfully`);
+            }
+            
+            return embeddings;
+        } catch (error) {
+            console.error('Embedding generation error:', error);
+            throw error;
+        }
+    }
+}
+
+// Global instance
+window.transformersEmbedder = new TransformersEmbedder();
+console.log('📦 TransformersEmbedder instance created');
+
+// Global progress update functions
+window.updateModelLoadingProgress = function(progress, status) {
+    const progressBar = document.getElementById('model-loading-progress');
+    const statusText = document.getElementById('model-loading-status');
+    if (progressBar) {
+        progressBar.style.width = progress + '%';
+        progressBar.setAttribute('aria-valuenow', progress);
+    }
+    if (statusText) {
+        statusText.textContent = status;
+    }
+};
+
+window.updateEmbeddingProgress = function(progress, status) {
+    const progressBar = document.getElementById('embedding-progress');
+    const statusText = document.getElementById('embedding-status');
+    if (progressBar) {
+        progressBar.style.width = progress + '%';
+        progressBar.setAttribute('aria-valuenow', progress);
+    }
+    if (statusText) {
+        statusText.textContent = status;
+    }
+};
+
+// Dash clientside callback functions
+window.dash_clientside = window.dash_clientside || {};
+console.log('🔧 Setting up window.dash_clientside.transformers');
+window.dash_clientside.transformers = {
+    generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
+        console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });
+        
+        if (!nClicks || !textContent || textContent.trim().length === 0) {
+            console.log('⚠️ Early return - missing required parameters');
+            return window.dash_clientside.no_update;
+        }
+        
+        try {
+            // Initialize model if needed
+            const initResult = await window.transformersEmbedder.initializeModel(modelName);
+            if (!initResult.success) {
+                return [
+                    { error: initResult.error },
+                    `❌ Model loading error: ${initResult.error}`,
+                    "danger",
+                    false
+                ];
+            }
+            
+            // Tokenize text based on method
+            let textChunks;
+            const trimmedText = textContent.trim();
+            
+            switch (tokenizationMethod) {
+                case 'sentence':
+                    // Simple sentence splitting - can be enhanced with proper NLP
+                    textChunks = trimmedText
+                        .split(/[.!?]+/)
+                        .map(s => s.trim())
+                        .filter(s => s.length > 0);
+                    break;
+                case 'paragraph':
+                    textChunks = trimmedText
+                        .split(/\n\s*\n/)
+                        .map(s => s.trim())
+                        .filter(s => s.length > 0);
+                    break;
+                case 'manual':
+                    textChunks = trimmedText
+                        .split('\n')
+                        .map(s => s.trim())
+                        .filter(s => s.length > 0);
+                    break;
+                default:
+                    textChunks = [trimmedText];
+            }
+            
+            if (textChunks.length === 0) {
+                return [
+                    { error: 'No valid text chunks found after tokenization' },
+                    '❌ Error: No valid text chunks found after tokenization',
+                    "danger",
+                    false
+                ];
+            }
+            
+            // Generate embeddings
+            const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);
+            
+            if (!embeddings || embeddings.length !== textChunks.length) {
+                return [
+                    { error: 'Embedding generation failed - mismatch in text chunks and embeddings' },
+                    '❌ Error: Embedding generation failed',
+                    "danger",
+                    false
+                ];
+            }
+            
+            // Create documents structure
+            const documents = textChunks.map((text, i) => ({
+                id: `text_input_${Date.now()}_${i}`,
+                text: text,
+                embedding: embeddings[i],
+                category: category || "Text Input",
+                subcategory: subcategory || "Generated",
+                tags: []
+            }));
+            
+            return [
+                {
+                    documents: documents,
+                    embeddings: embeddings
+                },
+                `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`,
+                "success",
+                false
+            ];
+            
+        } catch (error) {
+            console.error('Client-side embedding error:', error);
+            return [
+                { error: error.message },
+                `❌ Error: ${error.message}`,
+                "danger", 
+                false
+            ];
+        }
+    }
+};
+
+console.log('✅ Transformers.js client-side setup complete');
+console.log('Available:', {
+    transformersEmbedder: !!window.transformersEmbedder,
+    dashClientside: !!window.dash_clientside,
+    transformersModule: !!window.dash_clientside?.transformers,
+    generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings
+});