// embedding-buddy/assets/embeddings.js

// Text input embedding generation using Transformers.js
// This module runs entirely in the browser for privacy and performance

// Global flags tracking Transformers.js initialization, both starting out false.
// `transformersLoaded` is flipped to true further down once the pipeline
// factory has been picked up; nothing visible here writes `transformersLoading`
// again, so it is presumably managed by the script that loads the library
// (TODO confirm).
window.transformersLoading = false;
window.transformersLoaded = false;

/**
 * Browser-side wrapper around a Transformers.js feature-extraction pipeline.
 *
 * Loaded pipelines are cached per model name, so re-selecting a model that
 * was already loaded does not build a second pipeline. The pipeline factory
 * is read from `window.transformers.pipeline`; when `window.transformers` is
 * missing it is created from `window.transformersPipeline`, which some other
 * script is expected to provide.
 */
class TransformersEmbedder {
    /** Model name used when the caller supplies none (or passes null). */
    static get DEFAULT_MODEL() {
        return 'Xenova/all-MiniLM-L6-v2';
    }

    constructor() {
        // Callable pipeline of the active model; null until a load succeeds.
        this.extractor = null;
        // Name of the model behind `extractor`.
        this.currentModel = null;
        // Model name -> loaded pipeline.
        this.modelCache = new Map();
        // True while a pipeline load is in flight; overlapping loads are refused.
        this.isLoading = false;
    }

    /**
     * Loads the feature-extraction pipeline for a model, or re-activates it
     * from the cache.
     *
     * Failures while loading are logged and reported through the result
     * object rather than thrown.
     *
     * @param {string|null} [modelName] - Model identifier; `undefined` or
     *     `null` selects `TransformersEmbedder.DEFAULT_MODEL`.
     * @returns {Promise<{success: boolean, model?: string, error?: string}>}
     *     `{ success: true, model }` on success, `{ success: false, error }`
     *     when another load is in progress or loading failed.
     */
    async initializeModel(modelName = TransformersEmbedder.DEFAULT_MODEL) {
        // Default parameters only replace `undefined`; normalise an explicit
        // null too so it is never used as a cache key or reported model name.
        const resolvedModel = modelName ?? TransformersEmbedder.DEFAULT_MODEL;

        if (this.modelCache.has(resolvedModel)) {
            this.extractor = this.modelCache.get(resolvedModel);
            this.currentModel = resolvedModel;
            return { success: true, model: resolvedModel };
        }

        // This guard sits outside the try/finally below on purpose: refusing
        // a second caller must not clear the flag owned by the first one.
        if (this.isLoading) {
            return { success: false, error: 'Model loading already in progress' };
        }

        this.isLoading = true;
        try {
            // Use globally loaded Transformers.js pipeline
            if (!window.transformers) {
                // Poll every 100 ms, up to 50 times (about 5 seconds).
                let attempts = 0;
                while (!window.transformersPipeline && attempts < 50) {
                    await new Promise((resolve) => setTimeout(resolve, 100));
                    attempts++;
                }
                if (!window.transformersPipeline) {
                    throw new Error('Transformers.js pipeline not available. Please refresh the page.');
                }
                window.transformers = { pipeline: window.transformersPipeline };
                window.transformersLoaded = true;
                console.log('✅ Using globally loaded Transformers.js pipeline');
            }

            // Show loading progress to user
            if (window.updateModelLoadingProgress) {
                window.updateModelLoadingProgress(0, `Loading ${resolvedModel}...`);
            }

            const extractor = await window.transformers.pipeline('feature-extraction', resolvedModel, {
                progress_callback: (data) => {
                    if (window.updateModelLoadingProgress && data.progress !== undefined) {
                        window.updateModelLoadingProgress(
                            Math.round(data.progress),
                            data.status || 'Loading...'
                        );
                    }
                }
            });

            this.extractor = extractor;
            this.modelCache.set(resolvedModel, extractor);
            this.currentModel = resolvedModel;

            if (window.updateModelLoadingProgress) {
                window.updateModelLoadingProgress(100, 'Model loaded successfully');
            }

            return { success: true, model: resolvedModel };
        } catch (error) {
            console.error('Model initialization error:', error);
            return { success: false, error: error.message };
        } finally {
            // Single exit point for the flag, whatever happened above.
            this.isLoading = false;
        }
    }

    /**
     * Computes one embedding per input text with the active pipeline.
     *
     * Every text is validated and trimmed before any inference starts, so a
     * bad entry fails the call without leaving work running in the background.
     *
     * @param {string[]|string} texts - Texts to embed; a single string is
     *     treated as a one-element list.
     * @param {Object} [options] - Options forwarded to the pipeline call
     *     (defaults: `pooling: 'mean'`, `normalize: true`).
     * @param {number} [options.batchSize=8] - Number of texts run
     *     concurrently; consumed here and not forwarded. Anything that is not
     *     a positive integer falls back to 8.
     * @returns {Promise<number[][]>} One plain array per text, in input order.
     * @throws {Error} If no model is initialized, `texts` is empty, an entry
     *     is empty/blank, or the pipeline returns a result without `data`.
     * @throws {TypeError} If an entry is a truthy non-string.
     */
    async generateEmbeddings(texts, options = {}) {
        if (!this.extractor) {
            throw new Error('Model not initialized. Call initializeModel() first.');
        }

        if (!texts || texts.length === 0) {
            throw new Error('No texts provided for embedding generation.');
        }

        // Keep batchSize out of the pipeline options: it only controls the
        // batching done in this method.
        const { batchSize: requestedBatchSize, ...pipelineOverrides } = options ?? {};
        const pipelineOptions = {
            pooling: 'mean',
            normalize: true,
            ...pipelineOverrides
        };

        // A zero, negative or non-integer step would stall or skew the loop.
        const batchSize = Number.isInteger(requestedBatchSize) && requestedBatchSize > 0
            ? requestedBatchSize
            : 8;

        try {
            // A bare string must not be iterated character by character.
            const inputTexts = typeof texts === 'string' ? [texts] : Array.from(texts);

            const cleanedTexts = inputTexts.map((text) => {
                if (!text) {
                    throw new Error('Empty text found in batch');
                }
                if (typeof text !== 'string') {
                    throw new TypeError('Each text must be a string');
                }
                const trimmed = text.trim();
                if (trimmed.length === 0) {
                    throw new Error('Empty text found in batch');
                }
                return trimmed;
            });

            const embeddings = [];

            // Process in batches to avoid memory issues
            for (let start = 0; start < cleanedTexts.length; start += batchSize) {
                const batch = cleanedTexts.slice(start, start + batchSize);

                // The async wrapper turns a synchronous throw from the
                // extractor into a rejection that Promise.all observes.
                const batchResults = await Promise.all(
                    batch.map(async (text) => this.extractor(text, pipelineOptions))
                );

                // Convert tensor output to arrays
                batchResults.forEach((result, idx) => {
                    if (!result || !result.data) {
                        throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
                    }
                    embeddings.push(Array.from(result.data));
                });

                // Update progress
                if (window.updateEmbeddingProgress) {
                    const done = start + batch.length;
                    const progress = Math.min(100, (done / cleanedTexts.length) * 100);
                    window.updateEmbeddingProgress(progress, `Processing ${done}/${cleanedTexts.length} texts`);
                }
            }

            if (window.updateEmbeddingProgress) {
                window.updateEmbeddingProgress(100, `Generated ${embeddings.length} embeddings successfully`);
            }

            return embeddings;
        } catch (error) {
            console.error('Embedding generation error:', error);
            throw error;
        }
    }
}

// Global instance: one shared embedder exposed on `window`. The Dash
// clientside callback defined later in this file calls it through
// `window.transformersEmbedder`, so its per-model pipeline cache is shared
// by every invocation on the page.
window.transformersEmbedder = new TransformersEmbedder();
console.log('📦 TransformersEmbedder instance created');

// Global progress update functions
/**
 * Reflects model-loading progress in the page.
 *
 * Looks up `#model-loading-progress` and `#model-loading-status`; whichever
 * of the two is missing from the DOM is skipped.
 *
 * @param {number} progress - Written as the bar's CSS width (with a `%`
 *     suffix) and as its `aria-valuenow` attribute.
 * @param {string} status - Written as the status element's text content.
 */
window.updateModelLoadingProgress = function(progress, status) {
    const [barEl, labelEl] = ['model-loading-progress', 'model-loading-status']
        .map((elementId) => document.getElementById(elementId));

    if (barEl) {
        barEl.style.width = progress + '%';
        barEl.setAttribute('aria-valuenow', progress);
    }

    if (labelEl) {
        labelEl.textContent = status;
    }
};

/**
 * Reflects embedding-generation progress in the page.
 *
 * Looks up `#embedding-progress` and `#embedding-status`; whichever of the
 * two is missing from the DOM is skipped.
 *
 * @param {number} progress - Written as the bar's CSS width (with a `%`
 *     suffix) and as its `aria-valuenow` attribute.
 * @param {string} status - Written as the status element's text content.
 */
window.updateEmbeddingProgress = function(progress, status) {
    const [meterEl, messageEl] = ['embedding-progress', 'embedding-status']
        .map((elementId) => document.getElementById(elementId));

    if (meterEl) {
        meterEl.style.width = progress + '%';
        meterEl.setAttribute('aria-valuenow', progress);
    }

    if (messageEl) {
        messageEl.textContent = status;
    }
};

// Dash clientside callback functions
window.dash_clientside = window.dash_clientside || {};
console.log('🔧 Setting up window.dash_clientside.transformers');
window.dash_clientside.transformers = {
    /**
     * Dash clientside callback: splits the input text into chunks, embeds
     * each chunk in the browser and returns the resulting documents.
     *
     * @param {number} nClicks - Click count of the triggering control; a
     *     falsy value means "not clicked yet" and yields `no_update`.
     * @param {string} textContent - Raw text to embed.
     * @param {string} [modelName] - Model to load; the embedder's default is
     *     used when omitted.
     * @param {string} tokenizationMethod - `'sentence'`, `'paragraph'` or
     *     `'manual'` (one chunk per line); any other value embeds the whole
     *     text as a single chunk.
     * @param {string} [category] - Category stored on each document
     *     (default "Text Input").
     * @param {string} [subcategory] - Subcategory stored on each document
     *     (default "Generated").
     * @returns {Promise<Array|Object>} `dash_clientside.no_update` when there
     *     is nothing to do, otherwise a four-element array
     *     `[payload, statusMessage, "success" | "danger", false]` where
     *     `payload` is `{ documents, embeddings }` or `{ error }`. The meaning
     *     of the last two slots depends on the callback's Output list, which
     *     is declared outside this file.
     */
    generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
        console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });

        // The type check keeps a non-string value from throwing on .trim()
        // outside the try block below.
        if (!nClicks || typeof textContent !== 'string' || textContent.trim().length === 0) {
            console.log('⚠️ Early return - missing required parameters');
            return window.dash_clientside.no_update;
        }

        // All failure paths share the same four-slot output shape.
        const failure = (errorMessage, statusMessage) => [
            { error: errorMessage },
            statusMessage,
            "danger",
            false
        ];

        try {
            // Initialize model if needed
            const initResult = await window.transformersEmbedder.initializeModel(modelName);
            if (!initResult.success) {
                return failure(initResult.error, `❌ Model loading error: ${initResult.error}`);
            }

            // Report the model that was actually loaded: when modelName is
            // omitted the embedder falls back to its default, and the
            // requested name would read "undefined" in the status message.
            const loadedModel = initResult.model ?? modelName;

            // Tokenize text based on method. A Map (rather than an object
            // literal) keeps inherited keys such as "constructor" from being
            // mistaken for a method name.
            const chunkSeparators = new Map([
                // Simple sentence splitting - can be enhanced with proper NLP
                ['sentence', /[.!?]+/],
                ['paragraph', /\n\s*\n/],
                ['manual', '\n']
            ]);
            const trimmedText = textContent.trim();
            const separator = chunkSeparators.get(tokenizationMethod);
            const textChunks = separator === undefined
                ? [trimmedText]
                : trimmedText
                    .split(separator)
                    .map((piece) => piece.trim())
                    .filter((piece) => piece.length > 0);

            if (textChunks.length === 0) {
                return failure(
                    'No valid text chunks found after tokenization',
                    '❌ Error: No valid text chunks found after tokenization'
                );
            }

            // Generate embeddings
            const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);

            if (!embeddings || embeddings.length !== textChunks.length) {
                return failure(
                    'Embedding generation failed - mismatch in text chunks and embeddings',
                    '❌ Error: Embedding generation failed'
                );
            }

            // Create documents structure. The timestamp is read once so every
            // chunk of this run shares the same id prefix.
            const runTimestamp = Date.now();
            const documents = textChunks.map((text, i) => ({
                id: `text_input_${runTimestamp}_${i}`,
                text: text,
                embedding: embeddings[i],
                category: category || "Text Input",
                subcategory: subcategory || "Generated",
                tags: []
            }));

            return [
                {
                    documents: documents,
                    embeddings: embeddings
                },
                `✅ Generated embeddings for ${documents.length} text chunks using ${loadedModel}`,
                "success",
                false
            ];

        } catch (error) {
            console.error('Client-side embedding error:', error);
            return failure(error.message, `❌ Error: ${error.message}`);
        }
    }
};

// Startup diagnostics: report that this script ran to the end and whether
// the globals it is responsible for (the shared embedder and the Dash
// clientside namespace with its generateEmbeddings function) are present.
console.log('✅ Transformers.js client-side setup complete');
console.log('Available:', {
    transformersEmbedder: !!window.transformersEmbedder,
    dashClientside: !!window.dash_clientside,
    transformersModule: !!window.dash_clientside?.transformers,
    generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings
});