Files
embedding-buddy/assets/embeddings.js
Austin Godber cdaaffd735
Some checks failed
Security Scan / security (pull_request) Successful in 44s
Security Scan / dependency-check (pull_request) Successful in 49s
Test Suite / lint (pull_request) Failing after 40s
Test Suite / test (3.11) (pull_request) Successful in 1m39s
Test Suite / build (pull_request) Has been skipped
add in browser embedding generation
2025-09-06 07:16:30 -07:00

278 lines
11 KiB
JavaScript

// In-browser text embedding generation built on Transformers.js.
// Everything in this file executes client-side (privacy and performance).
//
// Global initialization flags, both starting out false. `transformersLoaded`
// is switched to true once the embedder picks up the global pipeline.
Object.assign(window, {
  transformersLoading: false,
  transformersLoaded: false,
});
/**
 * Browser-side wrapper around a Transformers.js `feature-extraction` pipeline.
 *
 * Loads extractors on demand through the globally provided
 * `window.transformersPipeline`, caches one extractor per model name, and
 * turns lists of texts into plain-array embeddings. Optional UI hooks
 * (`window.updateModelLoadingProgress`, `window.updateEmbeddingProgress`)
 * are called when present.
 */
class TransformersEmbedder {
  constructor() {
    // Extractor for the currently selected model; null until a model loads.
    this.extractor = null;
    this.currentModel = null;
    // modelName -> extractor, so switching back to a model is instant.
    this.modelCache = new Map();
    this.isLoading = false;
    // Bookkeeping for the single in-flight load, so that repeated requests
    // for the same model can join it instead of failing.
    this.loadingModel = null;
    this.loadingPromise = null;
  }

  /**
   * Make `modelName` the active model, loading it if it is not cached yet.
   *
   * Never rejects: failures are reported through the returned object.
   * A call made while the SAME model is still loading waits for that load
   * and shares its result; a call for a DIFFERENT model during a load
   * returns the "already in progress" error.
   *
   * @param {string} [modelName='Xenova/all-MiniLM-L6-v2'] - Model identifier
   *   handed to the pipeline factory.
   * @returns {Promise<{success: boolean, model?: string, error?: string}>}
   */
  async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') {
    if (this.modelCache.has(modelName)) {
      this.extractor = this.modelCache.get(modelName);
      this.currentModel = modelName;
      return { success: true, model: modelName };
    }

    if (this.isLoading) {
      if (this.loadingPromise && this.loadingModel === modelName) {
        // Same model requested again (e.g. a double click): join the load.
        return this.loadingPromise;
      }
      return { success: false, error: 'Model loading already in progress' };
    }

    this.isLoading = true;
    this.loadingModel = modelName;
    this.loadingPromise = this.loadModel(modelName);
    try {
      return await this.loadingPromise;
    } finally {
      // Single reset point, reached on success and on failure alike.
      this.isLoading = false;
      this.loadingModel = null;
      this.loadingPromise = null;
    }
  }

  /**
   * Internal: perform the actual model load. Use `initializeModel()` instead.
   *
   * @param {string} modelName - Model identifier to load.
   * @returns {Promise<{success: boolean, model?: string, error?: string}>}
   *   Always resolves; errors are logged and converted to a result object.
   */
  async loadModel(modelName) {
    try {
      await this.ensurePipeline();

      // Show loading progress to the user.
      if (window.updateModelLoadingProgress) {
        window.updateModelLoadingProgress(0, `Loading ${modelName}...`);
      }

      const extractor = await window.transformers.pipeline('feature-extraction', modelName, {
        progress_callback: (data) => {
          if (window.updateModelLoadingProgress && data.progress !== undefined) {
            const progress = Math.round(data.progress);
            window.updateModelLoadingProgress(progress, data.status || 'Loading...');
          }
        }
      });

      this.modelCache.set(modelName, extractor);
      this.extractor = extractor;
      this.currentModel = modelName;

      if (window.updateModelLoadingProgress) {
        window.updateModelLoadingProgress(100, 'Model loaded successfully');
      }
      return { success: true, model: modelName };
    } catch (error) {
      console.error('Model initialization error:', error);
      return { success: false, error: error.message };
    }
  }

  /**
   * Internal: make sure `window.transformers.pipeline` is available, polling
   * for `window.transformersPipeline` for up to 5 seconds (50 x 100 ms).
   *
   * @returns {Promise<void>}
   * @throws {Error} If the global pipeline never shows up.
   */
  async ensurePipeline() {
    if (window.transformers) {
      return;
    }

    const maxAttempts = 50;
    const pollIntervalMs = 100;
    let attempts = 0;
    while (!window.transformersPipeline && attempts < maxAttempts) {
      await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
      attempts += 1;
    }
    if (!window.transformersPipeline) {
      throw new Error('Transformers.js pipeline not available. Please refresh the page.');
    }

    window.transformers = { pipeline: window.transformersPipeline };
    window.transformersLoaded = true;
    console.log('✅ Using globally loaded Transformers.js pipeline');
  }

  /**
   * Generate one embedding per input text with the active model.
   *
   * All texts are validated and trimmed before any inference starts, so a
   * bad entry fails fast without leaving extractor calls running.
   *
   * @param {string[]} texts - Non-empty array of non-blank strings.
   * @param {Object} [options] - Extractor options; defaults are
   *   `pooling: 'mean'` and `normalize: true`. Every key except `batchSize`
   *   is forwarded to the extractor.
   * @param {number} [options.batchSize=8] - Number of texts processed
   *   concurrently per batch. Falsy values fall back to 8; fractional
   *   values are rounded down (minimum 1).
   * @returns {Promise<number[][]>} Embeddings in the same order as `texts`.
   * @throws {Error} If no model is initialized, `texts` is empty, a text is
   *   blank, or the extractor returns a result without `data`.
   * @throws {TypeError} If `texts` is not an array.
   * @throws {RangeError} If `batchSize` is negative or not a number.
   */
  async generateEmbeddings(texts, options = {}) {
    if (!this.extractor) {
      throw new Error('Model not initialized. Call initializeModel() first.');
    }
    if (!texts || texts.length === 0) {
      throw new Error('No texts provided for embedding generation.');
    }
    if (!Array.isArray(texts)) {
      throw new TypeError('texts must be an array of strings');
    }

    // `batchSize` only controls batching here; it is not an extractor option.
    const { batchSize: requestedBatchSize, ...extractorOverrides } = options ?? {};
    const extractorOptions = {
      pooling: 'mean',
      normalize: true,
      ...extractorOverrides
    };

    const numericBatchSize = Number(requestedBatchSize || 8);
    if (Number.isNaN(numericBatchSize) || numericBatchSize <= 0) {
      // A non-positive step would keep the batching loop from ever finishing.
      throw new RangeError(`batchSize must be a positive number, got: ${String(requestedBatchSize)}`);
    }
    const batchSize = Math.max(1, Math.floor(numericBatchSize));

    // Pin the extractor so a model switch mid-run cannot mix models.
    const extractor = this.extractor;

    try {
      const cleanedTexts = texts.map((text) => {
        if (!text || text.trim().length === 0) {
          throw new Error('Empty text found in batch');
        }
        return text.trim();
      });

      const embeddings = [];
      const total = cleanedTexts.length;

      // Process in batches to avoid memory issues.
      for (let start = 0; start < total; start += batchSize) {
        const batch = cleanedTexts.slice(start, start + batchSize);
        const batchResults = await Promise.all(
          batch.map((text) => extractor(text, extractorOptions))
        );

        // Copy each result's `data` into a plain array.
        batchResults.forEach((result, idx) => {
          if (!result || !result.data) {
            throw new Error(`Invalid embedding result for text: ${texts[start + idx]}`);
          }
          embeddings.push(Array.from(result.data));
        });

        const processed = start + batch.length;
        if (window.updateEmbeddingProgress) {
          const progress = Math.min(100, (processed / total) * 100);
          window.updateEmbeddingProgress(progress, `Processing ${processed}/${total} texts`);
        }
      }

      if (window.updateEmbeddingProgress) {
        window.updateEmbeddingProgress(100, `Generated ${embeddings.length} embeddings successfully`);
      }
      return embeddings;
    } catch (error) {
      console.error('Embedding generation error:', error);
      throw error;
    }
  }
}
// Single shared embedder, published on `window` so other scripts
// (including the Dash clientside callback below) can reach it.
Object.assign(window, { transformersEmbedder: new TransformersEmbedder() });
console.log('📦 TransformersEmbedder instance created');
// Global progress update functions.
// Both updaters behave identically apart from the element ids they target,
// so they are built from one factory. The bare block keeps the factory out
// of the global scope.
{
  /**
   * Build a `(progress, status)` updater bound to a progress-bar element and
   * a status-text element, looked up by id on every call.
   *
   * @param {string} barId - id of the progress bar element.
   * @param {string} statusId - id of the status text element.
   * @returns {function(number, string): void} Updater that sets the bar's
   *   width and `aria-valuenow` (clamped to 0-100) and the status text.
   *   Missing elements are ignored; a non-finite progress value leaves the
   *   bar untouched instead of writing `NaN%`.
   */
  const makeProgressUpdater = (barId, statusId) => (progress, status) => {
    const progressBar = document.getElementById(barId);
    const statusText = document.getElementById(statusId);

    const value = Number(progress);
    if (progressBar && Number.isFinite(value)) {
      // Keep the bar inside its track and aria-valuenow inside 0-100.
      const percent = Math.min(100, Math.max(0, value));
      progressBar.style.width = `${percent}%`;
      progressBar.setAttribute('aria-valuenow', percent);
    }
    if (statusText) {
      statusText.textContent = status;
    }
  };

  window.updateModelLoadingProgress = makeProgressUpdater(
    'model-loading-progress',
    'model-loading-status'
  );
  window.updateEmbeddingProgress = makeProgressUpdater(
    'embedding-progress',
    'embedding-status'
  );
}
// Dash clientside callback namespace: reuse the existing object when one is
// already defined, otherwise start with an empty one.
if (!window.dash_clientside) {
  window.dash_clientside = {};
}
console.log('🔧 Setting up window.dash_clientside.transformers');
window.dash_clientside.transformers = {
  /**
   * Dash clientside callback: split the input text into chunks, embed every
   * chunk with the shared `window.transformersEmbedder`, and return the
   * documents together with a status message.
   *
   * @param {number} nClicks - Click count of the triggering control; a falsy
   *   value means "not triggered".
   * @param {string} textContent - Raw text to embed.
   * @param {string} modelName - Model identifier passed to the embedder.
   * @param {string} tokenizationMethod - 'sentence', 'paragraph' or 'manual'
   *   (one chunk per line); any other value embeds the whole text as one chunk.
   * @param {string} [category] - Document category; defaults to "Text Input".
   * @param {string} [subcategory] - Document subcategory; defaults to "Generated".
   * @returns {Promise<Array|*>} `window.dash_clientside.no_update` when there
   *   is nothing to do, otherwise a 4-item array
   *   `[payload, statusMessage, "success" | "danger", false]` where `payload`
   *   is `{ documents, embeddings }` on success or `{ error }` on failure.
   *   NOTE(review): the meaning of the last two items depends on the Dash
   *   callback's Output list, which is not visible here.
   */
  generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
    console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });

    // A non-string value is treated like missing text rather than crashing
    // on `.trim()` outside the try block.
    if (!nClicks || typeof textContent !== 'string' || textContent.trim().length === 0) {
      console.log('⚠️ Early return - missing required parameters');
      return window.dash_clientside.no_update;
    }

    // Every failure path returns the same tuple shape.
    const failure = (error, statusMessage) => [{ error }, statusMessage, "danger", false];

    try {
      // Initialize model if needed.
      const initResult = await window.transformersEmbedder.initializeModel(modelName);
      if (!initResult.success) {
        return failure(initResult.error, `❌ Model loading error: ${initResult.error}`);
      }

      // Split the text according to the requested method.
      const trimmedText = textContent.trim();
      const splitIntoChunks = (separator) => trimmedText
        .split(separator)
        .map((piece) => piece.trim())
        .filter((piece) => piece.length > 0);

      let textChunks;
      switch (tokenizationMethod) {
        case 'sentence':
          // Simple sentence splitting - can be enhanced with proper NLP.
          // The terminating punctuation is consumed by the split.
          textChunks = splitIntoChunks(/[.!?]+/);
          break;
        case 'paragraph':
          textChunks = splitIntoChunks(/\n\s*\n/);
          break;
        case 'manual':
          textChunks = splitIntoChunks('\n');
          break;
        default:
          textChunks = [trimmedText];
      }

      if (textChunks.length === 0) {
        return failure(
          'No valid text chunks found after tokenization',
          '❌ Error: No valid text chunks found after tokenization'
        );
      }

      // Generate embeddings.
      const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);
      if (!embeddings || embeddings.length !== textChunks.length) {
        return failure(
          'Embedding generation failed - mismatch in text chunks and embeddings',
          '❌ Error: Embedding generation failed'
        );
      }

      // One timestamp per run so all ids of a batch share the same prefix.
      const batchTimestamp = Date.now();
      const documents = textChunks.map((text, i) => ({
        id: `text_input_${batchTimestamp}_${i}`,
        text: text,
        embedding: embeddings[i],
        category: category || "Text Input",
        subcategory: subcategory || "Generated",
        tags: []
      }));

      // Report the model the embedder actually used; this differs from
      // `modelName` when the argument was omitted and the default applied.
      const usedModel = initResult.model ?? modelName;
      return [
        {
          documents: documents,
          embeddings: embeddings
        },
        `✅ Generated embeddings for ${documents.length} text chunks using ${usedModel}`,
        "success",
        false
      ];
    } catch (error) {
      console.error('Client-side embedding error:', error);
      return failure(error.message, `❌ Error: ${error.message}`);
    }
  }
};
// Final diagnostics: report which of the globals set up by this file exist.
console.log('✅ Transformers.js client-side setup complete');
{
  const clientside = window.dash_clientside;
  const transformersNamespace = clientside?.transformers;
  console.log('Available:', {
    transformersEmbedder: Boolean(window.transformersEmbedder),
    dashClientside: Boolean(clientside),
    transformersModule: Boolean(transformersNamespace),
    generateFunction: typeof transformersNamespace?.generateEmbeddings
  });
}