Add in-browser embedding generation
Some checks failed
Security Scan / security (pull_request) Successful in 44s
Security Scan / dependency-check (pull_request) Successful in 49s
Test Suite / lint (pull_request) Failing after 40s
Test Suite / test (3.11) (pull_request) Successful in 1m39s
Test Suite / build (pull_request) Has been skipped
Some checks failed
Security Scan / security (pull_request) Successful in 44s
Security Scan / dependency-check (pull_request) Successful in 49s
Test Suite / lint (pull_request) Failing after 40s
Test Suite / test (3.11) (pull_request) Successful in 1m39s
Test Suite / build (pull_request) Has been skipped
This commit is contained in:
278
assets/embeddings.js
Normal file
278
assets/embeddings.js
Normal file
@@ -0,0 +1,278 @@
|
||||
// Text input embedding generation using Transformers.js.
// This module runs entirely in the browser for privacy and performance.

// Global flags tracking Transformers.js initialization.
// A value already set by another script is preserved, so script load order
// cannot reset a flag back to false. `globalThis` is the `window` object in a
// browser page.
globalThis.transformersLoading = globalThis.transformersLoading ?? false;
globalThis.transformersLoaded = globalThis.transformersLoaded ?? false;
/**
 * Browser-side text embedder built on a Transformers.js feature-extraction
 * pipeline.
 *
 * The pipeline factory is read from `globalThis.transformersPipeline`
 * (`globalThis` is the `window` object in a browser page); this class does not
 * load the library itself. Extractors are cached per model name.
 */
class TransformersEmbedder {
  constructor() {
    // Extractor function for the active model; null until a model is loaded.
    this.extractor = null;
    // Name of the model that `extractor` belongs to.
    this.currentModel = null;
    // Model name -> extractor, so switching back to a model is instant.
    this.modelCache = new Map();
    // True while a model load is in flight.
    this.isLoading = false;
    // Name and promise of the in-flight load, so that concurrent requests for
    // the same model can share it instead of failing.
    this.pendingModel = null;
    this.pendingLoad = null;
  }

  /**
   * Makes `modelName` the active model, loading it if it is not cached.
   *
   * Never rejects: failures are reported through the returned object.
   * A call made while the same model is already loading resolves with that
   * load's result; a call for a different model while one is loading fails
   * with "Model loading already in progress".
   *
   * @param {string} [modelName='Xenova/all-MiniLM-L6-v2']
   * @returns {Promise<{success: boolean, model?: string, error?: string}>}
   */
  async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') {
    if (this.modelCache.has(modelName)) {
      this.extractor = this.modelCache.get(modelName);
      this.currentModel = modelName;
      return { success: true, model: modelName };
    }

    if (this.isLoading) {
      if (this.pendingLoad && this.pendingModel === modelName) {
        return this.pendingLoad;
      }
      return { success: false, error: 'Model loading already in progress' };
    }

    this.isLoading = true;
    this.pendingModel = modelName;
    // `finally` guarantees the loading state is cleared on every exit path.
    const load = this._loadModel(modelName).finally(() => {
      this.isLoading = false;
      this.pendingModel = null;
      this.pendingLoad = null;
    });
    this.pendingLoad = load;
    return load;
  }

  /**
   * Performs the actual model load. Catches every error and converts it into
   * a `{ success: false, error }` result.
   *
   * @private
   * @param {string} modelName
   * @returns {Promise<{success: boolean, model?: string, error?: string}>}
   */
  async _loadModel(modelName) {
    const POLL_INTERVAL_MS = 100;
    const MAX_POLLS = 50; // 50 x 100 ms: wait up to 5 seconds

    try {
      if (!globalThis.transformers) {
        // The pipeline factory is installed by another script; poll for it.
        for (let polls = 0; !globalThis.transformersPipeline && polls < MAX_POLLS; polls += 1) {
          await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
        }
        if (!globalThis.transformersPipeline) {
          throw new Error('Transformers.js pipeline not available. Please refresh the page.');
        }
        globalThis.transformers = { pipeline: globalThis.transformersPipeline };
        globalThis.transformersLoaded = true;
        console.log('✅ Using globally loaded Transformers.js pipeline');
      }

      // Show loading progress to the user when a progress hook is installed.
      globalThis.updateModelLoadingProgress?.(0, `Loading ${modelName}...`);

      const extractor = await globalThis.transformers.pipeline('feature-extraction', modelName, {
        progress_callback: (data) => {
          if (globalThis.updateModelLoadingProgress && data.progress !== undefined) {
            globalThis.updateModelLoadingProgress(Math.round(data.progress), data.status || 'Loading...');
          }
        },
      });

      this.extractor = extractor;
      this.modelCache.set(modelName, extractor);
      this.currentModel = modelName;

      globalThis.updateModelLoadingProgress?.(100, 'Model loaded successfully');
      return { success: true, model: modelName };
    } catch (error) {
      console.error('Model initialization error:', error);
      return { success: false, error: error.message };
    }
  }

  /**
   * Computes one embedding per input text with the active model.
   *
   * @param {string[]|string} texts - Texts to embed; a single string is
   *   treated as a one-element list. Each text is trimmed before embedding.
   * @param {Object} [options] - Extractor options (defaults: `pooling: 'mean'`,
   *   `normalize: true`). `batchSize` (default 8) controls how many texts are
   *   processed concurrently and is not forwarded to the extractor; values
   *   that are not a finite number >= 1 fall back to the default.
   * @returns {Promise<number[][]>} Embeddings in the same order as `texts`.
   * @throws {Error} If no model is initialized, `texts` is empty, any text is
   *   empty/blank, or the extractor returns a result without `data`.
   * @throws {TypeError} If a text is not a string.
   */
  async generateEmbeddings(texts, options = {}) {
    if (!this.extractor) {
      throw new Error('Model not initialized. Call initializeModel() first.');
    }
    if (!texts || texts.length === 0) {
      throw new Error('No texts provided for embedding generation.');
    }

    const DEFAULT_BATCH_SIZE = 8;
    const { batchSize, ...extractorOverrides } = options ?? {};
    const extractorOptions = { pooling: 'mean', normalize: true, ...extractorOverrides };
    // A non-positive step would keep the batch loop from ever terminating.
    const step = Number.isFinite(batchSize) && batchSize >= 1
      ? Math.floor(batchSize)
      : DEFAULT_BATCH_SIZE;

    try {
      const inputs = typeof texts === 'string' ? [texts] : Array.from(texts);

      // Validate everything up front so that no extractor call is started for
      // an input set that is going to be rejected anyway.
      const cleaned = inputs.map((text) => {
        if (!text) {
          throw new Error('Empty text found in batch');
        }
        if (typeof text !== 'string') {
          throw new TypeError('Texts must be strings.');
        }
        const trimmed = text.trim();
        if (trimmed.length === 0) {
          throw new Error('Empty text found in batch');
        }
        return trimmed;
      });

      const embeddings = [];
      const total = cleaned.length;

      // Process in batches to avoid memory issues.
      for (let start = 0; start < total; start += step) {
        const batch = cleaned.slice(start, start + step);
        const results = await Promise.all(
          batch.map((text) => this.extractor(text, extractorOptions)),
        );

        // Convert tensor output to plain arrays.
        results.forEach((result, idx) => {
          if (!result?.data) {
            throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
          }
          embeddings.push(Array.from(result.data));
        });

        const done = start + batch.length;
        const percent = Math.min(100, Math.round((done / total) * 100));
        globalThis.updateEmbeddingProgress?.(percent, `Processing ${done}/${total} texts`);
      }

      globalThis.updateEmbeddingProgress?.(100, `Generated ${embeddings.length} embeddings successfully`);
      return embeddings;
    } catch (error) {
      console.error('Embedding generation error:', error);
      throw error;
    }
  }
}
// Global instance, published on the global object (`globalThis` is `window`
// in a browser page) so other scripts and callbacks can reach it.
globalThis.transformersEmbedder = new TransformersEmbedder();
console.log('📦 TransformersEmbedder instance created');
// Global progress update functions

/**
 * Reflects model-loading progress in the page.
 *
 * Sets the width and `aria-valuenow` of the element with id
 * `model-loading-progress` and the text of the element with id
 * `model-loading-status`. Elements that are not in the DOM are skipped.
 *
 * @param {number} progress - Percentage; clamped to 0-100, non-numeric
 *   values are treated as 0 so an invalid CSS width is never written.
 * @param {string} status - Text to display.
 */
globalThis.updateModelLoadingProgress = function (progress, status) {
  const numeric = Number(progress);
  const percent = Number.isFinite(numeric) ? Math.min(100, Math.max(0, numeric)) : 0;

  const progressBar = document.getElementById('model-loading-progress');
  const statusText = document.getElementById('model-loading-status');

  if (progressBar) {
    progressBar.style.width = `${percent}%`;
    progressBar.setAttribute('aria-valuenow', String(percent));
  }
  if (statusText) {
    statusText.textContent = status;
  }
};
/**
 * Reflects embedding-generation progress in the page.
 *
 * Sets the width and `aria-valuenow` of the element with id
 * `embedding-progress` and the text of the element with id
 * `embedding-status`. Elements that are not in the DOM are skipped.
 *
 * @param {number} progress - Percentage; clamped to 0-100, non-numeric
 *   values are treated as 0 so an invalid CSS width is never written.
 * @param {string} status - Text to display.
 */
globalThis.updateEmbeddingProgress = function (progress, status) {
  const numeric = Number(progress);
  const percent = Number.isFinite(numeric) ? Math.min(100, Math.max(0, numeric)) : 0;

  const progressBar = document.getElementById('embedding-progress');
  const statusText = document.getElementById('embedding-status');

  if (progressBar) {
    progressBar.style.width = `${percent}%`;
    progressBar.setAttribute('aria-valuenow', String(percent));
  }
  if (statusText) {
    statusText.textContent = status;
  }
};
// Dash clientside callback functions.
// `globalThis` is the `window` object in a browser page.
globalThis.dash_clientside = globalThis.dash_clientside || {};
console.log('🔧 Setting up window.dash_clientside.transformers');
globalThis.dash_clientside.transformers = {
  /**
   * Splits the input text into chunks and embeds each chunk with the global
   * `transformersEmbedder`.
   *
   * @param {number} nClicks - Click count of the trigger; falsy means "not clicked".
   * @param {string} textContent - Raw text to embed.
   * @param {?string} modelName - Model to use; null/undefined selects the
   *   embedder's default model.
   * @param {string} tokenizationMethod - 'sentence', 'paragraph' or 'manual'
   *   (one chunk per line); any other value embeds the whole text as one chunk.
   * @param {?string} category - Stored on each document; defaults to "Text Input".
   * @param {?string} subcategory - Stored on each document; defaults to "Generated".
   * @returns {Promise<*>} `dash_clientside.no_update` when there is nothing to
   *   do, otherwise a 4-element array `[data, message, color, false]` where
   *   `data` is `{ documents, embeddings }` on success or `{ error }` on
   *   failure. Never rejects.
   */
  generateEmbeddings: async function (nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
    console.log('🚀 generateEmbeddings called with:', {
      nClicks,
      modelName,
      tokenizationMethod,
      textLength: textContent?.length,
    });

    // Non-string input is treated like missing input instead of throwing.
    if (!nClicks || typeof textContent !== 'string' || textContent.trim().length === 0) {
      console.log('⚠️ Early return - missing required parameters');
      return globalThis.dash_clientside.no_update;
    }

    // Builds the 4-element error result shared by every failure path.
    const failure = (detail, banner) => [{ error: detail }, banner, 'danger', false];
    // Splits on `separator`, trims each piece and drops the empty ones.
    const nonEmptyParts = (text, separator) => text
      .split(separator)
      .map((part) => part.trim())
      .filter((part) => part.length > 0);

    try {
      // Dash delivers unset props as null, which would bypass the embedder's
      // default-parameter model; map null to undefined so the default applies.
      const initResult = await globalThis.transformersEmbedder.initializeModel(modelName ?? undefined);
      if (!initResult.success) {
        return failure(initResult.error, `❌ Model loading error: ${initResult.error}`);
      }
      const resolvedModel = initResult.model || modelName;

      // Tokenize text based on method.
      const trimmedText = textContent.trim();
      let textChunks;
      switch (tokenizationMethod) {
        case 'sentence':
          // Simple split on every run of . ! ? (this also splits decimals such
          // as "3.14"); can be enhanced with proper NLP.
          textChunks = nonEmptyParts(trimmedText, /[.!?]+/);
          break;
        case 'paragraph':
          textChunks = nonEmptyParts(trimmedText, /\n\s*\n/);
          break;
        case 'manual':
          textChunks = nonEmptyParts(trimmedText, '\n');
          break;
        default:
          textChunks = [trimmedText];
      }

      if (textChunks.length === 0) {
        return failure(
          'No valid text chunks found after tokenization',
          '❌ Error: No valid text chunks found after tokenization',
        );
      }

      // Generate embeddings.
      const embeddings = await globalThis.transformersEmbedder.generateEmbeddings(textChunks);
      if (!embeddings || embeddings.length !== textChunks.length) {
        return failure(
          'Embedding generation failed - mismatch in text chunks and embeddings',
          '❌ Error: Embedding generation failed',
        );
      }

      // One timestamp per call so that all ids of a run share the same prefix;
      // the chunk index keeps them unique within the run.
      const stamp = Date.now();
      const documents = textChunks.map((text, i) => ({
        id: `text_input_${stamp}_${i}`,
        text,
        embedding: embeddings[i],
        category: category || 'Text Input',
        subcategory: subcategory || 'Generated',
        tags: [],
      }));

      return [
        { documents, embeddings },
        `✅ Generated embeddings for ${documents.length} text chunks using ${resolvedModel}`,
        'success',
        false,
      ];
    } catch (error) {
      console.error('Client-side embedding error:', error);
      return failure(error.message, `❌ Error: ${error.message}`);
    }
  },
};
console.log('✅ Transformers.js client-side setup complete');
// Report which parts of the client-side setup are reachable on the global
// object (`globalThis` is `window` in a browser page). Only booleans and a
// `typeof` string are logged, never the objects themselves.
console.log('Available:', {
  transformersEmbedder: Boolean(globalThis.transformersEmbedder),
  dashClientside: Boolean(globalThis.dash_clientside),
  transformersModule: Boolean(globalThis.dash_clientside?.transformers),
  generateFunction: typeof globalThis.dash_clientside?.transformers?.generateEmbeddings,
});
Reference in New Issue
Block a user