// Page residue captured together with this file (not part of the module):
//   Some checks failed
//   Security Scan / security (pull_request): successful in 44s
//   Security Scan / dependency-check (pull_request): successful in 49s
//   Test Suite / lint (pull_request): failing after 40s
//   Test Suite / test (3.11) (pull_request): successful in 1m39s
//   Test Suite / build (pull_request): skipped
//   File: 278 lines, 11 KiB, JavaScript
// Text input embedding generation using Transformers.js.
// This module runs entirely in the browser for privacy and performance.

// Global flags tracking Transformers.js initialization. Both start out false.
// `transformersLoaded` is set to true by TransformersEmbedder.initializeModel()
// once it has wrapped the globally provided pipeline.
// NOTE(review): `transformersLoading` is not written anywhere else in this
// file; presumably the page's loader script maintains it — confirm.
window.transformersLoading = false;
window.transformersLoaded = false;
/**
 * Generates text embeddings in the browser with a Transformers.js
 * feature-extraction pipeline.
 *
 * Loaded extractors are cached per model name, so switching back to a model
 * that was already initialized does not load it a second time.
 */
class TransformersEmbedder {
    /** Model used when the caller supplies no (or an empty) model name. */
    static get DEFAULT_MODEL() {
        return 'Xenova/all-MiniLM-L6-v2';
    }

    /** Number of texts embedded concurrently when no valid batch size is given. */
    static get DEFAULT_BATCH_SIZE() {
        return 8;
    }

    constructor() {
        // Extractor for `currentModel`; null until initializeModel() succeeds.
        this.extractor = null;
        this.currentModel = null;
        // Model name -> loaded extractor.
        this.modelCache = new Map();
        // True while a model load is in flight; a second load is refused meanwhile.
        this.isLoading = false;
    }

    /**
     * Makes sure `window.transformers.pipeline` exists. If it does not, polls
     * for `window.transformersPipeline` (provided by code outside this class)
     * and wraps it.
     *
     * @param {number} [maxAttempts=50] - Number of polls before giving up.
     * @param {number} [intervalMs=100] - Delay between polls (50 x 100 ms = 5 s).
     * @returns {Promise<void>}
     * @throws {Error} If the pipeline is still missing after the last poll.
     */
    async ensureTransformersReady(maxAttempts = 50, intervalMs = 100) {
        if (window.transformers) {
            return;
        }

        for (let attempt = 0; !window.transformersPipeline && attempt < maxAttempts; attempt++) {
            await new Promise((resolve) => setTimeout(resolve, intervalMs));
        }
        if (!window.transformersPipeline) {
            throw new Error('Transformers.js pipeline not available. Please refresh the page.');
        }

        window.transformers = { pipeline: window.transformersPipeline };
        window.transformersLoaded = true;
        console.log('✅ Using globally loaded Transformers.js pipeline');
    }

    /**
     * Loads (or re-activates from cache) the feature-extraction pipeline for a model.
     *
     * Never throws: failures are logged and reported through the result object.
     *
     * @param {string} [modelName] - Model identifier; null, undefined and ''
     *     all select DEFAULT_MODEL.
     * @returns {Promise<{success: boolean, model?: string, error?: string}>}
     *     `model` is the resolved model name on success, `error` a message on failure.
     */
    async initializeModel(modelName = TransformersEmbedder.DEFAULT_MODEL) {
        // A default parameter only covers `undefined`; callers may also pass
        // null or '' for "no selection", which must not become a cache key.
        const resolvedModel = modelName || TransformersEmbedder.DEFAULT_MODEL;

        if (this.modelCache.has(resolvedModel)) {
            this.extractor = this.modelCache.get(resolvedModel);
            this.currentModel = resolvedModel;
            return { success: true, model: resolvedModel };
        }

        if (this.isLoading) {
            return { success: false, error: 'Model loading already in progress' };
        }

        // Both guards above stay outside the try/finally so that they can never
        // clear the flag owned by a load that is still running.
        this.isLoading = true;
        try {
            await this.ensureTransformersReady();

            // Show loading progress to the user.
            window.updateModelLoadingProgress?.(0, `Loading ${resolvedModel}...`);

            this.extractor = await window.transformers.pipeline('feature-extraction', resolvedModel, {
                progress_callback: (data) => {
                    // Events without a `progress` field are ignored.
                    if (window.updateModelLoadingProgress && data?.progress !== undefined) {
                        window.updateModelLoadingProgress(
                            Math.round(data.progress),
                            data.status || 'Loading...',
                        );
                    }
                },
            });

            this.modelCache.set(resolvedModel, this.extractor);
            this.currentModel = resolvedModel;

            window.updateModelLoadingProgress?.(100, 'Model loaded successfully');

            return { success: true, model: resolvedModel };
        } catch (error) {
            console.error('Model initialization error:', error);
            // Rejections are not guaranteed to be Error instances.
            return { success: false, error: error?.message ?? String(error) };
        } finally {
            this.isLoading = false;
        }
    }

    /**
     * Embeds each text with the currently initialized extractor.
     *
     * @param {string[]} texts - Non-empty array of non-blank strings; each is
     *     trimmed before being embedded.
     * @param {Object} [options] - Extractor options merged over
     *     `{ pooling: 'mean', normalize: true }`. `batchSize` (default 8) is
     *     consumed here and not forwarded; values that are not a finite
     *     number >= 1 fall back to the default, fractions are floored.
     * @returns {Promise<number[][]>} One array per input text, in input order,
     *     copied from the `data` field of each extractor result.
     * @throws {Error} If no model is initialized, `texts` is empty or not an
     *     array, any text is blank or not a string, or the extractor returns a
     *     result without `data`. Errors raised while embedding are logged and
     *     rethrown.
     */
    async generateEmbeddings(texts, options = {}) {
        if (!this.extractor) {
            throw new Error('Model not initialized. Call initializeModel() first.');
        }
        if (!texts || texts.length === 0) {
            throw new Error('No texts provided for embedding generation.');
        }
        if (!Array.isArray(texts)) {
            throw new TypeError('texts must be an array of strings.');
        }

        // `batchSize` only controls batching here, so keep it out of the
        // options handed to the extractor.
        const { batchSize, ...extractorOverrides } = options ?? {};
        const extractorOptions = {
            pooling: 'mean',
            normalize: true,
            ...extractorOverrides,
        };

        // A zero, negative or non-numeric step would stall or corrupt the
        // batch loop below, so anything unusable falls back to the default.
        const requestedSize = Number(batchSize);
        const size = Number.isFinite(requestedSize) && requestedSize >= 1
            ? Math.floor(requestedSize)
            : TransformersEmbedder.DEFAULT_BATCH_SIZE;

        const embeddings = [];

        try {
            // Validate everything up front so a bad entry late in the list does
            // not waste inference on the batches before it.
            const cleanedTexts = texts.map((text) => {
                if (typeof text !== 'string' && text) {
                    throw new Error('Non-string text found in batch');
                }
                if (!text || text.trim().length === 0) {
                    throw new Error('Empty text found in batch');
                }
                return text.trim();
            });
            const total = cleanedTexts.length;

            // Process in batches to avoid memory issues.
            for (let start = 0; start < total; start += size) {
                const batch = cleanedTexts.slice(start, start + size);
                const batchResults = await Promise.all(
                    batch.map((text) => this.extractor(text, extractorOptions)),
                );

                // Convert tensor output to plain arrays.
                batchResults.forEach((result, idx) => {
                    if (!result?.data) {
                        throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
                    }
                    embeddings.push(Array.from(result.data));
                });

                const processed = start + batch.length;
                window.updateEmbeddingProgress?.(
                    Math.min(100, (processed / total) * 100),
                    `Processing ${processed}/${total} texts`,
                );
            }

            window.updateEmbeddingProgress?.(100, `Generated ${embeddings.length} embeddings successfully`);

            return embeddings;
        } catch (error) {
            console.error('Embedding generation error:', error);
            throw error;
        }
    }
}
// Global instance: the single embedder shared by the Dash clientside callback
// registered further down in this file.
window.transformersEmbedder = new TransformersEmbedder();
console.log('📦 TransformersEmbedder instance created');
// Global progress update functions
{
    /**
     * Builds a `(progress, status)` updater bound to one progress-bar element
     * and one status-text element, looked up by id on every call so elements
     * that are not in the page yet are simply skipped.
     *
     * @param {string} barId - Id of the element whose width shows progress.
     * @param {string} statusId - Id of the element that shows the status text.
     * @returns {function(number, string): void} Updater; `progress` is a
     *     percentage clamped to 0-100, and non-finite values leave the bar as is.
     */
    const createProgressUpdater = (barId, statusId) => (progress, status) => {
        const progressBar = document.getElementById(barId);
        const statusText = document.getElementById(statusId);

        const value = Number(progress);
        // Writing 'NaN%' would produce an invalid width, so skip the bar then.
        if (progressBar && Number.isFinite(value)) {
            const percent = Math.min(100, Math.max(0, value));
            progressBar.style.width = `${percent}%`;
            progressBar.setAttribute('aria-valuenow', String(percent));
        }

        if (statusText) {
            // Avoid rendering the literal text "undefined"/"null".
            statusText.textContent = status ?? '';
        }
    };

    // Model loading progress (bar '#model-loading-progress').
    window.updateModelLoadingProgress = createProgressUpdater(
        'model-loading-progress',
        'model-loading-status',
    );

    // Embedding generation progress (bar '#embedding-progress').
    window.updateEmbeddingProgress = createProgressUpdater(
        'embedding-progress',
        'embedding-status',
    );
}
// Dash clientside callback functions
window.dash_clientside = window.dash_clientside || {};
console.log('🔧 Setting up window.dash_clientside.transformers');
window.dash_clientside.transformers = (() => {
    /**
     * Splits already-trimmed text into trimmed, non-empty chunks.
     *
     * @param {string} text - Trimmed, non-empty input text.
     * @param {string} method - 'sentence', 'paragraph' or 'manual'; any other
     *     value keeps the whole text as a single chunk.
     * @returns {string[]} Chunks in document order (possibly empty).
     */
    const splitIntoChunks = (text, method) => {
        let pieces;
        switch (method) {
            case 'sentence':
                // Simple split on runs of . ! ? — the terminators are dropped
                // and abbreviations or decimals are split as well.
                pieces = text.split(/[.!?]+/);
                break;
            case 'paragraph':
                // Paragraphs are separated by at least one blank line.
                pieces = text.split(/\n\s*\n/);
                break;
            case 'manual':
                // One chunk per line.
                pieces = text.split('\n');
                break;
            default:
                return [text];
        }
        return pieces.map((piece) => piece.trim()).filter((piece) => piece.length > 0);
    };

    /** Builds the four-element callback result that reports a failure. */
    const failure = (error, statusMessage) => [{ error }, statusMessage, 'danger', false];

    return {
        /**
         * Clientside callback: chunks the input text, embeds every chunk with
         * `window.transformersEmbedder` and packages the result.
         *
         * @param {number} nClicks - Click count of the trigger; falsy means "not clicked".
         * @param {string} textContent - Raw text to embed.
         * @param {string} [modelName] - Model to use; null/undefined selects the
         *     embedder's default model.
         * @param {string} [tokenizationMethod] - See splitIntoChunks().
         * @param {string} [category] - Document category, default "Text Input".
         * @param {string} [subcategory] - Document subcategory, default "Generated".
         * @returns {Promise<Array|*>} `window.dash_clientside.no_update` when there
         *     is nothing to do, otherwise `[data, statusMessage, color, false]`
         *     where `data` is `{ documents, embeddings }` on success or
         *     `{ error }` on failure. NOTE(review): the meaning of the fourth
         *     output (always false) is defined by the Dash callback registration,
         *     which is not visible in this file — confirm there.
         */
        async generateEmbeddings(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
            console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });

            // The type check keeps a non-string value from throwing outside the try block.
            if (!nClicks || typeof textContent !== 'string' || textContent.trim().length === 0) {
                console.log('⚠️ Early return - missing required parameters');
                return window.dash_clientside.no_update;
            }

            try {
                // Initialize model if needed. null is mapped to undefined so the
                // embedder's default parameter applies.
                const initResult = await window.transformersEmbedder.initializeModel(modelName ?? undefined);
                if (!initResult.success) {
                    return failure(initResult.error, `❌ Model loading error: ${initResult.error}`);
                }

                // Tokenize text based on method.
                const textChunks = splitIntoChunks(textContent.trim(), tokenizationMethod);
                if (textChunks.length === 0) {
                    return failure(
                        'No valid text chunks found after tokenization',
                        '❌ Error: No valid text chunks found after tokenization',
                    );
                }

                // Generate embeddings.
                const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);
                if (!embeddings || embeddings.length !== textChunks.length) {
                    return failure(
                        'Embedding generation failed - mismatch in text chunks and embeddings',
                        '❌ Error: Embedding generation failed',
                    );
                }

                // Create documents structure. One timestamp is shared by the whole
                // batch so ids differ only by chunk index.
                const batchStamp = Date.now();
                const documents = textChunks.map((text, i) => ({
                    id: `text_input_${batchStamp}_${i}`,
                    text,
                    embedding: embeddings[i],
                    category: category || 'Text Input',
                    subcategory: subcategory || 'Generated',
                    tags: [],
                }));

                // Report the model that was actually loaded, not the raw argument
                // (which may be null/undefined when the default was used).
                const usedModel = initResult.model ?? modelName;

                return [
                    { documents, embeddings },
                    `✅ Generated embeddings for ${documents.length} text chunks using ${usedModel}`,
                    'success',
                    false,
                ];
            } catch (error) {
                console.error('Client-side embedding error:', error);
                const message = error?.message ?? String(error);
                return failure(message, `❌ Error: ${message}`);
            }
        },
    };
})();
// Final diagnostics: log which of the globals this file sets up are present,
// to make load-order problems visible in the browser console.
console.log('✅ Transformers.js client-side setup complete');
console.log('Available:', {
    transformersEmbedder: Boolean(window.transformersEmbedder),
    dashClientside: Boolean(window.dash_clientside),
    transformersModule: Boolean(window.dash_clientside?.transformers),
    generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings,
});