Merge pull request 'add in browser embedding generation' (#4) from add-browser-embeddings into main

All checks were successful
Reviewed-on: godber/embedding-buddy#4
@@ -3,7 +3,8 @@
    "allow": [
      "Bash(mkdir:*)",
      "Bash(uv run:*)",
      "Bash(uv add:*)"
      "Bash(uv add:*)",
      "Bash(uv sync:*)"
    ],
    "deny": [],
    "ask": [],

.gitignore | 5 (vendored)
@@ -81,4 +81,7 @@ safety-report.json
pip-audit-report.json

# Temporary files
*.tmp

examples/extra
assets/embeddings.js | 278 (new file)
@@ -0,0 +1,278 @@
// Text input embedding generation using Transformers.js
// This module runs entirely in the browser for privacy and performance

// Global flags to track initialization
window.transformersLoading = false;
window.transformersLoaded = false;

class TransformersEmbedder {
    constructor() {
        this.extractor = null;
        this.currentModel = null;
        this.modelCache = new Map();
        this.isLoading = false;
    }

    async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') {
        try {
            if (this.modelCache.has(modelName)) {
                this.extractor = this.modelCache.get(modelName);
                this.currentModel = modelName;
                return { success: true, model: modelName };
            }

            if (this.isLoading) {
                return { success: false, error: 'Model loading already in progress' };
            }

            this.isLoading = true;

            // Use globally loaded Transformers.js pipeline
            if (!window.transformers) {
                if (!window.transformersPipeline) {
                    // Wait for the pipeline to load
                    let attempts = 0;
                    while (!window.transformersPipeline && attempts < 50) { // Wait up to 5 seconds
                        await new Promise(resolve => setTimeout(resolve, 100));
                        attempts++;
                    }
                    if (!window.transformersPipeline) {
                        throw new Error('Transformers.js pipeline not available. Please refresh the page.');
                    }
                }
                window.transformers = { pipeline: window.transformersPipeline };
                window.transformersLoaded = true;
                console.log('✅ Using globally loaded Transformers.js pipeline');
            }

            // Show loading progress to user
            if (window.updateModelLoadingProgress) {
                window.updateModelLoadingProgress(0, `Loading ${modelName}...`);
            }

            this.extractor = await window.transformers.pipeline('feature-extraction', modelName, {
                progress_callback: (data) => {
                    if (window.updateModelLoadingProgress && data.progress !== undefined) {
                        const progress = Math.round(data.progress);
                        window.updateModelLoadingProgress(progress, data.status || 'Loading...');
                    }
                }
            });

            this.modelCache.set(modelName, this.extractor);
            this.currentModel = modelName;
            this.isLoading = false;

            if (window.updateModelLoadingProgress) {
                window.updateModelLoadingProgress(100, 'Model loaded successfully');
            }

            return { success: true, model: modelName };
        } catch (error) {
            this.isLoading = false;
            console.error('Model initialization error:', error);
            return { success: false, error: error.message };
        }
    }

    async generateEmbeddings(texts, options = {}) {
        if (!this.extractor) {
            throw new Error('Model not initialized. Call initializeModel() first.');
        }

        if (!texts || texts.length === 0) {
            throw new Error('No texts provided for embedding generation.');
        }

        const embeddings = [];
        const defaultOptions = {
            pooling: 'mean',
            normalize: true,
            ...options
        };

        // Process in batches to avoid memory issues
        const batchSize = options.batchSize || 8;

        try {
            for (let i = 0; i < texts.length; i += batchSize) {
                const batch = texts.slice(i, i + batchSize);

                const batchResults = await Promise.all(
                    batch.map(text => {
                        if (!text || text.trim().length === 0) {
                            throw new Error('Empty text found in batch');
                        }
                        return this.extractor(text.trim(), defaultOptions);
                    })
                );

                // Convert tensor output to arrays
                batchResults.forEach((result, idx) => {
                    if (result && result.data) {
                        embeddings.push(Array.from(result.data));
                    } else {
                        throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
                    }
                });

                // Update progress
                const progress = Math.min(100, ((i + batch.length) / texts.length) * 100);
                if (window.updateEmbeddingProgress) {
                    window.updateEmbeddingProgress(progress, `Processing ${i + batch.length}/${texts.length} texts`);
                }
            }

            if (window.updateEmbeddingProgress) {
                window.updateEmbeddingProgress(100, `Generated ${embeddings.length} embeddings successfully`);
            }

            return embeddings;
        } catch (error) {
            console.error('Embedding generation error:', error);
            throw error;
        }
    }
}

// Global instance
window.transformersEmbedder = new TransformersEmbedder();
console.log('📦 TransformersEmbedder instance created');

// Global progress update functions
window.updateModelLoadingProgress = function(progress, status) {
    const progressBar = document.getElementById('model-loading-progress');
    const statusText = document.getElementById('model-loading-status');
    if (progressBar) {
        progressBar.style.width = progress + '%';
        progressBar.setAttribute('aria-valuenow', progress);
    }
    if (statusText) {
        statusText.textContent = status;
    }
};

window.updateEmbeddingProgress = function(progress, status) {
    const progressBar = document.getElementById('embedding-progress');
    const statusText = document.getElementById('embedding-status');
    if (progressBar) {
        progressBar.style.width = progress + '%';
        progressBar.setAttribute('aria-valuenow', progress);
    }
    if (statusText) {
        statusText.textContent = status;
    }
};

// Dash clientside callback functions
window.dash_clientside = window.dash_clientside || {};
console.log('🔧 Setting up window.dash_clientside.transformers');
window.dash_clientside.transformers = {
    generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
        console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });

        if (!nClicks || !textContent || textContent.trim().length === 0) {
            console.log('⚠️ Early return - missing required parameters');
            return window.dash_clientside.no_update;
        }

        try {
            // Initialize model if needed
            const initResult = await window.transformersEmbedder.initializeModel(modelName);
            if (!initResult.success) {
                return [
                    { error: initResult.error },
                    `❌ Model loading error: ${initResult.error}`,
                    "danger",
                    false
                ];
            }

            // Tokenize text based on method
            let textChunks;
            const trimmedText = textContent.trim();

            switch (tokenizationMethod) {
                case 'sentence':
                    // Simple sentence splitting - can be enhanced with proper NLP
                    textChunks = trimmedText
                        .split(/[.!?]+/)
                        .map(s => s.trim())
                        .filter(s => s.length > 0);
                    break;
                case 'paragraph':
                    textChunks = trimmedText
                        .split(/\n\s*\n/)
                        .map(s => s.trim())
                        .filter(s => s.length > 0);
                    break;
                case 'manual':
                    textChunks = trimmedText
                        .split('\n')
                        .map(s => s.trim())
                        .filter(s => s.length > 0);
                    break;
                default:
                    textChunks = [trimmedText];
            }

            if (textChunks.length === 0) {
                return [
                    { error: 'No valid text chunks found after tokenization' },
                    '❌ Error: No valid text chunks found after tokenization',
                    "danger",
                    false
                ];
            }

            // Generate embeddings
            const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);

            if (!embeddings || embeddings.length !== textChunks.length) {
                return [
                    { error: 'Embedding generation failed - mismatch in text chunks and embeddings' },
                    '❌ Error: Embedding generation failed',
                    "danger",
                    false
                ];
            }

            // Create documents structure
            const documents = textChunks.map((text, i) => ({
                id: `text_input_${Date.now()}_${i}`,
                text: text,
                embedding: embeddings[i],
                category: category || "Text Input",
                subcategory: subcategory || "Generated",
                tags: []
            }));

            return [
                {
                    documents: documents,
                    embeddings: embeddings
                },
                `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`,
                "success",
                false
            ];

        } catch (error) {
            console.error('Client-side embedding error:', error);
            return [
                { error: error.message },
                `❌ Error: ${error.message}`,
                "danger",
                false
            ];
        }
    }
};

console.log('✅ Transformers.js client-side setup complete');
console.log('Available:', {
    transformersEmbedder: !!window.transformersEmbedder,
    dashClientside: !!window.dash_clientside,
    transformersModule: !!window.dash_clientside?.transformers,
    generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings
});
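For orientation (not part of the diff): the object in the first slot of the array returned by generateEmbeddings above is written into the embeddings-generated-trigger store and arrives on the Python side as a plain dict. A minimal sketch of that shape, with made-up placeholder values:

# Sketch of the client-to-server payload produced by generateEmbeddings.
# Values are placeholders; real embeddings have 384 or 768 floats per text.
payload = {
    "documents": [
        {
            "id": "text_input_1700000000000_0",
            "text": "The sun peeked through the clouds.",
            "embedding": [0.01, -0.02, 0.03],
            "category": "Text Input",
            "subcategory": "Generated",
            "tags": [],
        }
    ],
    "embeddings": [[0.01, -0.02, 0.03]],
}

# The top-level "embeddings" list is parallel to "documents", and each
# document also carries its own embedding.
assert len(payload["documents"]) == len(payload["embeddings"])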
assets/package.json | 9 (new file)
@@ -0,0 +1,9 @@
{
  "name": "embeddingbuddy-assets",
  "version": "1.0.0",
  "description": "JavaScript dependencies for EmbeddingBuddy text input functionality",
  "dependencies": {
    "@huggingface/transformers": "^3.0.0"
  },
  "type": "module"
}
assets/sample-txt.md | 106 (new file)
@@ -0,0 +1,106 @@
The sun peeked through the clouds after a drizzly morning.
A gentle breeze rustled the leaves as we walked along the shoreline.
Heavy rains caused flooding in several low-lying neighborhoods.
It was so hot that even the birds sought shade under the palm trees.
By midnight, the temperature had dropped below freezing.
Thunderstorms lit up the sky with flashes of lightning.
A thick fog settled over the city streets at dawn.
The air smelled of ozone after the sudden hailstorm.
I watched the snowflakes drift silently onto the ground.
A double rainbow appeared after the rain shower.
The humidity soared to uncomfortable levels by midday.
Dust devils formed in the dry desert plains.
The barometer readings indicated an approaching front.
A sudden gust of wind knocked over the garden chairs.
Light drizzle turned into a torrential downpour within minutes.
The new smartphone features a foldable display and 5G connectivity.
In the world of AI, transformers have revolutionized natural language processing.
Quantum computing promises to solve problems beyond classical computers' reach.
Blockchain technology is being explored for secure voting systems.
Virtual reality headsets are becoming more affordable and accessible.
The rise of electric vehicles is reshaping the automotive industry.
Cloud computing allows businesses to scale resources dynamically.
Machine learning algorithms can now predict stock market trends with surprising accuracy.
Augmented reality applications are transforming retail experiences.
The Internet of Things connects everyday devices to the web for smarter living.
Cybersecurity threats are evolving, requiring constant vigilance.
3D printing is enabling rapid prototyping and custom manufacturing.
Edge computing reduces latency by processing data closer to the source.
Biometric authentication methods are enhancing security in devices.
Wearable technology is tracking health metrics in real-time.
Artificial intelligence is being used to create realistic deepfakes.
Preheat the oven to 375°F before you start mixing the batter.
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
A pinch of saffron adds a beautiful color and aroma to traditional paella.
If the soup is too salty, add a peeled potato to absorb excess sodium.
Let the bread dough rise for at least an hour in a warm, draft-free spot.
Marinate the chicken overnight in a blend of citrus and spices.
Use a cast-iron skillet to sear the steak on high heat.
Whisk the egg whites until they form stiff peaks.
Fold in the chocolate chips gently to keep the batter airy.
Brush the pastry with an egg wash for a golden finish.
Slow-roast the pork shoulder until it falls off the bone.
Garnish the salad with toasted nuts and fresh herbs.
Deglaze the pan with white wine for a rich sauce.
Simmer the curry paste until the aroma intensifies.
Let the risotto rest before serving to thicken slightly.
He dribbled past two defenders and sank a three-pointer at the buzzer.
The marathon runner kept a steady pace despite the sweltering heat.
Their home team clinched the championship with a last-minute goal.
NASCAR fans cheered as the cars roared around the oval track.
She landed a perfect triple axel at the figure skating championship.
The cyclist pedaled up the steep hill in record time.
He pitched a no-hitter during the high school baseball game.
The quarterback threw a touchdown pass under heavy pressure.
They scored a hat-trick in the hockey final.
The boxer delivered a swift uppercut in the final round.
Surfers caught massive waves at dawn on the Pacific coast.
Fans erupted when the underdog scored the winning goal.
The swimmer broke the national record in the 200m freestyle.
The gymnast executed a flawless routine on the balance beam.
The rugby team celebrated their victory with a traditional haka.
The stock market rallied after positive earnings reports.
Investors are closely watching interest rate changes by the Federal Reserve.
Cryptocurrency prices have been extremely volatile this year.
Diversification is key to managing investment risk effectively.
Inflation rates have reached a 40-year high, impacting consumer spending.
Many companies are adopting ESG criteria to attract socially conscious investors.
The bond market is reacting to geopolitical tensions and supply chain disruptions.
Venture capital funding for startups has surged in the tech sector.
Exchange-traded funds (ETFs) offer a way to invest in diversified portfolios.
The global economy is recovering from the pandemic, but challenges remain.
Central banks are exploring digital currencies to modernize payment systems.
Retail investors are increasingly participating in the stock market through apps.
Hedge funds are using complex algorithms to gain an edge in trading.
Real estate prices have skyrocketed in urban areas due to low inventory.
The startup raised $10 million in its Series A funding round.
The symphony orchestra played a hauntingly beautiful melody.
She strummed her guitar softly, filling the room with a warm sound.
The DJ mixed tracks seamlessly, keeping the crowd dancing all night.
His voice soared during the high notes of the ballad.
The band played an acoustic set in the intimate coffee shop.
Jazz musicians often improvise solos based on the chord changes.
The opera singer hit the high C with perfect pitch.
The choir harmonized beautifully, filling the church with sound.
He composed a symphony that was performed at the concert hall.
The singer-songwriter wrote heartfelt lyrics about love and loss.
The rock band headlined the festival, drawing a massive crowd.
Hip-hop artists use rhythm and rhyme to tell powerful stories.
The violinist played a virtuosic solo that left the audience in awe.
Folk music often reflects the culture and traditions of a community.
The gospel choir lifted spirits with their uplifting performance.
The fall of the Berlin Wall in 1989 marked the end of the Cold War.
Ancient Egypt's pyramids are a testament to their architectural prowess.
Europe's Renaissance period sparked a revival in art and science.
The signing of the Declaration of Independence in 1776 established the United States.
The Industrial Revolution transformed economies and societies worldwide.
Rome was the center of a vast empire that influenced law and governance.
The discovery of the New World by Christopher Columbus in 1492 changed global trade.
The French Revolution in 1789 led to significant political and social change.
World War II was a global conflict that reshaped international relations.
The fall of the Roman Empire in 476 AD marked the beginning of the Middle Ages.
The invention of the printing press revolutionized the spread of knowledge.
The Cold War was characterized by political tension between the U.S. and the Soviet Union.
The ancient Silk Road connected East and West through trade routes.
The signing of the Magna Carta in 1215 established principles of due process.
Exploration during the Age of Discovery expanded European empires across the globe.
assets/transformers-loader.js | 172 (new file)
@@ -0,0 +1,172 @@
// Simple script to load Transformers.js from CDN and initialize embedding functionality
// This approach uses traditional script loading instead of ES6 modules

console.log('🔧 Transformers.js loader starting...');

// Global state
window.transformersLibraryLoaded = false;
window.transformersLibraryLoading = false;

// Function to dynamically load a script
function loadScript(src) {
    return new Promise((resolve, reject) => {
        const script = document.createElement('script');
        script.src = src;
        script.type = 'module';
        script.onload = () => resolve();
        script.onerror = () => reject(new Error(`Failed to load script: ${src}`));
        document.head.appendChild(script);
    });
}

// Function to initialize Transformers.js
async function initializeTransformers() {
    if (window.transformersLibraryLoaded) {
        console.log('✅ Transformers.js already loaded');
        return true;
    }

    if (window.transformersLibraryLoading) {
        console.log('⏳ Transformers.js already loading, waiting...');
        // Wait for loading to complete
        while (window.transformersLibraryLoading) {
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        return window.transformersLibraryLoaded;
    }

    window.transformersLibraryLoading = true;

    try {
        console.log('📦 Loading Transformers.js from CDN...');

        // Use dynamic import since this is more reliable with ES modules
        const transformers = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0');
        window.transformersLibrary = transformers;
        window.transformersLibraryLoaded = true;

        console.log('✅ Transformers.js loaded successfully');
        return true;
    } catch (error) {
        console.error('❌ Failed to load Transformers.js:', error);
        return false;
    } finally {
        window.transformersLibraryLoading = false;
    }
}

// Simple embeddings class
class SimpleEmbedder {
    constructor() {
        this.pipeline = null;
        this.modelCache = new Map();
    }

    async generateEmbeddings(texts, modelName = 'Xenova/all-MiniLM-L6-v2') {
        console.log('🔄 Generating embeddings for', texts.length, 'texts with model', modelName);

        // Ensure Transformers.js is loaded
        if (!window.transformersLibraryLoaded) {
            const loaded = await initializeTransformers();
            if (!loaded) {
                throw new Error('Failed to load Transformers.js');
            }
        }

        // Create pipeline if not cached
        if (!this.modelCache.has(modelName)) {
            console.log('🏗️ Creating pipeline for', modelName);
            const { pipeline } = window.transformersLibrary;
            this.pipeline = await pipeline('feature-extraction', modelName);
            this.modelCache.set(modelName, this.pipeline);
        } else {
            this.pipeline = this.modelCache.get(modelName);
        }

        // Generate embeddings
        const embeddings = [];
        for (let i = 0; i < texts.length; i++) {
            console.log(`Processing text ${i + 1}/${texts.length}...`);
            const result = await this.pipeline(texts[i], { pooling: 'mean', normalize: true });
            embeddings.push(Array.from(result.data));
        }

        console.log('✅ Generated', embeddings.length, 'embeddings');
        return embeddings;
    }
}

// Create global instance
window.simpleEmbedder = new SimpleEmbedder();

// Set up Dash clientside callbacks
window.dash_clientside = window.dash_clientside || {};
window.dash_clientside.transformers = {
    generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
        console.log('🚀 Client-side generateEmbeddings called');

        if (!nClicks || !textContent || textContent.trim().length === 0) {
            console.log('⚠️ Missing required parameters');
            return window.dash_clientside.no_update;
        }

        try {
            // Tokenize text
            let textChunks;
            const trimmedText = textContent.trim();

            switch (tokenizationMethod) {
                case 'sentence':
                    textChunks = trimmedText.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 0);
                    break;
                case 'paragraph':
                    textChunks = trimmedText.split(/\n\s*\n/).map(s => s.trim()).filter(s => s.length > 0);
                    break;
                case 'manual':
                    textChunks = trimmedText.split('\n').map(s => s.trim()).filter(s => s.length > 0);
                    break;
                default:
                    textChunks = [trimmedText];
            }

            if (textChunks.length === 0) {
                throw new Error('No valid text chunks after tokenization');
            }

            // Generate embeddings
            const embeddings = await window.simpleEmbedder.generateEmbeddings(textChunks, modelName);

            // Create documents
            const documents = textChunks.map((text, i) => ({
                id: `text_input_${Date.now()}_${i}`,
                text: text,
                embedding: embeddings[i],
                category: category || "Text Input",
                subcategory: subcategory || "Generated",
                tags: []
            }));

            return [
                {
                    documents: documents,
                    embeddings: embeddings
                },
                `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`,
                "success",
                false
            ];

        } catch (error) {
            console.error('❌ Error generating embeddings:', error);
            return [
                { error: error.message },
                `❌ Error: ${error.message}`,
                "danger",
                false
            ];
        }
    }
};

console.log('✅ Simple Transformers.js setup complete');
console.log('Available functions:', Object.keys(window.dash_clientside.transformers));
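Both JavaScript files implement the same sentence/paragraph/manual splitting before embedding. A rough Python mirror of those splitting rules, a sketch that could back server-side tests and is not part of this PR, might look like:

import re

def split_text(text: str, method: str) -> list:
    """Rough Python mirror (hypothetical helper) of the client-side tokenization switch."""
    text = text.strip()
    if method == "sentence":
        parts = re.split(r"[.!?]+", text)
    elif method == "paragraph":
        parts = re.split(r"\n\s*\n", text)
    elif method == "manual":
        parts = text.split("\n")
    else:
        parts = [text]
    # Trim and drop empty chunks, matching the JS .map/.filter chain.
    return [p.strip() for p in parts if p.strip()]

# Example: three sentences become three chunks, matching the JS behaviour.
assert len(split_text("One. Two! Three?", "sentence")) == 3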
@@ -8,7 +8,15 @@ from .ui.callbacks.interactions import InteractionCallbacks

def create_app():
    app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
    import os

    # Get the project root directory (two levels up from this file)
    project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    assets_path = os.path.join(project_root, "assets")

    app = dash.Dash(
        __name__, external_stylesheets=[dbc.themes.BOOTSTRAP], assets_folder=assets_path
    )

    # Allow callbacks to components that are dynamically created in tabs
    app.config.suppress_callback_exceptions = True
@@ -20,9 +28,78 @@ def create_app():
    VisualizationCallbacks()
    InteractionCallbacks()

    # Register client-side callback for embedding generation
    _register_client_side_callbacks(app)

    return app


def _register_client_side_callbacks(app):
    """Register client-side callbacks for browser-based processing."""
    from dash import Input, Output, State

    # Client-side callback for embedding generation
    app.clientside_callback(
        """
        function(nClicks, textContent, modelName, tokenizationMethod, batchSize, category, subcategory) {
            if (!nClicks || !textContent || !textContent.trim()) {
                return window.dash_clientside.no_update;
            }

            console.log('🔍 Checking for Transformers.js...');
            console.log('window.dash_clientside:', typeof window.dash_clientside);
            console.log('window.dash_clientside.transformers:', typeof window.dash_clientside?.transformers);
            console.log('generateEmbeddings function:', typeof window.dash_clientside?.transformers?.generateEmbeddings);

            if (typeof window.dash_clientside !== 'undefined' &&
                typeof window.dash_clientside.transformers !== 'undefined' &&
                typeof window.dash_clientside.transformers.generateEmbeddings === 'function') {

                console.log('✅ Calling Transformers.js generateEmbeddings...');
                return window.dash_clientside.transformers.generateEmbeddings(
                    nClicks, textContent, modelName, tokenizationMethod, category, subcategory
                );
            }

            // More detailed error information
            let errorMsg = '❌ Transformers.js not available. ';
            if (typeof window.dash_clientside === 'undefined') {
                errorMsg += 'dash_clientside not found.';
            } else if (typeof window.dash_clientside.transformers === 'undefined') {
                errorMsg += 'transformers module not found.';
            } else if (typeof window.dash_clientside.transformers.generateEmbeddings !== 'function') {
                errorMsg += 'generateEmbeddings function not found.';
            }

            console.error(errorMsg);

            return [
                { error: 'Transformers.js not loaded. Please refresh the page and try again.' },
                errorMsg + ' Please refresh the page.',
                'danger',
                false
            ];
        }
        """,
        [
            Output("embeddings-generated-trigger", "data"),
            Output("text-input-status-immediate", "children"),
            Output("text-input-status-immediate", "color"),
            Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
        ],
        [Input("generate-embeddings-btn", "n_clicks")],
        [
            State("text-input-area", "value"),
            State("model-selection", "value"),
            State("tokenization-method", "value"),
            State("batch-size", "value"),
            State("text-category", "value"),
            State("text-subcategory", "value"),
        ],
        prevent_initial_call=True,
    )


def run_app(app=None, debug=None, host=None, port=None):
    if app is None:
        app = create_app()
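Not part of the diff: a minimal sketch of how the factory and run_app above are typically wired into an entry point. The import path is an assumption based on the package name; host and port fall back to whatever defaults run_app defines.

# Hypothetical entry point illustrating create_app() and run_app() together.
from embeddingbuddy.app import create_app, run_app  # assumed import path

if __name__ == "__main__":
    app = create_app()        # also registers the clientside embedding callback
    run_app(app, debug=True)  # host/port left to run_app's defaults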
@@ -79,6 +79,71 @@ class AppSettings:
    OPENSEARCH_CONNECTION_TIMEOUT = 30
    OPENSEARCH_VERIFY_CERTS = True

    # Text Input / Transformers.js Configuration
    DEFAULT_EMBEDDING_MODEL = "Xenova/all-mpnet-base-v2"
    MAX_TEXT_LENGTH = 50000  # Characters (browser memory limits)
    DEFAULT_TOKENIZATION_METHOD = "sentence"
    MAX_BATCH_SIZE = 8  # Process in smaller batches for memory management

    # Available Transformers.js compatible models
    AVAILABLE_MODELS = [
        {
            "name": "Xenova/all-mpnet-base-v2",
            "label": "All-MPNet-Base-v2 (Quality, 768d)",
            "description": "Higher quality embeddings with better semantic understanding",
            "dimensions": 768,
            "size": "109 MB",
            "context_length": 512,
            "multilingual": False,
            "default": True,
        },
        {
            "name": "Xenova/all-MiniLM-L6-v2",
            "label": "All-MiniLM-L6-v2 (Fast, 384d)",
            "description": "Lightweight model, good for quick testing and general purpose",
            "dimensions": 384,
            "size": "23 MB",
            "context_length": 512,
            "multilingual": False,
            "default": False,
        },
        {
            "name": "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
            "label": "Multilingual MiniLM (50+ languages)",
            "description": "Support for multiple languages with good performance",
            "dimensions": 384,
            "size": "127 MB",
            "context_length": 512,
            "multilingual": True,
        },
        {
            "name": "Xenova/bge-small-en-v1.5",
            "label": "BGE Small English (High quality, 384d)",
            "description": "Beijing Academy of AI model with excellent performance on retrieval tasks",
            "dimensions": 384,
            "size": "67 MB",
            "context_length": 512,
            "multilingual": False,
        },
        {
            "name": "Xenova/gte-small",
            "label": "GTE Small (General Text Embeddings, 384d)",
            "description": "Alibaba's general text embedding model, balanced performance",
            "dimensions": 384,
            "size": "67 MB",
            "context_length": 512,
            "multilingual": False,
        },
    ]

    # Browser compatibility requirements
    SUPPORTED_BROWSERS = {
        "chrome": ">=88",
        "firefox": ">=92",
        "safari": ">=15.4",
        "edge": ">=88",
    }

    # Bootstrap Theme
    EXTERNAL_STYLESHEETS = [
        "https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
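A sketch (assumption, not part of this hunk) of how AVAILABLE_MODELS could feed the "model-selection" dropdown used by the text input tab, with the default model picked from the "default" flag and falling back to DEFAULT_EMBEDDING_MODEL. The import path is assumed from the package layout.

# Hypothetical wiring of the model catalog into a Dash dropdown.
from dash import dcc
from embeddingbuddy.config.settings import AppSettings  # assumed import path

settings = AppSettings()
options = [
    {"label": m["label"], "value": m["name"]} for m in settings.AVAILABLE_MODELS
]
default_model = next(
    (m["name"] for m in settings.AVAILABLE_MODELS if m.get("default")),
    settings.DEFAULT_EMBEDDING_MODEL,
)
dropdown = dcc.Dropdown(id="model-selection", options=options, value=default_model)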
@@ -63,6 +63,90 @@ class DataProcessor:
        except Exception as e:
            return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))

    def process_client_embeddings(self, embeddings_data: dict) -> ProcessedData:
        """Process embeddings data received from client-side JavaScript."""
        try:
            if "error" in embeddings_data:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error=embeddings_data["error"],
                )

            # Extract documents and embeddings from client data
            documents_data = embeddings_data.get("documents", [])
            embeddings_list = embeddings_data.get("embeddings", [])

            if not documents_data or not embeddings_list:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="No documents or embeddings in client data",
                )

            if len(documents_data) != len(embeddings_list):
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="Mismatch between number of documents and embeddings",
                )

            # Convert embeddings to numpy array first
            try:
                embeddings = np.array(embeddings_list)

                if embeddings.ndim != 2:
                    return ProcessedData(
                        documents=[],
                        embeddings=np.array([]),
                        error="Invalid embedding dimensions",
                    )

            except Exception as e:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error=f"Error processing embeddings: {str(e)}",
                )

            # Convert to Document objects with embeddings
            documents = []
            for i, doc_data in enumerate(documents_data):
                try:
                    # Skip if we don't have a corresponding embedding
                    if i >= len(embeddings):
                        continue

                    # Ensure required fields are present
                    if "id" not in doc_data or not doc_data["id"]:
                        doc_data["id"] = f"text_input_{i}"
                    if "text" not in doc_data or not doc_data["text"].strip():
                        continue  # Skip documents without text

                    # Add the embedding to doc_data
                    doc_data["embedding"] = embeddings[i].tolist()

                    doc = Document(**doc_data)
                    documents.append(doc)
                except Exception:
                    # Skip invalid documents but continue processing
                    continue

            if not documents:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="No valid documents found in client data",
                )

            # Only keep embeddings for valid documents
            valid_embeddings = embeddings[: len(documents)]

            return ProcessedData(documents=documents, embeddings=valid_embeddings)

        except Exception as e:
            return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))

    def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
        if not documents:
            return np.array([])
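A quick usage sketch of the new method, not part of the diff: the import path is assumed from the package layout, and the payload mirrors the contract produced by the client-side callback (one document per embedding, with category/subcategory/tags filled in).

# Sketch of exercising process_client_embeddings with a tiny client payload.
from embeddingbuddy.data.processor import DataProcessor  # assumed import path

processor = DataProcessor()
payload = {
    "documents": [
        {
            "id": "text_input_0",
            "text": "hello world",
            "category": "Text Input",
            "subcategory": "Generated",
            "tags": [],
        }
    ],
    "embeddings": [[0.1, 0.2, 0.3]],
}
result = processor.process_client_embeddings(payload)
print(result.error)             # None on success
print(result.embeddings.shape)  # (1, 3): one document, three dimensions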
@@ -1,4 +1,4 @@
 | 
			
		||||
from dash import callback, Input, Output, State, no_update
 | 
			
		||||
from dash import callback, Input, Output, State, no_update, html
 | 
			
		||||
from ...data.processor import DataProcessor
 | 
			
		||||
from ...data.sources.opensearch import OpenSearchClient
 | 
			
		||||
from ...models.field_mapper import FieldMapper
 | 
			
		||||
@@ -87,6 +87,8 @@ class DataProcessingCallbacks:
 | 
			
		||||
 | 
			
		||||
            if active_tab == "opensearch-tab":
 | 
			
		||||
                return [datasource.create_opensearch_tab()]
 | 
			
		||||
            elif active_tab == "text-input-tab":
 | 
			
		||||
                return [datasource.create_text_input_tab()]
 | 
			
		||||
            else:
 | 
			
		||||
                return [datasource.create_file_upload_tab()]
 | 
			
		||||
 | 
			
		||||
@@ -97,6 +99,9 @@ class DataProcessingCallbacks:
 | 
			
		||||
        # Register collapsible section callbacks
 | 
			
		||||
        self._register_collapse_callbacks()
 | 
			
		||||
 | 
			
		||||
        # Register text input callbacks
 | 
			
		||||
        self._register_text_input_callbacks()
 | 
			
		||||
 | 
			
		||||
    def _register_opensearch_callbacks(self, section_type, opensearch_client):
 | 
			
		||||
        """Register callbacks for a specific section (data or prompts)."""
 | 
			
		||||
 | 
			
		||||
@@ -463,6 +468,224 @@ class DataProcessingCallbacks:
 | 
			
		||||
                return new_state, icon_class
 | 
			
		||||
            return is_open, "fas fa-chevron-down me-2"
 | 
			
		||||
 | 
			
		||||
    def _register_text_input_callbacks(self):
 | 
			
		||||
        """Register callbacks for text input functionality."""
 | 
			
		||||
 | 
			
		||||
        # Text length counter callback
 | 
			
		||||
        @callback(
 | 
			
		||||
            Output("text-length-counter", "children"),
 | 
			
		||||
            Input("text-input-area", "value"),
 | 
			
		||||
            prevent_initial_call=False,
 | 
			
		||||
        )
 | 
			
		||||
        def update_text_length_counter(text_value):
 | 
			
		||||
            if not text_value:
 | 
			
		||||
                return "0"
 | 
			
		||||
            return f"{len(text_value):,}"
 | 
			
		||||
 | 
			
		||||
        # Generate button enable/disable callback
 | 
			
		||||
        @callback(
 | 
			
		||||
            [
 | 
			
		||||
                Output("generate-embeddings-btn", "disabled"),
 | 
			
		||||
                Output("generation-help", "children"),
 | 
			
		||||
                Output("generation-help", "color"),
 | 
			
		||||
            ],
 | 
			
		||||
            [
 | 
			
		||||
                Input("text-input-area", "value"),
 | 
			
		||||
                Input("model-selection", "value"),
 | 
			
		||||
            ],
 | 
			
		||||
            prevent_initial_call=False,
 | 
			
		||||
        )
 | 
			
		||||
        def toggle_generate_button(text_value, model_name):
 | 
			
		||||
            import dash_bootstrap_components as dbc
 | 
			
		||||
 | 
			
		||||
            if not text_value or not text_value.strip():
 | 
			
		||||
                return (
 | 
			
		||||
                    True,
 | 
			
		||||
                    dbc.Alert(
 | 
			
		||||
                        [
 | 
			
		||||
                            html.I(className="fas fa-info-circle me-2"),
 | 
			
		||||
                            "Enter some text above to enable embedding generation.",
 | 
			
		||||
                        ],
 | 
			
		||||
                        color="light",
 | 
			
		||||
                    ),
 | 
			
		||||
                    "light",
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            if not model_name:
 | 
			
		||||
                return (
 | 
			
		||||
                    True,
 | 
			
		||||
                    dbc.Alert(
 | 
			
		||||
                        [
 | 
			
		||||
                            html.I(className="fas fa-exclamation-triangle me-2"),
 | 
			
		||||
                            "Select an embedding model to continue.",
 | 
			
		||||
                        ],
 | 
			
		||||
                        color="warning",
 | 
			
		||||
                    ),
 | 
			
		||||
                    "warning",
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            text_length = len(text_value.strip())
 | 
			
		||||
            if text_length > AppSettings.MAX_TEXT_LENGTH:
 | 
			
		||||
                return (
 | 
			
		||||
                    True,
 | 
			
		||||
                    dbc.Alert(
 | 
			
		||||
                        [
 | 
			
		||||
                            html.I(className="fas fa-exclamation-triangle me-2"),
 | 
			
		||||
                            f"Text too long ({text_length:,} characters). Maximum allowed: {AppSettings.MAX_TEXT_LENGTH:,} characters.",
 | 
			
		||||
                        ],
 | 
			
		||||
                        color="danger",
 | 
			
		||||
                    ),
 | 
			
		||||
                    "danger",
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            return (
 | 
			
		||||
                False,
 | 
			
		||||
                dbc.Alert(
 | 
			
		||||
                    [
 | 
			
		||||
                        html.I(className="fas fa-check-circle me-2"),
 | 
			
		||||
                        f"Ready to generate embeddings for {text_length:,} characters using {model_name}.",
 | 
			
		||||
                    ],
 | 
			
		||||
                    color="success",
 | 
			
		||||
                ),
 | 
			
		||||
                "success",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        # Clear text callback
 | 
			
		||||
        @callback(
 | 
			
		||||
            Output("text-input-area", "value"),
 | 
			
		||||
            [Input("clear-text-btn", "n_clicks"), Input("load-sample-btn", "n_clicks")],
 | 
			
		||||
            prevent_initial_call=True,
 | 
			
		||||
        )
 | 
			
		||||
        def handle_text_input_actions(clear_clicks, load_clicks):
 | 
			
		||||
            from dash import ctx
 | 
			
		||||
 | 
			
		||||
            if not ctx.triggered:
 | 
			
		||||
                return no_update
 | 
			
		||||
 | 
			
		||||
            button_id = ctx.triggered[0]["prop_id"].split(".")[0]
 | 
			
		||||
 | 
			
		||||
            if button_id == "clear-text-btn" and clear_clicks:
 | 
			
		||||
                return ""
 | 
			
		||||
            elif button_id == "load-sample-btn" and load_clicks:
 | 
			
		||||
                return self._load_sample_text()
 | 
			
		||||
 | 
			
		||||
            return no_update
 | 
			
		||||
 | 
			
		||||
        # Model info callback
 | 
			
		||||
        @callback(
 | 
			
		||||
            Output("model-info", "children"),
 | 
			
		||||
            Input("model-selection", "value"),
 | 
			
		||||
            prevent_initial_call=False,
 | 
			
		||||
        )
 | 
			
		||||
        def update_model_info(model_name):
 | 
			
		||||
            if not model_name:
 | 
			
		||||
                return html.Span("Please select a model", className="text-muted")
 | 
			
		||||
 | 
			
		||||
            from ...config.settings import AppSettings
 | 
			
		||||
 | 
			
		||||
            settings = AppSettings()
 | 
			
		||||
 | 
			
		||||
            for model in settings.AVAILABLE_MODELS:
 | 
			
		||||
                if model["name"] == model_name:
 | 
			
		||||
                    return html.Div(
 | 
			
		||||
                        [
 | 
			
		||||
                            html.Strong(
 | 
			
		||||
                                f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}"
 | 
			
		||||
                            ),
 | 
			
		||||
                            html.Br(),
 | 
			
		||||
                            html.Span(model["description"]),
 | 
			
		||||
                            html.Br(),
 | 
			
		||||
                            html.Small(
 | 
			
		||||
                                f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}",
 | 
			
		||||
                                className="text-muted",
 | 
			
		||||
                            ),
 | 
			
		||||
                        ]
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
            return html.Span("Model information not available", className="text-muted")
 | 
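For reference, `update_model_info` reads a fixed set of keys from each entry in `AppSettings.AVAILABLE_MODELS`. The exact values live in `src/embeddingbuddy/config/settings.py` and are not shown in this diff; the entry below is illustrative only, with assumed values, but the field names match what the callback accesses:

# Illustrative entry; field names match the callback above, values are assumptions.
EXAMPLE_MODEL_ENTRY = {
    "name": "Xenova/all-MiniLM-L6-v2",  # identifier handed to the browser-side loader
    "label": "all-MiniLM-L6-v2",        # display name used in the dropdown
    "dimensions": 384,                   # embedding vector length
    "context_length": 256,               # maximum tokens per chunk
    "description": "Fast general-purpose sentence embeddings.",
    "multilingual": False,
    "size": "~25 MB",
    "default": True,                     # marks the recommended model
}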
			
		||||
 | 
			
		||||
        # Process client-side embeddings result callback
 | 
			
		||||
        @callback(
 | 
			
		||||
            [
 | 
			
		||||
                Output("processed-data", "data", allow_duplicate=True),
 | 
			
		||||
                Output("text-input-status", "children"),
 | 
			
		||||
                Output("text-input-status", "color"),
 | 
			
		||||
                Output("text-input-status", "style"),
 | 
			
		||||
                Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
 | 
			
		||||
            ],
 | 
			
		||||
            [Input("embeddings-generated-trigger", "data")],
 | 
			
		||||
            prevent_initial_call=True,
 | 
			
		||||
        )
 | 
			
		||||
        def process_embeddings_result(embeddings_data):
 | 
			
		||||
            """Process embeddings generated client-side."""
 | 
			
		||||
            if not embeddings_data:
 | 
			
		||||
                return no_update, no_update, no_update, no_update, no_update
 | 
			
		||||
 | 
			
		||||
            processed_data = self.processor.process_client_embeddings(embeddings_data)
 | 
			
		||||
 | 
			
		||||
            if processed_data.error:
 | 
			
		||||
                return (
 | 
			
		||||
                    {"error": processed_data.error},
 | 
			
		||||
                    f"❌ Error: {processed_data.error}",
 | 
			
		||||
                    "danger",
 | 
			
		||||
                    {"display": "block"},
 | 
			
		||||
                    False,
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            return (
 | 
			
		||||
                {
 | 
			
		||||
                    "documents": [
 | 
			
		||||
                        self._document_to_dict(doc) for doc in processed_data.documents
 | 
			
		||||
                    ],
 | 
			
		||||
                    "embeddings": processed_data.embeddings.tolist(),
 | 
			
		||||
                },
 | 
			
		||||
                f"✅ Generated embeddings for {len(processed_data.documents)} text chunks",
 | 
			
		||||
                "success",
 | 
			
		||||
                {"display": "block"},
 | 
			
		||||
                False,
 | 
			
		||||
            )
 | 
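`process_embeddings_result` hands the store payload directly to `DataProcessor.process_client_embeddings`, which expects parallel `documents` and `embeddings` lists (or an `error` key on failure). A minimal payload of the expected shape, mirroring the fixtures in tests/test_client_embeddings.py further down, looks like:

# Shape of the data delivered via the "embeddings-generated-trigger" store.
embeddings_data = {
    "documents": [
        {
            "id": "text_input_0",  # optional; an ID is generated when missing
            "text": "First test document",
            "category": "Text Input",
            "subcategory": "Generated",
            "tags": [],
        },
    ],
    "embeddings": [[0.1, 0.2, 0.3, 0.4]],  # one vector per document, same order
}
# On failure the payload carries an "error" key instead,
# e.g. {"error": "Transformers.js not loaded"}.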
			
		||||
 | 
			
		||||
    def _load_sample_text(self):
 | 
			
		||||
        """Load sample text from assets/sample-txt.md file."""
 | 
			
		||||
        import os
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            # Resolve the project root by walking up the directory tree from this file
 | 
			
		||||
            current_file = os.path.abspath(__file__)
 | 
			
		||||
            project_root = os.path.dirname(
 | 
			
		||||
                os.path.dirname(
 | 
			
		||||
                    os.path.dirname(os.path.dirname(os.path.dirname(current_file)))
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
            sample_file_path = os.path.join(project_root, "assets", "sample-txt.md")
 | 
			
		||||
 | 
			
		||||
            if os.path.exists(sample_file_path):
 | 
			
		||||
                with open(sample_file_path, "r", encoding="utf-8") as file:
 | 
			
		||||
                    return file.read()
 | 
			
		||||
            else:
 | 
			
		||||
                # Fallback sample text if file doesn't exist
 | 
			
		||||
                return """The sun peeked through the clouds after a drizzly morning.
 | 
			
		||||
A gentle breeze rustled the leaves as we walked along the shoreline.
 | 
			
		||||
Heavy rains caused flooding in several low-lying neighborhoods.
 | 
			
		||||
It was so hot that even the birds sought shade under the palm trees.
 | 
			
		||||
By midnight, the temperature had dropped below freezing.
 | 
			
		||||
 | 
			
		||||
The new smartphone features a foldable display and 5G connectivity.
 | 
			
		||||
In the world of AI, transformers have revolutionized natural language processing.
 | 
			
		||||
Quantum computing promises to solve problems beyond classical computers' reach.
 | 
			
		||||
Blockchain technology is being explored for secure voting systems.
 | 
			
		||||
Virtual reality headsets are becoming more affordable and accessible.
 | 
			
		||||
 | 
			
		||||
Preheat the oven to 375°F before you start mixing the batter.
 | 
			
		||||
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
 | 
			
		||||
A pinch of saffron adds a beautiful color and aroma to traditional paella.
 | 
			
		||||
If the soup is too salty, add a peeled potato to absorb excess sodium.
 | 
			
		||||
Let the bread dough rise for at least an hour in a warm, draft-free spot."""
 | 
			
		||||
 | 
			
		||||
        except Exception:
 | 
			
		||||
            # Return a simple fallback if there's any error
 | 
			
		||||
            return "This is sample text for testing embedding generation. You can replace this with your own text."
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def _document_to_dict(doc):
 | 
			
		||||
        return {
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +1,13 @@
 | 
			
		||||
from dash import dcc, html
 | 
			
		||||
import dash_bootstrap_components as dbc
 | 
			
		||||
from .upload import UploadComponent
 | 
			
		||||
from .textinput import TextInputComponent
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DataSourceComponent:
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        self.upload_component = UploadComponent()
 | 
			
		||||
        self.text_input_component = TextInputComponent()
 | 
			
		||||
 | 
			
		||||
    def create_tabbed_interface(self):
 | 
			
		||||
        """Create tabbed interface for different data sources."""
 | 
			
		||||
@@ -17,6 +19,7 @@ class DataSourceComponent:
 | 
			
		||||
                            [
 | 
			
		||||
                                dbc.Tab(label="File Upload", tab_id="file-tab"),
 | 
			
		||||
                                dbc.Tab(label="OpenSearch", tab_id="opensearch-tab"),
 | 
			
		||||
                                dbc.Tab(label="Text Input", tab_id="text-input-tab"),
 | 
			
		||||
                            ],
 | 
			
		||||
                            id="data-source-tabs",
 | 
			
		||||
                            active_tab="file-tab",
 | 
			
		||||
@@ -208,6 +211,10 @@ class DataSourceComponent:
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def create_text_input_tab(self):
 | 
			
		||||
        """Create text input tab content for browser-based embedding generation."""
 | 
			
		||||
        return html.Div([self.text_input_component.create_text_input_interface()])
 | 
			
		||||
 | 
			
		||||
    def _create_opensearch_section(self, section_type):
 | 
			
		||||
        """Create a complete OpenSearch section for either 'data' or 'prompts'."""
 | 
			
		||||
        section_id = section_type  # 'data' or 'prompts'
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
src/embeddingbuddy/ui/components/textinput.py  402 lines added (Normal file)
@@ -0,0 +1,402 @@
 | 
			
		||||
"""Text input component for generating embeddings from user text."""
 | 
			
		||||
 | 
			
		||||
import dash_bootstrap_components as dbc
 | 
			
		||||
from dash import dcc, html
 | 
			
		||||
 | 
			
		||||
from embeddingbuddy.config.settings import AppSettings
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TextInputComponent:
 | 
			
		||||
    """Component for text input and embedding generation."""
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        self.settings = AppSettings()
 | 
			
		||||
 | 
			
		||||
    def create_text_input_interface(self):
 | 
			
		||||
        """Create the complete text input interface with model selection and processing options."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                # Model selection section
 | 
			
		||||
                self._create_model_selection(),
 | 
			
		||||
                html.Hr(),
 | 
			
		||||
                # Text input section
 | 
			
		||||
                self._create_text_input_area(),
 | 
			
		||||
                # Text action buttons
 | 
			
		||||
                self._create_text_action_buttons(),
 | 
			
		||||
                html.Hr(),
 | 
			
		||||
                # Processing options
 | 
			
		||||
                self._create_processing_options(),
 | 
			
		||||
                html.Hr(),
 | 
			
		||||
                # Generation controls
 | 
			
		||||
                self._create_generation_controls(),
 | 
			
		||||
                html.Hr(),
 | 
			
		||||
                # Progress indicators
 | 
			
		||||
                self._create_progress_indicators(),
 | 
			
		||||
                html.Hr(),
 | 
			
		||||
                # Status and results
 | 
			
		||||
                self._create_status_section(),
 | 
			
		||||
                # Hidden components for data flow
 | 
			
		||||
                self._create_hidden_components(),
 | 
			
		||||
            ],
 | 
			
		||||
            className="p-3",
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _create_model_selection(self):
 | 
			
		||||
        """Create model selection dropdown with descriptions."""
 | 
			
		||||
        model_options = []
 | 
			
		||||
        for model in self.settings.AVAILABLE_MODELS:
 | 
			
		||||
            label = f"{model['label']} - {model['size']}"
 | 
			
		||||
            if model.get("default", False):
 | 
			
		||||
                label += " (Recommended)"
 | 
			
		||||
 | 
			
		||||
            model_options.append({"label": label, "value": model["name"]})
 | 
			
		||||
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                html.H5("Embedding Model", className="mb-3"),
 | 
			
		||||
                html.Div(
 | 
			
		||||
                    [
 | 
			
		||||
                        dcc.Dropdown(
 | 
			
		||||
                            id="model-selection",
 | 
			
		||||
                            options=model_options,
 | 
			
		||||
                            value=self.settings.DEFAULT_EMBEDDING_MODEL,
 | 
			
		||||
                            placeholder="Select an embedding model...",
 | 
			
		||||
                            className="mb-2",
 | 
			
		||||
                        ),
 | 
			
		||||
                        dbc.Alert(
 | 
			
		||||
                            [
 | 
			
		||||
                                html.Div(
 | 
			
		||||
                                    id="model-info",
 | 
			
		||||
                                    children=self._get_model_description(
 | 
			
		||||
                                        self.settings.DEFAULT_EMBEDDING_MODEL
 | 
			
		||||
                                    ),
 | 
			
		||||
                                )
 | 
			
		||||
                            ],
 | 
			
		||||
                            color="info",
 | 
			
		||||
                            className="small",
 | 
			
		||||
                        ),
 | 
			
		||||
                    ]
 | 
			
		||||
                ),
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _create_text_input_area(self):
 | 
			
		||||
        """Create text input textarea with character limits."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                html.H5("Text Input", className="mb-3"),
 | 
			
		||||
                dcc.Textarea(
 | 
			
		||||
                    id="text-input-area",
 | 
			
		||||
                    placeholder="Paste your text here... Each sentence, paragraph, or line will become a separate data point depending on your tokenization method below.",
 | 
			
		||||
                    value="",
 | 
			
		||||
                    style={
 | 
			
		||||
                        "width": "100%",
 | 
			
		||||
                        "height": "300px",
 | 
			
		||||
                        "resize": "vertical",
 | 
			
		||||
                        "font-family": "monospace",
 | 
			
		||||
                        "font-size": "14px",
 | 
			
		||||
                    },
 | 
			
		||||
                    maxLength=self.settings.MAX_TEXT_LENGTH,
 | 
			
		||||
                    className="form-control",
 | 
			
		||||
                ),
 | 
			
		||||
                html.Small(
 | 
			
		||||
                    f"Maximum {self.settings.MAX_TEXT_LENGTH:,} characters. Current: ",
 | 
			
		||||
                    className="text-muted",
 | 
			
		||||
                ),
 | 
			
		||||
                html.Small(
 | 
			
		||||
                    id="text-length-counter",
 | 
			
		||||
                    children="0",
 | 
			
		||||
                    className="text-muted fw-bold",
 | 
			
		||||
                ),
 | 
			
		||||
                html.Small(" characters", className="text-muted"),
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _create_text_action_buttons(self):
 | 
			
		||||
        """Create action buttons for text input (Load Sample, Clear)."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                dbc.Row(
 | 
			
		||||
                    [
 | 
			
		||||
                        dbc.Col(
 | 
			
		||||
                            [
 | 
			
		||||
                                dbc.Button(
 | 
			
		||||
                                    [
 | 
			
		||||
                                        html.I(className="fas fa-file-text me-2"),
 | 
			
		||||
                                        "Load Sample Text",
 | 
			
		||||
                                    ],
 | 
			
		||||
                                    id="load-sample-btn",
 | 
			
		||||
                                    color="info",
 | 
			
		||||
                                    size="sm",
 | 
			
		||||
                                    className="w-100",
 | 
			
		||||
                                )
 | 
			
		||||
                            ],
 | 
			
		||||
                            md=6,
 | 
			
		||||
                        ),
 | 
			
		||||
                        dbc.Col(
 | 
			
		||||
                            [
 | 
			
		||||
                                dbc.Button(
 | 
			
		||||
                                    [
 | 
			
		||||
                                        html.I(className="fas fa-trash me-2"),
 | 
			
		||||
                                        "Clear Text",
 | 
			
		||||
                                    ],
 | 
			
		||||
                                    id="clear-text-btn",
 | 
			
		||||
                                    color="outline-secondary",
 | 
			
		||||
                                    size="sm",
 | 
			
		||||
                                    className="w-100",
 | 
			
		||||
                                )
 | 
			
		||||
                            ],
 | 
			
		||||
                            md=6,
 | 
			
		||||
                        ),
 | 
			
		||||
                    ],
 | 
			
		||||
                    className="mt-2 mb-3",
 | 
			
		||||
                )
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _create_processing_options(self):
 | 
			
		||||
        """Create tokenization and metadata options."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                html.H5("Processing Options", className="mb-3"),
 | 
			
		||||
                dbc.Row(
 | 
			
		||||
                    [
 | 
			
		||||
                        dbc.Col(
 | 
			
		||||
                            [
 | 
			
		||||
                                html.Label(
 | 
			
		||||
                                    "Text Splitting Method:", className="form-label"
 | 
			
		||||
                                ),
 | 
			
		||||
                                dcc.Dropdown(
 | 
			
		||||
                                    id="tokenization-method",
 | 
			
		||||
                                    options=[
 | 
			
		||||
                                        {
 | 
			
		||||
                                            "label": "Sentences (split on . ! ?)",
 | 
			
		||||
                                            "value": "sentence",
 | 
			
		||||
                                        },
 | 
			
		||||
                                        {
 | 
			
		||||
                                            "label": "Paragraphs (split on double newline)",
 | 
			
		||||
                                            "value": "paragraph",
 | 
			
		||||
                                        },
 | 
			
		||||
                                        {
 | 
			
		||||
                                            "label": "Lines (split on single newline)",
 | 
			
		||||
                                            "value": "manual",
 | 
			
		||||
                                        },
 | 
			
		||||
                                        {
 | 
			
		||||
                                            "label": "Entire text as one document",
 | 
			
		||||
                                            "value": "whole",
 | 
			
		||||
                                        },
 | 
			
		||||
                                    ],
 | 
			
		||||
                                    value=self.settings.DEFAULT_TOKENIZATION_METHOD,
 | 
			
		||||
                                    className="mb-3",
 | 
			
		||||
                                ),
 | 
			
		||||
                            ],
 | 
			
		||||
                            md=6,
 | 
			
		||||
                        ),
 | 
			
		||||
                        dbc.Col(
 | 
			
		||||
                            [
 | 
			
		||||
                                html.Label("Batch Size:", className="form-label"),
 | 
			
		||||
                                dcc.Dropdown(
 | 
			
		||||
                                    id="batch-size",
 | 
			
		||||
                                    options=[
 | 
			
		||||
                                        {
 | 
			
		||||
                                            "label": "Small batches (4) - Lower memory",
 | 
			
		||||
                                            "value": 4,
 | 
			
		||||
                                        },
 | 
			
		||||
                                        {
 | 
			
		||||
                                            "label": "Medium batches (8) - Balanced",
 | 
			
		||||
                                            "value": 8,
 | 
			
		||||
                                        },
 | 
			
		||||
                                        {
 | 
			
		||||
                                            "label": "Large batches (16) - Faster",
 | 
			
		||||
                                            "value": 16,
 | 
			
		||||
                                        },
 | 
			
		||||
                                    ],
 | 
			
		||||
                                    value=self.settings.MAX_BATCH_SIZE,
 | 
			
		||||
                                    className="mb-3",
 | 
			
		||||
                                ),
 | 
			
		||||
                            ],
 | 
			
		||||
                            md=6,
 | 
			
		||||
                        ),
 | 
			
		||||
                    ]
 | 
			
		||||
                ),
 | 
			
		||||
                dbc.Row(
 | 
			
		||||
                    [
 | 
			
		||||
                        dbc.Col(
 | 
			
		||||
                            [
 | 
			
		||||
                                html.Label(
 | 
			
		||||
                                    "Category (Optional):", className="form-label"
 | 
			
		||||
                                ),
 | 
			
		||||
                                dcc.Input(
 | 
			
		||||
                                    id="text-category",
 | 
			
		||||
                                    type="text",
 | 
			
		||||
                                    placeholder="e.g., Notes, Articles, Ideas...",
 | 
			
		||||
                                    value="Text Input",
 | 
			
		||||
                                    className="form-control mb-3",
 | 
			
		||||
                                ),
 | 
			
		||||
                            ],
 | 
			
		||||
                            md=6,
 | 
			
		||||
                        ),
 | 
			
		||||
                        dbc.Col(
 | 
			
		||||
                            [
 | 
			
		||||
                                html.Label(
 | 
			
		||||
                                    "Subcategory (Optional):", className="form-label"
 | 
			
		||||
                                ),
 | 
			
		||||
                                dcc.Input(
 | 
			
		||||
                                    id="text-subcategory",
 | 
			
		||||
                                    type="text",
 | 
			
		||||
                                    placeholder="e.g., Meeting Notes, Research...",
 | 
			
		||||
                                    value="Generated",
 | 
			
		||||
                                    className="form-control mb-3",
 | 
			
		||||
                                ),
 | 
			
		||||
                            ],
 | 
			
		||||
                            md=6,
 | 
			
		||||
                        ),
 | 
			
		||||
                    ]
 | 
			
		||||
                ),
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
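The four splitting options correspond to simple rules: sentences split on `. ! ?`, paragraphs on blank lines, the "manual" value splits on single newlines, and "whole" keeps the text as one document. The real splitting happens in the processing layer elsewhere in this PR; the sketch below only illustrates those rules and is not code from the repository:

import re

# Illustrative sketch of the splitting rules named in the dropdown; not the PR's code.
def split_text(text, method):
    text = text.strip()
    if method == "sentence":
        chunks = re.split(r"(?<=[.!?])\s+", text)      # split after ., ! or ?
    elif method == "paragraph":
        chunks = re.split(r"\n\s*\n", text)            # split on blank lines
    elif method == "manual":
        chunks = text.splitlines()                     # one chunk per line
    else:  # "whole"
        chunks = [text]
    return [chunk.strip() for chunk in chunks if chunk.strip()]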
			
		||||
 | 
			
		||||
    def _create_generation_controls(self):
 | 
			
		||||
        """Create embedding generation button and controls."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                html.H5("Generate Embeddings", className="mb-3"),
 | 
			
		||||
                dbc.Row(
 | 
			
		||||
                    [
 | 
			
		||||
                        dbc.Col(
 | 
			
		||||
                            [
 | 
			
		||||
                                dbc.Button(
 | 
			
		||||
                                    [
 | 
			
		||||
                                        html.I(className="fas fa-magic me-2"),
 | 
			
		||||
                                        "Generate Embeddings",
 | 
			
		||||
                                    ],
 | 
			
		||||
                                    id="generate-embeddings-btn",
 | 
			
		||||
                                    color="primary",
 | 
			
		||||
                                    size="lg",
 | 
			
		||||
                                    disabled=True,
 | 
			
		||||
                                    className="w-100",
 | 
			
		||||
                                )
 | 
			
		||||
                            ],
 | 
			
		||||
                            md=12,
 | 
			
		||||
                        ),
 | 
			
		||||
                    ]
 | 
			
		||||
                ),
 | 
			
		||||
                html.Div(
 | 
			
		||||
                    [
 | 
			
		||||
                        dbc.Alert(
 | 
			
		||||
                            [
 | 
			
		||||
                                html.I(className="fas fa-info-circle me-2"),
 | 
			
		||||
                                "Enter some text above and select a model to enable embedding generation.",
 | 
			
		||||
                            ],
 | 
			
		||||
                            color="light",
 | 
			
		||||
                            className="mt-3",
 | 
			
		||||
                            id="generation-help",
 | 
			
		||||
                        )
 | 
			
		||||
                    ]
 | 
			
		||||
                ),
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _create_progress_indicators(self):
 | 
			
		||||
        """Create progress bars for model loading and embedding generation."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                # Model loading progress
 | 
			
		||||
                html.Div(
 | 
			
		||||
                    [
 | 
			
		||||
                        html.H6("Model Loading Progress", className="mb-2"),
 | 
			
		||||
                        dbc.Progress(
 | 
			
		||||
                            id="model-loading-progress",
 | 
			
		||||
                            value=0,
 | 
			
		||||
                            striped=True,
 | 
			
		||||
                            animated=True,
 | 
			
		||||
                            className="mb-2",
 | 
			
		||||
                        ),
 | 
			
		||||
                        html.Small(
 | 
			
		||||
                            id="model-loading-status",
 | 
			
		||||
                            children="No model loading in progress",
 | 
			
		||||
                            className="text-muted",
 | 
			
		||||
                        ),
 | 
			
		||||
                    ],
 | 
			
		||||
                    id="model-loading-section",
 | 
			
		||||
                    style={"display": "none"},
 | 
			
		||||
                ),
 | 
			
		||||
                html.Br(),
 | 
			
		||||
                # Embedding generation progress
 | 
			
		||||
                html.Div(
 | 
			
		||||
                    [
 | 
			
		||||
                        html.H6("Embedding Generation Progress", className="mb-2"),
 | 
			
		||||
                        dbc.Progress(
 | 
			
		||||
                            id="embedding-progress",
 | 
			
		||||
                            value=0,
 | 
			
		||||
                            striped=True,
 | 
			
		||||
                            animated=True,
 | 
			
		||||
                            className="mb-2",
 | 
			
		||||
                        ),
 | 
			
		||||
                        html.Small(
 | 
			
		||||
                            id="embedding-status",
 | 
			
		||||
                            children="No embedding generation in progress",
 | 
			
		||||
                            className="text-muted",
 | 
			
		||||
                        ),
 | 
			
		||||
                    ],
 | 
			
		||||
                    id="embedding-progress-section",
 | 
			
		||||
                    style={"display": "none"},
 | 
			
		||||
                ),
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _create_status_section(self):
 | 
			
		||||
        """Create status alerts and results preview."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                # Immediate status (from client-side)
 | 
			
		||||
                dbc.Alert(
 | 
			
		||||
                    id="text-input-status-immediate",
 | 
			
		||||
                    children="Ready to generate embeddings",
 | 
			
		||||
                    color="light",
 | 
			
		||||
                    className="mb-3",
 | 
			
		||||
                ),
 | 
			
		||||
                # Server-side status
 | 
			
		||||
                dbc.Alert(
 | 
			
		||||
                    id="text-input-status",
 | 
			
		||||
                    children="",
 | 
			
		||||
                    color="light",
 | 
			
		||||
                    className="mb-3",
 | 
			
		||||
                    style={"display": "none"},
 | 
			
		||||
                ),
 | 
			
		||||
                # Results preview
 | 
			
		||||
                html.Div(id="embedding-results-preview"),
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _create_hidden_components(self):
 | 
			
		||||
        """Create hidden components for data flow."""
 | 
			
		||||
        return html.Div(
 | 
			
		||||
            [
 | 
			
		||||
                # Store for embeddings data from client-side
 | 
			
		||||
                dcc.Store(id="embeddings-generated-trigger"),
 | 
			
		||||
                # Store for tokenization preview
 | 
			
		||||
                dcc.Store(id="tokenization-preview-data"),
 | 
			
		||||
            ]
 | 
			
		||||
        )
 | 
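The `embeddings-generated-trigger` store is the hand-off point between the browser (where assets/embeddings.js runs the model) and the server-side `process_embeddings_result` callback. The registration below is a hypothetical sketch only: the namespace, function name, and `app` instance are assumptions rather than the wiring this PR actually uses, but it shows how a Dash clientside callback could populate the store from the inputs defined in this component:

import dash
from dash import ClientsideFunction, Input, Output, State

app = dash.Dash(__name__)  # stand-in application instance for this sketch

# Assumes a browser function window.dash_clientside.embeddings.generate exists
# (name invented here) that returns the payload shape shown earlier.
app.clientside_callback(
    ClientsideFunction(namespace="embeddings", function_name="generate"),
    Output("embeddings-generated-trigger", "data"),
    Input("generate-embeddings-btn", "n_clicks"),
    State("text-input-area", "value"),
    State("model-selection", "value"),
    State("tokenization-method", "value"),
    State("batch-size", "value"),
    State("text-category", "value"),
    State("text-subcategory", "value"),
    prevent_initial_call=True,
)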
			
		||||
 | 
			
		||||
    def _get_model_description(self, model_name):
 | 
			
		||||
        """Get description for a specific model."""
 | 
			
		||||
        for model in self.settings.AVAILABLE_MODELS:
 | 
			
		||||
            if model["name"] == model_name:
 | 
			
		||||
                return html.Div(
 | 
			
		||||
                    [
 | 
			
		||||
                        html.Strong(
 | 
			
		||||
                            f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}"
 | 
			
		||||
                        ),
 | 
			
		||||
                        html.Br(),
 | 
			
		||||
                        html.Span(model["description"]),
 | 
			
		||||
                        html.Br(),
 | 
			
		||||
                        html.Small(
 | 
			
		||||
                            f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}",
 | 
			
		||||
                            className="text-muted",
 | 
			
		||||
                        ),
 | 
			
		||||
                    ]
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
        return html.Span("Model information not available", className="text-muted")
 | 
			
		||||
@@ -20,6 +20,15 @@ class AppLayout:
 | 
			
		||||
                dbc.Col(
 | 
			
		||||
                    [
 | 
			
		||||
                        html.H1("EmbeddingBuddy", className="text-center mb-4"),
 | 
			
		||||
                        # Load Transformers.js from CDN
 | 
			
		||||
                        html.Script(
 | 
			
		||||
                            """
 | 
			
		||||
                            import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';
 | 
			
		||||
                            window.transformersPipeline = pipeline;
 | 
			
		||||
                            console.log('✅ Transformers.js pipeline loaded globally');
 | 
			
		||||
                            """,
 | 
			
		||||
                            type="module",
 | 
			
		||||
                        ),
 | 
			
		||||
                    ],
 | 
			
		||||
                    width=12,
 | 
			
		||||
                )
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
tests/test_client_embeddings.py  158 lines added (Normal file)
@@ -0,0 +1,158 @@
 | 
			
		||||
"""Tests for client-side embedding processing functionality."""
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
 | 
			
		||||
from src.embeddingbuddy.data.processor import DataProcessor
 | 
			
		||||
from src.embeddingbuddy.models.schemas import ProcessedData
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestClientEmbeddingsProcessing:
 | 
			
		||||
    """Test client-side embeddings processing functionality."""
 | 
			
		||||
 | 
			
		||||
    def setup_method(self):
 | 
			
		||||
        """Set up test instances."""
 | 
			
		||||
        self.processor = DataProcessor()
 | 
			
		||||
 | 
			
		||||
    def test_process_client_embeddings_success(self):
 | 
			
		||||
        """Test successful processing of client-side embeddings data."""
 | 
			
		||||
        client_data = {
 | 
			
		||||
            "documents": [
 | 
			
		||||
                {
 | 
			
		||||
                    "id": "text_input_0",
 | 
			
		||||
                    "text": "First test document",
 | 
			
		||||
                    "category": "Text Input",
 | 
			
		||||
                    "subcategory": "Generated",
 | 
			
		||||
                    "tags": [],
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "id": "text_input_1",
 | 
			
		||||
                    "text": "Second test document",
 | 
			
		||||
                    "category": "Text Input",
 | 
			
		||||
                    "subcategory": "Generated",
 | 
			
		||||
                    "tags": [],
 | 
			
		||||
                },
 | 
			
		||||
            ],
 | 
			
		||||
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_client_embeddings(client_data)
 | 
			
		||||
 | 
			
		||||
        assert isinstance(result, ProcessedData)
 | 
			
		||||
        assert result.error is None
 | 
			
		||||
        assert len(result.documents) == 2
 | 
			
		||||
        assert result.embeddings.shape == (2, 4)
 | 
			
		||||
 | 
			
		||||
        # Check document content
 | 
			
		||||
        assert result.documents[0].text == "First test document"
 | 
			
		||||
        assert result.documents[1].text == "Second test document"
 | 
			
		||||
 | 
			
		||||
        # Check embeddings match
 | 
			
		||||
        np.testing.assert_array_equal(result.embeddings[0], [0.1, 0.2, 0.3, 0.4])
 | 
			
		||||
        np.testing.assert_array_equal(result.embeddings[1], [0.5, 0.6, 0.7, 0.8])
 | 
			
		||||
 | 
			
		||||
    def test_process_client_embeddings_with_error(self):
 | 
			
		||||
        """Test processing client data with error."""
 | 
			
		||||
        client_data = {"error": "Transformers.js not loaded"}
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_client_embeddings(client_data)
 | 
			
		||||
 | 
			
		||||
        assert isinstance(result, ProcessedData)
 | 
			
		||||
        assert result.error == "Transformers.js not loaded"
 | 
			
		||||
        assert len(result.documents) == 0
 | 
			
		||||
        assert result.embeddings.size == 0
 | 
			
		||||
 | 
			
		||||
    def test_process_client_embeddings_missing_data(self):
 | 
			
		||||
        """Test processing with missing documents or embeddings."""
 | 
			
		||||
        client_data = {"documents": []}
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_client_embeddings(client_data)
 | 
			
		||||
 | 
			
		||||
        assert isinstance(result, ProcessedData)
 | 
			
		||||
        assert "No documents or embeddings in client data" in result.error
 | 
			
		||||
        assert len(result.documents) == 0
 | 
			
		||||
 | 
			
		||||
    def test_process_client_embeddings_mismatch_count(self):
 | 
			
		||||
        """Test processing with mismatched document and embedding counts."""
 | 
			
		||||
        client_data = {
 | 
			
		||||
            "documents": [
 | 
			
		||||
                {
 | 
			
		||||
                    "id": "test",
 | 
			
		||||
                    "text": "Test document",
 | 
			
		||||
                    "category": "Test",
 | 
			
		||||
                    "subcategory": "Test",
 | 
			
		||||
                    "tags": [],
 | 
			
		||||
                }
 | 
			
		||||
            ],
 | 
			
		||||
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_client_embeddings(client_data)
 | 
			
		||||
 | 
			
		||||
        assert isinstance(result, ProcessedData)
 | 
			
		||||
        assert "Mismatch between number of documents and embeddings" in result.error
 | 
			
		||||
        assert len(result.documents) == 0
 | 
			
		||||
 | 
			
		||||
    def test_process_client_embeddings_invalid_document(self):
 | 
			
		||||
        """Test processing with invalid document data."""
 | 
			
		||||
        client_data = {
 | 
			
		||||
            "documents": [
 | 
			
		||||
                {"text": ""},  # Empty text should be skipped
 | 
			
		||||
                {
 | 
			
		||||
                    "id": "test2",
 | 
			
		||||
                    "text": "Valid document",
 | 
			
		||||
                    "category": "Test",
 | 
			
		||||
                    "subcategory": "Test",
 | 
			
		||||
                    "tags": [],
 | 
			
		||||
                },
 | 
			
		||||
            ],
 | 
			
		||||
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_client_embeddings(client_data)
 | 
			
		||||
 | 
			
		||||
        assert isinstance(result, ProcessedData)
 | 
			
		||||
        assert result.error is None
 | 
			
		||||
        assert len(result.documents) == 1  # Only valid document should be processed
 | 
			
		||||
        assert result.documents[0].text == "Valid document"
 | 
			
		||||
 | 
			
		||||
    def test_process_client_embeddings_auto_id_generation(self):
 | 
			
		||||
        """Test automatic ID generation for documents without IDs."""
 | 
			
		||||
        client_data = {
 | 
			
		||||
            "documents": [
 | 
			
		||||
                {
 | 
			
		||||
                    "text": "Document without ID",
 | 
			
		||||
                    "category": "Test",
 | 
			
		||||
                    "subcategory": "Test",
 | 
			
		||||
                    "tags": [],
 | 
			
		||||
                }
 | 
			
		||||
            ],
 | 
			
		||||
            "embeddings": [[0.1, 0.2, 0.3, 0.4]],
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_client_embeddings(client_data)
 | 
			
		||||
 | 
			
		||||
        assert isinstance(result, ProcessedData)
 | 
			
		||||
        assert result.error is None
 | 
			
		||||
        assert len(result.documents) == 1
 | 
			
		||||
        assert result.documents[0].id.startswith("text_input_")
 | 
			
		||||
 | 
			
		||||
    def test_process_client_embeddings_invalid_embedding_format(self):
 | 
			
		||||
        """Test processing with invalid embedding format."""
 | 
			
		||||
        client_data = {
 | 
			
		||||
            "documents": [
 | 
			
		||||
                {
 | 
			
		||||
                    "id": "test",
 | 
			
		||||
                    "text": "Test document",
 | 
			
		||||
                    "category": "Test",
 | 
			
		||||
                    "subcategory": "Test",
 | 
			
		||||
                    "tags": [],
 | 
			
		||||
                }
 | 
			
		||||
            ],
 | 
			
		||||
            "embeddings": 0.5,  # Scalar instead of array
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        result = self.processor.process_client_embeddings(client_data)
 | 
			
		||||
 | 
			
		||||
        assert isinstance(result, ProcessedData)
 | 
			
		||||
        assert result.error is not None  # Should have some error
 | 
			
		||||
        assert len(result.documents) == 0
 | 
			
		||||