diff --git a/.gitignore b/.gitignore index 3001038..db782f8 100644 --- a/.gitignore +++ b/.gitignore @@ -81,4 +81,7 @@ safety-report.json pip-audit-report.json # Temporary files -*.tmp \ No newline at end of file +*.tmp + + +examples/extra \ No newline at end of file diff --git a/assets/embeddings.js b/assets/embeddings.js new file mode 100644 index 0000000..3d8f736 --- /dev/null +++ b/assets/embeddings.js @@ -0,0 +1,278 @@ +// Text input embedding generation using Transformers.js +// This module runs entirely in the browser for privacy and performance + +// Global flag to track initialization +window.transformersLoading = false; +window.transformersLoaded = false; + +class TransformersEmbedder { + constructor() { + this.extractor = null; + this.currentModel = null; + this.modelCache = new Map(); + this.isLoading = false; + } + + async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') { + try { + if (this.modelCache.has(modelName)) { + this.extractor = this.modelCache.get(modelName); + this.currentModel = modelName; + return { success: true, model: modelName }; + } + + if (this.isLoading) { + return { success: false, error: 'Model loading already in progress' }; + } + + this.isLoading = true; + + // Use globally loaded Transformers.js pipeline + if (!window.transformers) { + if (!window.transformersPipeline) { + // Wait for the pipeline to load + let attempts = 0; + while (!window.transformersPipeline && attempts < 50) { // Wait up to 5 seconds + await new Promise(resolve => setTimeout(resolve, 100)); + attempts++; + } + if (!window.transformersPipeline) { + throw new Error('Transformers.js pipeline not available. Please refresh the page.'); + } + } + window.transformers = { pipeline: window.transformersPipeline }; + window.transformersLoaded = true; + console.log('✅ Using globally loaded Transformers.js pipeline'); + } + + // Show loading progress to user + if (window.updateModelLoadingProgress) { + window.updateModelLoadingProgress(0, `Loading ${modelName}...`); + } + + this.extractor = await window.transformers.pipeline('feature-extraction', modelName, { + progress_callback: (data) => { + if (window.updateModelLoadingProgress && data.progress !== undefined) { + const progress = Math.round(data.progress); + window.updateModelLoadingProgress(progress, data.status || 'Loading...'); + } + } + }); + + this.modelCache.set(modelName, this.extractor); + this.currentModel = modelName; + this.isLoading = false; + + if (window.updateModelLoadingProgress) { + window.updateModelLoadingProgress(100, 'Model loaded successfully'); + } + + return { success: true, model: modelName }; + } catch (error) { + this.isLoading = false; + console.error('Model initialization error:', error); + return { success: false, error: error.message }; + } + } + + async generateEmbeddings(texts, options = {}) { + if (!this.extractor) { + throw new Error('Model not initialized. Call initializeModel() first.'); + } + + if (!texts || texts.length === 0) { + throw new Error('No texts provided for embedding generation.'); + } + + const embeddings = []; + const defaultOptions = { + pooling: 'mean', + normalize: true, + ...options + }; + + // Process in batches to avoid memory issues + const batchSize = options.batchSize || 8; + + try { + for (let i = 0; i < texts.length; i += batchSize) { + const batch = texts.slice(i, i + batchSize); + + const batchResults = await Promise.all( + batch.map(text => { + if (!text || text.trim().length === 0) { + throw new Error('Empty text found in batch'); + } + return this.extractor(text.trim(), defaultOptions); + }) + ); + + // Convert tensor output to arrays + batchResults.forEach((result, idx) => { + if (result && result.data) { + embeddings.push(Array.from(result.data)); + } else { + throw new Error(`Invalid embedding result for text: ${batch[idx]}`); + } + }); + + // Update progress + const progress = Math.min(100, ((i + batch.length) / texts.length) * 100); + if (window.updateEmbeddingProgress) { + window.updateEmbeddingProgress(progress, `Processing ${i + batch.length}/${texts.length} texts`); + } + } + + if (window.updateEmbeddingProgress) { + window.updateEmbeddingProgress(100, `Generated ${embeddings.length} embeddings successfully`); + } + + return embeddings; + } catch (error) { + console.error('Embedding generation error:', error); + throw error; + } + } +} + +// Global instance +window.transformersEmbedder = new TransformersEmbedder(); +console.log('📦 TransformersEmbedder instance created'); + +// Global progress update functions +window.updateModelLoadingProgress = function(progress, status) { + const progressBar = document.getElementById('model-loading-progress'); + const statusText = document.getElementById('model-loading-status'); + if (progressBar) { + progressBar.style.width = progress + '%'; + progressBar.setAttribute('aria-valuenow', progress); + } + if (statusText) { + statusText.textContent = status; + } +}; + +window.updateEmbeddingProgress = function(progress, status) { + const progressBar = document.getElementById('embedding-progress'); + const statusText = document.getElementById('embedding-status'); + if (progressBar) { + progressBar.style.width = progress + '%'; + progressBar.setAttribute('aria-valuenow', progress); + } + if (statusText) { + statusText.textContent = status; + } +}; + +// Dash clientside callback functions +window.dash_clientside = window.dash_clientside || {}; +console.log('🔧 Setting up window.dash_clientside.transformers'); +window.dash_clientside.transformers = { + generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) { + console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length }); + + if (!nClicks || !textContent || textContent.trim().length === 0) { + console.log('⚠️ Early return - missing required parameters'); + return window.dash_clientside.no_update; + } + + try { + // Initialize model if needed + const initResult = await window.transformersEmbedder.initializeModel(modelName); + if (!initResult.success) { + return [ + { error: initResult.error }, + `❌ Model loading error: ${initResult.error}`, + "danger", + false + ]; + } + + // Tokenize text based on method + let textChunks; + const trimmedText = textContent.trim(); + + switch (tokenizationMethod) { + case 'sentence': + // Simple sentence splitting - can be enhanced with proper NLP + textChunks = trimmedText + .split(/[.!?]+/) + .map(s => s.trim()) + .filter(s => s.length > 0); + break; + case 'paragraph': + textChunks = trimmedText + .split(/\n\s*\n/) + .map(s => s.trim()) + .filter(s => s.length > 0); + break; + case 'manual': + textChunks = trimmedText + .split('\n') + .map(s => s.trim()) + .filter(s => s.length > 0); + break; + default: + textChunks = [trimmedText]; + } + + if (textChunks.length === 0) { + return [ + { error: 'No valid text chunks found after tokenization' }, + '❌ Error: No valid text chunks found after tokenization', + "danger", + false + ]; + } + + // Generate embeddings + const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks); + + if (!embeddings || embeddings.length !== textChunks.length) { + return [ + { error: 'Embedding generation failed - mismatch in text chunks and embeddings' }, + '❌ Error: Embedding generation failed', + "danger", + false + ]; + } + + // Create documents structure + const documents = textChunks.map((text, i) => ({ + id: `text_input_${Date.now()}_${i}`, + text: text, + embedding: embeddings[i], + category: category || "Text Input", + subcategory: subcategory || "Generated", + tags: [] + })); + + return [ + { + documents: documents, + embeddings: embeddings + }, + `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`, + "success", + false + ]; + + } catch (error) { + console.error('Client-side embedding error:', error); + return [ + { error: error.message }, + `❌ Error: ${error.message}`, + "danger", + false + ]; + } + } +}; + +console.log('✅ Transformers.js client-side setup complete'); +console.log('Available:', { + transformersEmbedder: !!window.transformersEmbedder, + dashClientside: !!window.dash_clientside, + transformersModule: !!window.dash_clientside?.transformers, + generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings +}); \ No newline at end of file diff --git a/assets/package.json b/assets/package.json new file mode 100644 index 0000000..d1423b4 --- /dev/null +++ b/assets/package.json @@ -0,0 +1,9 @@ +{ + "name": "embeddingbuddy-assets", + "version": "1.0.0", + "description": "JavaScript dependencies for EmbeddingBuddy text input functionality", + "dependencies": { + "@huggingface/transformers": "^3.0.0" + }, + "type": "module" +} \ No newline at end of file diff --git a/assets/sample-txt.md b/assets/sample-txt.md new file mode 100644 index 0000000..8b717b8 --- /dev/null +++ b/assets/sample-txt.md @@ -0,0 +1,106 @@ +The sun peeked through the clouds after a drizzly morning. +A gentle breeze rustled the leaves as we walked along the shoreline. +Heavy rains caused flooding in several low-lying neighborhoods. +It was so hot that even the birds sought shade under the palm trees. +By midnight, the temperature had dropped below freezing. +Thunderstorms lit up the sky with flashes of lightning. +A thick fog settled over the city streets at dawn. +The air smelled of ozone after the sudden hailstorm. +I watched the snowflakes drift silently onto the ground. +A double rainbow appeared after the rain shower. +The humidity soared to uncomfortable levels by midday. +Dust devils formed in the dry desert plains. +The barometer readings indicated an approaching front. +A sudden gust of wind knocked over the garden chairs. +Light drizzle turned into a torrential downpour within minutes. +The new smartphone features a foldable display and 5G connectivity. +In the world of AI, transformers have revolutionized natural language processing. +Quantum computing promises to solve problems beyond classical computers' reach. +Blockchain technology is being explored for secure voting systems. +Virtual reality headsets are becoming more affordable and accessible. +The rise of electric vehicles is reshaping the automotive industry. +Cloud computing allows businesses to scale resources dynamically. +Machine learning algorithms can now predict stock market trends with surprising accuracy. +Augmented reality applications are transforming retail experiences. +The Internet of Things connects everyday devices to the web for smarter living. +Cybersecurity threats are evolving, requiring constant vigilance. +3D printing is enabling rapid prototyping and custom manufacturing. +Edge computing reduces latency by processing data closer to the source. +Biometric authentication methods are enhancing security in devices. +Wearable technology is tracking health metrics in real-time. +Artificial intelligence is being used to create realistic deepfakes. +Preheat the oven to 375°F before you start mixing the batter. +She finely chopped the garlic and sautéed it in two tablespoons of olive oil. +A pinch of saffron adds a beautiful color and aroma to traditional paella. +If the soup is too salty, add a peeled potato to absorb excess sodium. +Let the bread dough rise for at least an hour in a warm, draft-free spot. +Marinate the chicken overnight in a blend of citrus and spices. +Use a cast-iron skillet to sear the steak on high heat. +Whisk the egg whites until they form stiff peaks. +Fold in the chocolate chips gently to keep the batter airy. +Brush the pastry with an egg wash for a golden finish. +Slow-roast the pork shoulder until it falls off the bone. +Garnish the salad with toasted nuts and fresh herbs. +Deglaze the pan with white wine for a rich sauce. +Simmer the curry paste until the aroma intensifies. +Let the risotto rest before serving to thicken slightly. +He dribbled past two defenders and sank a three-pointer at the buzzer. +The marathon runner kept a steady pace despite the sweltering heat. +Their home team clinched the championship with a last-minute goal. +NASCAR fans cheered as the cars roared around the oval track. +She landed a perfect triple axel at the figure skating championship. +The cyclist pedaled up the steep hill in record time. +He pitched a no-hitter during the high school baseball game. +The quarterback threw a touchdown pass under heavy pressure. +They scored a hat-trick in the hockey final. +The boxer delivered a swift uppercut in the final round. +Surfers caught massive waves at dawn on the Pacific coast. +Fans erupted when the underdog scored the winning goal. +The swimmer broke the national record in the 200m freestyle. +The gymnast executed a flawless routine on the balance beam. +The rugby team celebrated their victory with a traditional haka. +The stock market rallied after positive earnings reports. +Investors are closely watching interest rate changes by the Federal Reserve. +Cryptocurrency prices have been extremely volatile this year. +Diversification is key to managing investment risk effectively. +Inflation rates have reached a 40-year high, impacting consumer spending. +Many companies are adopting ESG criteria to attract socially conscious investors. +The bond market is reacting to geopolitical tensions and supply chain disruptions. +Venture capital funding for startups has surged in the tech sector. +Exchange-traded funds (ETFs) offer a way to invest in diversified portfolios. +The global economy is recovering from the pandemic, but challenges remain. +Central banks are exploring digital currencies to modernize payment systems. +Retail investors are increasingly participating in the stock market through apps. +Hedge funds are using complex algorithms to gain an edge in trading. +Real estate prices have skyrocketed in urban areas due to low inventory. +The startup raised $10 million in its Series A funding round. +The symphony orchestra played a hauntingly beautiful melody. +She strummed her guitar softly, filling the room with a warm sound. +The DJ mixed tracks seamlessly, keeping the crowd dancing all night. +His voice soared during the high notes of the ballad. +The band played an acoustic set in the intimate coffee shop. +Jazz musicians often improvise solos based on the chord changes. +The opera singer hit the high C with perfect pitch. +The choir harmonized beautifully, filling the church with sound. +He composed a symphony that was performed at the concert hall. +The singer-songwriter wrote heartfelt lyrics about love and loss. +The rock band headlined the festival, drawing a massive crowd. +Hip-hop artists use rhythm and rhyme to tell powerful stories. +The violinist played a virtuosic solo that left the audience in awe. +Folk music often reflects the culture and traditions of a community. +The gospel choir lifted spirits with their uplifting performance. +The fall of the Berlin Wall in 1989 marked the end of the Cold War. +Ancient Egypt's pyramids are a testament to their architectural prowess. +Europe's Renaissance period sparked a revival in art and science. +The signing of the Declaration of Independence in 1776 established the United States. +The Industrial Revolution transformed economies and societies worldwide. +Rome was the center of a vast empire that influenced law and governance. +The discovery of the New World by Christopher Columbus in 1492 changed global trade. +The French Revolution in 1789 led to significant political and social change. +World War II was a global conflict that reshaped international relations. +The fall of the Roman Empire in 476 AD marked the beginning of the Middle Ages. +The invention of the printing press revolutionized the spread of knowledge. +The Cold War was characterized by political tension between the U.S. and the Soviet Union. +The ancient Silk Road connected East and West through trade routes. +The signing of the Magna Carta in 1215 established principles of due process. +Exploration during the Age of Discovery expanded European empires across the globe. \ No newline at end of file diff --git a/assets/transformers-loader.js b/assets/transformers-loader.js new file mode 100644 index 0000000..1aefc94 --- /dev/null +++ b/assets/transformers-loader.js @@ -0,0 +1,172 @@ +// Simple script to load Transformers.js from CDN and initialize embedding functionality +// This approach uses traditional script loading instead of ES6 modules + +console.log('🔧 Transformers.js loader starting...'); + +// Global state +window.transformersLibraryLoaded = false; +window.transformersLibraryLoading = false; + +// Function to dynamically load a script +function loadScript(src) { + return new Promise((resolve, reject) => { + const script = document.createElement('script'); + script.src = src; + script.type = 'module'; + script.onload = () => resolve(); + script.onerror = () => reject(new Error(`Failed to load script: ${src}`)); + document.head.appendChild(script); + }); +} + +// Function to initialize Transformers.js +async function initializeTransformers() { + if (window.transformersLibraryLoaded) { + console.log('✅ Transformers.js already loaded'); + return true; + } + + if (window.transformersLibraryLoading) { + console.log('⏳ Transformers.js already loading, waiting...'); + // Wait for loading to complete + while (window.transformersLibraryLoading) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + return window.transformersLibraryLoaded; + } + + window.transformersLibraryLoading = true; + + try { + console.log('📦 Loading Transformers.js from CDN...'); + + // Use dynamic import since this is more reliable with ES modules + const transformers = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0'); + window.transformersLibrary = transformers; + window.transformersLibraryLoaded = true; + + console.log('✅ Transformers.js loaded successfully'); + return true; + } catch (error) { + console.error('❌ Failed to load Transformers.js:', error); + return false; + } finally { + window.transformersLibraryLoading = false; + } +} + +// Simple embeddings class +class SimpleEmbedder { + constructor() { + this.pipeline = null; + this.modelCache = new Map(); + } + + async generateEmbeddings(texts, modelName = 'Xenova/all-MiniLM-L6-v2') { + console.log('🔄 Generating embeddings for', texts.length, 'texts with model', modelName); + + // Ensure Transformers.js is loaded + if (!window.transformersLibraryLoaded) { + const loaded = await initializeTransformers(); + if (!loaded) { + throw new Error('Failed to load Transformers.js'); + } + } + + // Create pipeline if not cached + if (!this.modelCache.has(modelName)) { + console.log('🏗️ Creating pipeline for', modelName); + const { pipeline } = window.transformersLibrary; + this.pipeline = await pipeline('feature-extraction', modelName); + this.modelCache.set(modelName, this.pipeline); + } else { + this.pipeline = this.modelCache.get(modelName); + } + + // Generate embeddings + const embeddings = []; + for (let i = 0; i < texts.length; i++) { + console.log(`Processing text ${i + 1}/${texts.length}...`); + const result = await this.pipeline(texts[i], { pooling: 'mean', normalize: true }); + embeddings.push(Array.from(result.data)); + } + + console.log('✅ Generated', embeddings.length, 'embeddings'); + return embeddings; + } +} + +// Create global instance +window.simpleEmbedder = new SimpleEmbedder(); + +// Set up Dash clientside callbacks +window.dash_clientside = window.dash_clientside || {}; +window.dash_clientside.transformers = { + generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) { + console.log('🚀 Client-side generateEmbeddings called'); + + if (!nClicks || !textContent || textContent.trim().length === 0) { + console.log('⚠️ Missing required parameters'); + return window.dash_clientside.no_update; + } + + try { + // Tokenize text + let textChunks; + const trimmedText = textContent.trim(); + + switch (tokenizationMethod) { + case 'sentence': + textChunks = trimmedText.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 0); + break; + case 'paragraph': + textChunks = trimmedText.split(/\n\s*\n/).map(s => s.trim()).filter(s => s.length > 0); + break; + case 'manual': + textChunks = trimmedText.split('\n').map(s => s.trim()).filter(s => s.length > 0); + break; + default: + textChunks = [trimmedText]; + } + + if (textChunks.length === 0) { + throw new Error('No valid text chunks after tokenization'); + } + + // Generate embeddings + const embeddings = await window.simpleEmbedder.generateEmbeddings(textChunks, modelName); + + // Create documents + const documents = textChunks.map((text, i) => ({ + id: `text_input_${Date.now()}_${i}`, + text: text, + embedding: embeddings[i], + category: category || "Text Input", + subcategory: subcategory || "Generated", + tags: [] + })); + + return [ + { + documents: documents, + embeddings: embeddings + }, + `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`, + "success", + false + ]; + + } catch (error) { + console.error('❌ Error generating embeddings:', error); + return [ + { error: error.message }, + `❌ Error: ${error.message}`, + "danger", + false + ]; + } + } +}; + +console.log('✅ Simple Transformers.js setup complete'); +console.log('Available functions:', Object.keys(window.dash_clientside.transformers)); \ No newline at end of file diff --git a/src/embeddingbuddy/app.py b/src/embeddingbuddy/app.py index d850d03..29231ef 100644 --- a/src/embeddingbuddy/app.py +++ b/src/embeddingbuddy/app.py @@ -8,7 +8,16 @@ from .ui.callbacks.interactions import InteractionCallbacks def create_app(): - app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP]) + import os + # Get the project root directory (two levels up from this file) + project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + assets_path = os.path.join(project_root, 'assets') + + app = dash.Dash( + __name__, + external_stylesheets=[dbc.themes.BOOTSTRAP], + assets_folder=assets_path + ) # Allow callbacks to components that are dynamically created in tabs app.config.suppress_callback_exceptions = True @@ -20,9 +29,78 @@ def create_app(): VisualizationCallbacks() InteractionCallbacks() + # Register client-side callback for embedding generation + _register_client_side_callbacks(app) + return app +def _register_client_side_callbacks(app): + """Register client-side callbacks for browser-based processing.""" + from dash import Input, Output, State + + # Client-side callback for embedding generation + app.clientside_callback( + """ + function(nClicks, textContent, modelName, tokenizationMethod, batchSize, category, subcategory) { + if (!nClicks || !textContent || !textContent.trim()) { + return window.dash_clientside.no_update; + } + + console.log('🔍 Checking for Transformers.js...'); + console.log('window.dash_clientside:', typeof window.dash_clientside); + console.log('window.dash_clientside.transformers:', typeof window.dash_clientside?.transformers); + console.log('generateEmbeddings function:', typeof window.dash_clientside?.transformers?.generateEmbeddings); + + if (typeof window.dash_clientside !== 'undefined' && + typeof window.dash_clientside.transformers !== 'undefined' && + typeof window.dash_clientside.transformers.generateEmbeddings === 'function') { + + console.log('✅ Calling Transformers.js generateEmbeddings...'); + return window.dash_clientside.transformers.generateEmbeddings( + nClicks, textContent, modelName, tokenizationMethod, category, subcategory + ); + } + + // More detailed error information + let errorMsg = '❌ Transformers.js not available. '; + if (typeof window.dash_clientside === 'undefined') { + errorMsg += 'dash_clientside not found.'; + } else if (typeof window.dash_clientside.transformers === 'undefined') { + errorMsg += 'transformers module not found.'; + } else if (typeof window.dash_clientside.transformers.generateEmbeddings !== 'function') { + errorMsg += 'generateEmbeddings function not found.'; + } + + console.error(errorMsg); + + return [ + { error: 'Transformers.js not loaded. Please refresh the page and try again.' }, + errorMsg + ' Please refresh the page.', + 'danger', + false + ]; + } + """, + [ + Output("embeddings-generated-trigger", "data"), + Output("text-input-status-immediate", "children"), + Output("text-input-status-immediate", "color"), + Output("generate-embeddings-btn", "disabled", allow_duplicate=True), + ], + [Input("generate-embeddings-btn", "n_clicks")], + [ + State("text-input-area", "value"), + State("model-selection", "value"), + State("tokenization-method", "value"), + State("batch-size", "value"), + State("text-category", "value"), + State("text-subcategory", "value"), + ], + prevent_initial_call=True, + ) + + def run_app(app=None, debug=None, host=None, port=None): if app is None: app = create_app() diff --git a/src/embeddingbuddy/config/settings.py b/src/embeddingbuddy/config/settings.py index 82cdac0..3f60ab2 100644 --- a/src/embeddingbuddy/config/settings.py +++ b/src/embeddingbuddy/config/settings.py @@ -79,6 +79,71 @@ class AppSettings: OPENSEARCH_CONNECTION_TIMEOUT = 30 OPENSEARCH_VERIFY_CERTS = True + # Text Input / Transformers.js Configuration + DEFAULT_EMBEDDING_MODEL = "Xenova/all-mpnet-base-v2" + MAX_TEXT_LENGTH = 50000 # Characters (browser memory limits) + DEFAULT_TOKENIZATION_METHOD = "sentence" + MAX_BATCH_SIZE = 8 # Process in smaller batches for memory management + + # Available Transformers.js compatible models + AVAILABLE_MODELS = [ + { + "name": "Xenova/all-mpnet-base-v2", + "label": "All-MPNet-Base-v2 (Quality, 768d)", + "description": "Higher quality embeddings with better semantic understanding", + "dimensions": 768, + "size": "109 MB", + "context_length": 512, + "multilingual": False, + "default": True, + }, + { + "name": "Xenova/all-MiniLM-L6-v2", + "label": "All-MiniLM-L6-v2 (Fast, 384d)", + "description": "Lightweight model, good for quick testing and general purpose", + "dimensions": 384, + "size": "23 MB", + "context_length": 512, + "multilingual": False, + "default": False, + }, + { + "name": "Xenova/paraphrase-multilingual-MiniLM-L12-v2", + "label": "Multilingual MiniLM (50+ languages)", + "description": "Support for multiple languages with good performance", + "dimensions": 384, + "size": "127 MB", + "context_length": 512, + "multilingual": True, + }, + { + "name": "Xenova/bge-small-en-v1.5", + "label": "BGE Small English (High quality, 384d)", + "description": "Beijing Academy of AI model with excellent performance on retrieval tasks", + "dimensions": 384, + "size": "67 MB", + "context_length": 512, + "multilingual": False, + }, + { + "name": "Xenova/gte-small", + "label": "GTE Small (General Text Embeddings, 384d)", + "description": "Alibaba's general text embedding model, balanced performance", + "dimensions": 384, + "size": "67 MB", + "context_length": 512, + "multilingual": False, + }, + ] + + # Browser compatibility requirements + SUPPORTED_BROWSERS = { + "chrome": ">=88", + "firefox": ">=92", + "safari": ">=15.4", + "edge": ">=88", + } + # Bootstrap Theme EXTERNAL_STYLESHEETS = [ "https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" diff --git a/src/embeddingbuddy/data/processor.py b/src/embeddingbuddy/data/processor.py index e7f82b2..793ed95 100644 --- a/src/embeddingbuddy/data/processor.py +++ b/src/embeddingbuddy/data/processor.py @@ -63,6 +63,90 @@ class DataProcessor: except Exception as e: return ProcessedData(documents=[], embeddings=np.array([]), error=str(e)) + def process_client_embeddings(self, embeddings_data: dict) -> ProcessedData: + """Process embeddings data received from client-side JavaScript.""" + try: + if "error" in embeddings_data: + return ProcessedData( + documents=[], + embeddings=np.array([]), + error=embeddings_data["error"], + ) + + # Extract documents and embeddings from client data + documents_data = embeddings_data.get("documents", []) + embeddings_list = embeddings_data.get("embeddings", []) + + if not documents_data or not embeddings_list: + return ProcessedData( + documents=[], + embeddings=np.array([]), + error="No documents or embeddings in client data", + ) + + if len(documents_data) != len(embeddings_list): + return ProcessedData( + documents=[], + embeddings=np.array([]), + error="Mismatch between number of documents and embeddings", + ) + + # Convert embeddings to numpy array first + try: + embeddings = np.array(embeddings_list) + + if embeddings.ndim != 2: + return ProcessedData( + documents=[], + embeddings=np.array([]), + error="Invalid embedding dimensions", + ) + + except Exception as e: + return ProcessedData( + documents=[], + embeddings=np.array([]), + error=f"Error processing embeddings: {str(e)}", + ) + + # Convert to Document objects with embeddings + documents = [] + for i, doc_data in enumerate(documents_data): + try: + # Skip if we don't have a corresponding embedding + if i >= len(embeddings): + continue + + # Ensure required fields are present + if "id" not in doc_data or not doc_data["id"]: + doc_data["id"] = f"text_input_{i}" + if "text" not in doc_data or not doc_data["text"].strip(): + continue # Skip documents without text + + # Add the embedding to doc_data + doc_data["embedding"] = embeddings[i].tolist() + + doc = Document(**doc_data) + documents.append(doc) + except Exception: + # Skip invalid documents but continue processing + continue + + if not documents: + return ProcessedData( + documents=[], + embeddings=np.array([]), + error="No valid documents found in client data", + ) + + # Only keep embeddings for valid documents + valid_embeddings = embeddings[: len(documents)] + + return ProcessedData(documents=documents, embeddings=valid_embeddings) + + except Exception as e: + return ProcessedData(documents=[], embeddings=np.array([]), error=str(e)) + def _extract_embeddings(self, documents: List[Document]) -> np.ndarray: if not documents: return np.array([]) diff --git a/src/embeddingbuddy/ui/callbacks/data_processing.py b/src/embeddingbuddy/ui/callbacks/data_processing.py index f739bc7..de8aee2 100644 --- a/src/embeddingbuddy/ui/callbacks/data_processing.py +++ b/src/embeddingbuddy/ui/callbacks/data_processing.py @@ -1,4 +1,4 @@ -from dash import callback, Input, Output, State, no_update +from dash import callback, Input, Output, State, no_update, html from ...data.processor import DataProcessor from ...data.sources.opensearch import OpenSearchClient from ...models.field_mapper import FieldMapper @@ -87,6 +87,8 @@ class DataProcessingCallbacks: if active_tab == "opensearch-tab": return [datasource.create_opensearch_tab()] + elif active_tab == "text-input-tab": + return [datasource.create_text_input_tab()] else: return [datasource.create_file_upload_tab()] @@ -97,6 +99,9 @@ class DataProcessingCallbacks: # Register collapsible section callbacks self._register_collapse_callbacks() + # Register text input callbacks + self._register_text_input_callbacks() + def _register_opensearch_callbacks(self, section_type, opensearch_client): """Register callbacks for a specific section (data or prompts).""" @@ -463,6 +468,220 @@ class DataProcessingCallbacks: return new_state, icon_class return is_open, "fas fa-chevron-down me-2" + def _register_text_input_callbacks(self): + """Register callbacks for text input functionality.""" + + # Text length counter callback + @callback( + Output("text-length-counter", "children"), + Input("text-input-area", "value"), + prevent_initial_call=False, + ) + def update_text_length_counter(text_value): + if not text_value: + return "0" + return f"{len(text_value):,}" + + # Generate button enable/disable callback + @callback( + [ + Output("generate-embeddings-btn", "disabled"), + Output("generation-help", "children"), + Output("generation-help", "color"), + ], + [ + Input("text-input-area", "value"), + Input("model-selection", "value"), + ], + prevent_initial_call=False, + ) + def toggle_generate_button(text_value, model_name): + import dash_bootstrap_components as dbc + + if not text_value or not text_value.strip(): + return ( + True, + dbc.Alert( + [ + html.I(className="fas fa-info-circle me-2"), + "Enter some text above to enable embedding generation.", + ], + color="light", + ), + "light", + ) + + if not model_name: + return ( + True, + dbc.Alert( + [ + html.I(className="fas fa-exclamation-triangle me-2"), + "Select an embedding model to continue.", + ], + color="warning", + ), + "warning", + ) + + text_length = len(text_value.strip()) + if text_length > AppSettings.MAX_TEXT_LENGTH: + return ( + True, + dbc.Alert( + [ + html.I(className="fas fa-exclamation-triangle me-2"), + f"Text too long ({text_length:,} characters). Maximum allowed: {AppSettings.MAX_TEXT_LENGTH:,} characters.", + ], + color="danger", + ), + "danger", + ) + + return ( + False, + dbc.Alert( + [ + html.I(className="fas fa-check-circle me-2"), + f"Ready to generate embeddings for {text_length:,} characters using {model_name}.", + ], + color="success", + ), + "success", + ) + + # Clear text callback + @callback( + Output("text-input-area", "value"), + [Input("clear-text-btn", "n_clicks"), Input("load-sample-btn", "n_clicks")], + prevent_initial_call=True, + ) + def handle_text_input_actions(clear_clicks, load_clicks): + from dash import ctx + + if not ctx.triggered: + return no_update + + button_id = ctx.triggered[0]['prop_id'].split('.')[0] + + if button_id == "clear-text-btn" and clear_clicks: + return "" + elif button_id == "load-sample-btn" and load_clicks: + return self._load_sample_text() + + return no_update + + # Model info callback + @callback( + Output("model-info", "children"), + Input("model-selection", "value"), + prevent_initial_call=False, + ) + def update_model_info(model_name): + if not model_name: + return html.Span("Please select a model", className="text-muted") + + from ...config.settings import AppSettings + + settings = AppSettings() + + for model in settings.AVAILABLE_MODELS: + if model["name"] == model_name: + return html.Div( + [ + html.Strong( + f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}" + ), + html.Br(), + html.Span(model["description"]), + html.Br(), + html.Small( + f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}", + className="text-muted", + ), + ] + ) + + return html.Span("Model information not available", className="text-muted") + + # Process client-side embeddings result callback + @callback( + [ + Output("processed-data", "data", allow_duplicate=True), + Output("text-input-status", "children"), + Output("text-input-status", "color"), + Output("text-input-status", "style"), + Output("generate-embeddings-btn", "disabled", allow_duplicate=True), + ], + [Input("embeddings-generated-trigger", "data")], + prevent_initial_call=True, + ) + def process_embeddings_result(embeddings_data): + """Process embeddings generated client-side.""" + if not embeddings_data: + return no_update, no_update, no_update, no_update, no_update + + processed_data = self.processor.process_client_embeddings(embeddings_data) + + if processed_data.error: + return ( + {"error": processed_data.error}, + f"❌ Error: {processed_data.error}", + "danger", + {"display": "block"}, + False, + ) + + return ( + { + "documents": [ + self._document_to_dict(doc) for doc in processed_data.documents + ], + "embeddings": processed_data.embeddings.tolist(), + }, + f"✅ Generated embeddings for {len(processed_data.documents)} text chunks", + "success", + {"display": "block"}, + False, + ) + + def _load_sample_text(self): + """Load sample text from assets/sample-txt.md file.""" + import os + + try: + # Get the project root directory (four levels up from this file) + current_file = os.path.abspath(__file__) + project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))) + sample_file_path = os.path.join(project_root, 'assets', 'sample-txt.md') + + if os.path.exists(sample_file_path): + with open(sample_file_path, 'r', encoding='utf-8') as file: + return file.read() + else: + # Fallback sample text if file doesn't exist + return """The sun peeked through the clouds after a drizzly morning. +A gentle breeze rustled the leaves as we walked along the shoreline. +Heavy rains caused flooding in several low-lying neighborhoods. +It was so hot that even the birds sought shade under the palm trees. +By midnight, the temperature had dropped below freezing. + +The new smartphone features a foldable display and 5G connectivity. +In the world of AI, transformers have revolutionized natural language processing. +Quantum computing promises to solve problems beyond classical computers' reach. +Blockchain technology is being explored for secure voting systems. +Virtual reality headsets are becoming more affordable and accessible. + +Preheat the oven to 375°F before you start mixing the batter. +She finely chopped the garlic and sautéed it in two tablespoons of olive oil. +A pinch of saffron adds a beautiful color and aroma to traditional paella. +If the soup is too salty, add a peeled potato to absorb excess sodium. +Let the bread dough rise for at least an hour in a warm, draft-free spot.""" + + except Exception as e: + # Return a simple fallback if there's any error + return "This is sample text for testing embedding generation. You can replace this with your own text." + @staticmethod def _document_to_dict(doc): return { diff --git a/src/embeddingbuddy/ui/components/datasource.py b/src/embeddingbuddy/ui/components/datasource.py index 2d4c630..1cbc935 100644 --- a/src/embeddingbuddy/ui/components/datasource.py +++ b/src/embeddingbuddy/ui/components/datasource.py @@ -1,11 +1,13 @@ from dash import dcc, html import dash_bootstrap_components as dbc from .upload import UploadComponent +from .textinput import TextInputComponent class DataSourceComponent: def __init__(self): self.upload_component = UploadComponent() + self.text_input_component = TextInputComponent() def create_tabbed_interface(self): """Create tabbed interface for different data sources.""" @@ -17,6 +19,7 @@ class DataSourceComponent: [ dbc.Tab(label="File Upload", tab_id="file-tab"), dbc.Tab(label="OpenSearch", tab_id="opensearch-tab"), + dbc.Tab(label="Text Input", tab_id="text-input-tab"), ], id="data-source-tabs", active_tab="file-tab", @@ -208,6 +211,10 @@ class DataSourceComponent: ] ) + def create_text_input_tab(self): + """Create text input tab content for browser-based embedding generation.""" + return html.Div([self.text_input_component.create_text_input_interface()]) + def _create_opensearch_section(self, section_type): """Create a complete OpenSearch section for either 'data' or 'prompts'.""" section_id = section_type # 'data' or 'prompts' diff --git a/src/embeddingbuddy/ui/components/textinput.py b/src/embeddingbuddy/ui/components/textinput.py new file mode 100644 index 0000000..8397954 --- /dev/null +++ b/src/embeddingbuddy/ui/components/textinput.py @@ -0,0 +1,402 @@ +"""Text input component for generating embeddings from user text.""" + +import dash_bootstrap_components as dbc +from dash import dcc, html + +from embeddingbuddy.config.settings import AppSettings + + +class TextInputComponent: + """Component for text input and embedding generation.""" + + def __init__(self): + self.settings = AppSettings() + + def create_text_input_interface(self): + """Create the complete text input interface with model selection and processing options.""" + return html.Div( + [ + # Model selection section + self._create_model_selection(), + html.Hr(), + # Text input section + self._create_text_input_area(), + # Text action buttons + self._create_text_action_buttons(), + html.Hr(), + # Processing options + self._create_processing_options(), + html.Hr(), + # Generation controls + self._create_generation_controls(), + html.Hr(), + # Progress indicators + self._create_progress_indicators(), + html.Hr(), + # Status and results + self._create_status_section(), + # Hidden components for data flow + self._create_hidden_components(), + ], + className="p-3", + ) + + def _create_model_selection(self): + """Create model selection dropdown with descriptions.""" + model_options = [] + for model in self.settings.AVAILABLE_MODELS: + label = f"{model['label']} - {model['size']}" + if model.get("default", False): + label += " (Recommended)" + + model_options.append({"label": label, "value": model["name"]}) + + return html.Div( + [ + html.H5("Embedding Model", className="mb-3"), + html.Div( + [ + dcc.Dropdown( + id="model-selection", + options=model_options, + value=self.settings.DEFAULT_EMBEDDING_MODEL, + placeholder="Select an embedding model...", + className="mb-2", + ), + dbc.Alert( + [ + html.Div( + id="model-info", + children=self._get_model_description( + self.settings.DEFAULT_EMBEDDING_MODEL + ), + ) + ], + color="info", + className="small", + ), + ] + ), + ] + ) + + def _create_text_input_area(self): + """Create text input textarea with character limits.""" + return html.Div( + [ + html.H5("Text Input", className="mb-3"), + dcc.Textarea( + id="text-input-area", + placeholder="Paste your text here... Each sentence, paragraph, or line will become a separate data point depending on your tokenization method below.", + value="", + style={ + "width": "100%", + "height": "300px", + "resize": "vertical", + "font-family": "monospace", + "font-size": "14px", + }, + maxLength=self.settings.MAX_TEXT_LENGTH, + className="form-control", + ), + html.Small( + f"Maximum {self.settings.MAX_TEXT_LENGTH:,} characters. Current: ", + className="text-muted", + ), + html.Small( + id="text-length-counter", + children="0", + className="text-muted fw-bold", + ), + html.Small(" characters", className="text-muted"), + ] + ) + + def _create_text_action_buttons(self): + """Create action buttons for text input (Load Sample, Clear).""" + return html.Div( + [ + dbc.Row( + [ + dbc.Col( + [ + dbc.Button( + [ + html.I(className="fas fa-file-text me-2"), + "Load Sample Text", + ], + id="load-sample-btn", + color="info", + size="sm", + className="w-100", + ) + ], + md=6, + ), + dbc.Col( + [ + dbc.Button( + [ + html.I(className="fas fa-trash me-2"), + "Clear Text", + ], + id="clear-text-btn", + color="outline-secondary", + size="sm", + className="w-100", + ) + ], + md=6, + ), + ], + className="mt-2 mb-3", + ) + ] + ) + + def _create_processing_options(self): + """Create tokenization and metadata options.""" + return html.Div( + [ + html.H5("Processing Options", className="mb-3"), + dbc.Row( + [ + dbc.Col( + [ + html.Label( + "Text Splitting Method:", className="form-label" + ), + dcc.Dropdown( + id="tokenization-method", + options=[ + { + "label": "Sentences (split on . ! ?)", + "value": "sentence", + }, + { + "label": "Paragraphs (split on double newline)", + "value": "paragraph", + }, + { + "label": "Lines (split on single newline)", + "value": "manual", + }, + { + "label": "Entire text as one document", + "value": "whole", + }, + ], + value=self.settings.DEFAULT_TOKENIZATION_METHOD, + className="mb-3", + ), + ], + md=6, + ), + dbc.Col( + [ + html.Label("Batch Size:", className="form-label"), + dcc.Dropdown( + id="batch-size", + options=[ + { + "label": "Small batches (4) - Lower memory", + "value": 4, + }, + { + "label": "Medium batches (8) - Balanced", + "value": 8, + }, + { + "label": "Large batches (16) - Faster", + "value": 16, + }, + ], + value=self.settings.MAX_BATCH_SIZE, + className="mb-3", + ), + ], + md=6, + ), + ] + ), + dbc.Row( + [ + dbc.Col( + [ + html.Label( + "Category (Optional):", className="form-label" + ), + dcc.Input( + id="text-category", + type="text", + placeholder="e.g., Notes, Articles, Ideas...", + value="Text Input", + className="form-control mb-3", + ), + ], + md=6, + ), + dbc.Col( + [ + html.Label( + "Subcategory (Optional):", className="form-label" + ), + dcc.Input( + id="text-subcategory", + type="text", + placeholder="e.g., Meeting Notes, Research...", + value="Generated", + className="form-control mb-3", + ), + ], + md=6, + ), + ] + ), + ] + ) + + def _create_generation_controls(self): + """Create embedding generation button and controls.""" + return html.Div( + [ + html.H5("Generate Embeddings", className="mb-3"), + dbc.Row( + [ + dbc.Col( + [ + dbc.Button( + [ + html.I(className="fas fa-magic me-2"), + "Generate Embeddings", + ], + id="generate-embeddings-btn", + color="primary", + size="lg", + disabled=True, + className="w-100", + ) + ], + md=12, + ), + ] + ), + html.Div( + [ + dbc.Alert( + [ + html.I(className="fas fa-info-circle me-2"), + "Enter some text above and select a model to enable embedding generation.", + ], + color="light", + className="mt-3", + id="generation-help", + ) + ] + ), + ] + ) + + def _create_progress_indicators(self): + """Create progress bars for model loading and embedding generation.""" + return html.Div( + [ + # Model loading progress + html.Div( + [ + html.H6("Model Loading Progress", className="mb-2"), + dbc.Progress( + id="model-loading-progress", + value=0, + striped=True, + animated=True, + className="mb-2", + ), + html.Small( + id="model-loading-status", + children="No model loading in progress", + className="text-muted", + ), + ], + id="model-loading-section", + style={"display": "none"}, + ), + html.Br(), + # Embedding generation progress + html.Div( + [ + html.H6("Embedding Generation Progress", className="mb-2"), + dbc.Progress( + id="embedding-progress", + value=0, + striped=True, + animated=True, + className="mb-2", + ), + html.Small( + id="embedding-status", + children="No embedding generation in progress", + className="text-muted", + ), + ], + id="embedding-progress-section", + style={"display": "none"}, + ), + ] + ) + + def _create_status_section(self): + """Create status alerts and results preview.""" + return html.Div( + [ + # Immediate status (from client-side) + dbc.Alert( + id="text-input-status-immediate", + children="Ready to generate embeddings", + color="light", + className="mb-3", + ), + # Server-side status + dbc.Alert( + id="text-input-status", + children="", + color="light", + className="mb-3", + style={"display": "none"}, + ), + # Results preview + html.Div(id="embedding-results-preview"), + ] + ) + + def _create_hidden_components(self): + """Create hidden components for data flow.""" + return html.Div( + [ + # Store for embeddings data from client-side + dcc.Store(id="embeddings-generated-trigger"), + # Store for tokenization preview + dcc.Store(id="tokenization-preview-data"), + ] + ) + + def _get_model_description(self, model_name): + """Get description for a specific model.""" + for model in self.settings.AVAILABLE_MODELS: + if model["name"] == model_name: + return html.Div( + [ + html.Strong( + f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}" + ), + html.Br(), + html.Span(model["description"]), + html.Br(), + html.Small( + f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}", + className="text-muted", + ), + ] + ) + + return html.Span("Model information not available", className="text-muted") diff --git a/src/embeddingbuddy/ui/layout.py b/src/embeddingbuddy/ui/layout.py index 71a0402..5c63676 100644 --- a/src/embeddingbuddy/ui/layout.py +++ b/src/embeddingbuddy/ui/layout.py @@ -20,6 +20,15 @@ class AppLayout: dbc.Col( [ html.H1("EmbeddingBuddy", className="text-center mb-4"), + # Load Transformers.js from CDN + html.Script( + """ + import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2'; + window.transformersPipeline = pipeline; + console.log('✅ Transformers.js pipeline loaded globally'); + """, + type="module" + ), ], width=12, ) diff --git a/tests/test_client_embeddings.py b/tests/test_client_embeddings.py new file mode 100644 index 0000000..41380d7 --- /dev/null +++ b/tests/test_client_embeddings.py @@ -0,0 +1,147 @@ +"""Tests for client-side embedding processing functionality.""" + +import pytest +import numpy as np + +from src.embeddingbuddy.data.processor import DataProcessor +from src.embeddingbuddy.models.schemas import ProcessedData + + +class TestClientEmbeddingsProcessing: + """Test client-side embeddings processing functionality.""" + + def setup_method(self): + """Set up test instances.""" + self.processor = DataProcessor() + + def test_process_client_embeddings_success(self): + """Test successful processing of client-side embeddings data.""" + client_data = { + "documents": [ + { + "id": "text_input_0", + "text": "First test document", + "category": "Text Input", + "subcategory": "Generated", + "tags": [] + }, + { + "id": "text_input_1", + "text": "Second test document", + "category": "Text Input", + "subcategory": "Generated", + "tags": [] + } + ], + "embeddings": [ + [0.1, 0.2, 0.3, 0.4], + [0.5, 0.6, 0.7, 0.8] + ] + } + + result = self.processor.process_client_embeddings(client_data) + + assert isinstance(result, ProcessedData) + assert result.error is None + assert len(result.documents) == 2 + assert result.embeddings.shape == (2, 4) + + # Check document content + assert result.documents[0].text == "First test document" + assert result.documents[1].text == "Second test document" + + # Check embeddings match + np.testing.assert_array_equal(result.embeddings[0], [0.1, 0.2, 0.3, 0.4]) + np.testing.assert_array_equal(result.embeddings[1], [0.5, 0.6, 0.7, 0.8]) + + def test_process_client_embeddings_with_error(self): + """Test processing client data with error.""" + client_data = {"error": "Transformers.js not loaded"} + + result = self.processor.process_client_embeddings(client_data) + + assert isinstance(result, ProcessedData) + assert result.error == "Transformers.js not loaded" + assert len(result.documents) == 0 + assert result.embeddings.size == 0 + + def test_process_client_embeddings_missing_data(self): + """Test processing with missing documents or embeddings.""" + client_data = {"documents": []} + + result = self.processor.process_client_embeddings(client_data) + + assert isinstance(result, ProcessedData) + assert "No documents or embeddings in client data" in result.error + assert len(result.documents) == 0 + + def test_process_client_embeddings_mismatch_count(self): + """Test processing with mismatched document and embedding counts.""" + client_data = { + "documents": [ + {"id": "test", "text": "Test document", "category": "Test", "subcategory": "Test", "tags": []} + ], + "embeddings": [ + [0.1, 0.2, 0.3, 0.4], + [0.5, 0.6, 0.7, 0.8] + ] + } + + result = self.processor.process_client_embeddings(client_data) + + assert isinstance(result, ProcessedData) + assert "Mismatch between number of documents and embeddings" in result.error + assert len(result.documents) == 0 + + def test_process_client_embeddings_invalid_document(self): + """Test processing with invalid document data.""" + client_data = { + "documents": [ + {"text": ""}, # Empty text should be skipped + {"id": "test2", "text": "Valid document", "category": "Test", "subcategory": "Test", "tags": []} + ], + "embeddings": [ + [0.1, 0.2, 0.3, 0.4], + [0.5, 0.6, 0.7, 0.8] + ] + } + + result = self.processor.process_client_embeddings(client_data) + + assert isinstance(result, ProcessedData) + assert result.error is None + assert len(result.documents) == 1 # Only valid document should be processed + assert result.documents[0].text == "Valid document" + + def test_process_client_embeddings_auto_id_generation(self): + """Test automatic ID generation for documents without IDs.""" + client_data = { + "documents": [ + {"text": "Document without ID", "category": "Test", "subcategory": "Test", "tags": []} + ], + "embeddings": [ + [0.1, 0.2, 0.3, 0.4] + ] + } + + result = self.processor.process_client_embeddings(client_data) + + assert isinstance(result, ProcessedData) + assert result.error is None + assert len(result.documents) == 1 + assert result.documents[0].id.startswith("text_input_") + + def test_process_client_embeddings_invalid_embedding_format(self): + """Test processing with invalid embedding format.""" + client_data = { + "documents": [ + {"id": "test", "text": "Test document", "category": "Test", "subcategory": "Test", "tags": []} + ], + "embeddings": 0.5 # Scalar instead of array + } + + result = self.processor.process_client_embeddings(client_data) + + assert isinstance(result, ProcessedData) + assert result.error is not None # Should have some error + assert len(result.documents) == 0 \ No newline at end of file