3 Commits

Author SHA1 Message Date
bced5e07ce minor formatting
All checks were successful
Security Scan / security (pull_request) Successful in 43s
Security Scan / dependency-check (pull_request) Successful in 45s
Test Suite / lint (pull_request) Successful in 30s
Test Suite / test (3.11) (pull_request) Successful in 1m29s
Test Suite / build (pull_request) Successful in 39s
2025-09-06 07:23:26 -07:00
cdaaffd735 add in browser embedding generation
Some checks failed
Security Scan / security (pull_request) Successful in 44s
Security Scan / dependency-check (pull_request) Successful in 49s
Test Suite / lint (pull_request) Failing after 40s
Test Suite / test (3.11) (pull_request) Successful in 1m39s
Test Suite / build (pull_request) Has been skipped
2025-09-06 07:16:30 -07:00
14abc446b7 Merge pull request 'add-os-load' (#3) from add-os-load into main
Some checks failed
Test Suite / lint (push) Successful in 25s
Test Suite / test (3.11) (push) Successful in 1m29s
Release / test (push) Successful in 1m2s
Release / build-and-release (push) Failing after 32s
Test Suite / build (push) Successful in 45s
Security Scan / security (push) Successful in 46s
Security Scan / dependency-check (push) Successful in 50s
This adds support for loading data from OpenSearch.

Reviewed-on: #3
2025-08-14 19:07:24 -07:00
14 changed files with 1598 additions and 4 deletions

View File

@@ -3,7 +3,8 @@
"allow": [
"Bash(mkdir:*)",
"Bash(uv run:*)",
"Bash(uv add:*)"
"Bash(uv add:*)",
"Bash(uv sync:*)"
],
"deny": [],
"ask": [],

5
.gitignore vendored
View File

@@ -81,4 +81,7 @@ safety-report.json
pip-audit-report.json
# Temporary files
*.tmp
*.tmp
examples/extra

278
assets/embeddings.js Normal file
View File

@@ -0,0 +1,278 @@
// Text input embedding generation using Transformers.js
// This module runs entirely in the browser for privacy and performance
// Global flag to track initialization
window.transformersLoading = false;
window.transformersLoaded = false;
/**
 * Browser-side embedding generator built on a Transformers.js
 * feature-extraction pipeline.
 *
 * The pipeline factory is read from `window.transformers.pipeline`, falling
 * back to `window.transformersPipeline` (polled for up to ~5 seconds). Loaded
 * extractors are cached per model name in `modelCache`.
 */
class TransformersEmbedder {
  constructor() {
    this.extractor = null;
    this.currentModel = null;
    this.modelCache = new Map();
    this.isLoading = false;
  }

  /**
   * Load (or fetch from cache) the feature-extraction pipeline for a model.
   *
   * Never throws: failures are reported through the returned object.
   *
   * @param {string} [modelName='Xenova/all-MiniLM-L6-v2'] - Model identifier.
   * @returns {Promise<{success: boolean, model?: string, error?: string}>}
   */
  async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') {
    if (this.modelCache.has(modelName)) {
      this.extractor = this.modelCache.get(modelName);
      this.currentModel = modelName;
      return { success: true, model: modelName };
    }

    // A second request while a load is running is rejected rather than queued.
    if (this.isLoading) {
      return { success: false, error: 'Model loading already in progress' };
    }

    this.isLoading = true;
    try {
      // Use globally loaded Transformers.js pipeline
      if (!window.transformers) {
        // Wait up to 5 seconds (50 x 100 ms) for the pipeline to appear.
        let attempts = 0;
        while (!window.transformersPipeline && attempts < 50) {
          await new Promise((resolve) => setTimeout(resolve, 100));
          attempts++;
        }
        if (!window.transformersPipeline) {
          throw new Error('Transformers.js pipeline not available. Please refresh the page.');
        }
        window.transformers = { pipeline: window.transformersPipeline };
        window.transformersLoaded = true;
        console.log('✅ Using globally loaded Transformers.js pipeline');
      }

      // Show loading progress to user
      if (window.updateModelLoadingProgress) {
        window.updateModelLoadingProgress(0, `Loading ${modelName}...`);
      }

      const extractor = await window.transformers.pipeline('feature-extraction', modelName, {
        progress_callback: (data) => {
          if (window.updateModelLoadingProgress && data.progress !== undefined) {
            const progress = Math.round(data.progress);
            window.updateModelLoadingProgress(progress, data.status || 'Loading...');
          }
        },
      });

      this.extractor = extractor;
      this.modelCache.set(modelName, extractor);
      this.currentModel = modelName;

      if (window.updateModelLoadingProgress) {
        window.updateModelLoadingProgress(100, 'Model loaded successfully');
      }
      return { success: true, model: modelName };
    } catch (error) {
      console.error('Model initialization error:', error);
      return { success: false, error: error.message };
    } finally {
      // Always release the loading flag, whichever path was taken above.
      this.isLoading = false;
    }
  }

  /**
   * Embed every text with the currently initialized extractor.
   *
   * @param {string[]} texts - Non-empty list of non-blank strings.
   * @param {object} [options] - Extractor options (defaults: mean pooling,
   *   normalized). `options.batchSize` (positive integer, default 8) controls
   *   how many texts are embedded concurrently and is NOT forwarded to the
   *   extractor.
   * @returns {Promise<number[][]>} One embedding per input text, in order.
   * @throws {Error} If no model is initialized, `texts` is empty, any text is
   *   blank or not a string, or the extractor returns a result without `data`.
   */
  async generateEmbeddings(texts, options = {}) {
    if (!this.extractor) {
      throw new Error('Model not initialized. Call initializeModel() first.');
    }
    if (!texts || texts.length === 0) {
      throw new Error('No texts provided for embedding generation.');
    }

    const { batchSize: requestedBatchSize, ...extractorOverrides } = options ?? {};
    // A zero, negative or fractional batch size would stall or corrupt the
    // batching loop below, so anything but a positive integer uses the default.
    const batchSize =
      Number.isInteger(requestedBatchSize) && requestedBatchSize > 0 ? requestedBatchSize : 8;
    const extractorOptions = {
      pooling: 'mean',
      normalize: true,
      ...extractorOverrides,
    };

    const embeddings = [];
    try {
      // Validate everything up front so no inference work is wasted on a
      // request that is going to fail anyway.
      for (const text of texts) {
        if (typeof text !== 'string' || text.trim().length === 0) {
          throw new Error('Empty text found in batch');
        }
      }

      // Process in batches to avoid memory issues
      for (let i = 0; i < texts.length; i += batchSize) {
        const batch = texts.slice(i, i + batchSize);
        const batchResults = await Promise.all(
          batch.map((text) => this.extractor(text.trim(), extractorOptions)),
        );

        // Convert tensor output to arrays
        batchResults.forEach((result, idx) => {
          if (result && result.data) {
            embeddings.push(Array.from(result.data));
          } else {
            throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
          }
        });

        // Update progress
        const processed = i + batch.length;
        const progress = Math.min(100, (processed / texts.length) * 100);
        if (window.updateEmbeddingProgress) {
          window.updateEmbeddingProgress(progress, `Processing ${processed}/${texts.length} texts`);
        }
      }

      if (window.updateEmbeddingProgress) {
        window.updateEmbeddingProgress(100, `Generated ${embeddings.length} embeddings successfully`);
      }
      return embeddings;
    } catch (error) {
      console.error('Embedding generation error:', error);
      throw error;
    }
  }
}
// Global instance
window.transformersEmbedder = new TransformersEmbedder();
console.log('📦 TransformersEmbedder instance created');
// Global progress update functions
/**
 * Reflect model-loading progress in the page.
 *
 * Sets the width and `aria-valuenow` of the `model-loading-progress` element
 * and the text of the `model-loading-status` element; either element may be
 * absent, in which case it is skipped.
 *
 * @param {number} progress - Percentage written as the bar width.
 * @param {string} status - Message shown next to the bar.
 */
window.updateModelLoadingProgress = (progress, status) => {
  const [barElement, labelElement] = ['model-loading-progress', 'model-loading-status'].map(
    (elementId) => document.getElementById(elementId),
  );

  if (barElement) {
    barElement.style.width = progress + '%';
    barElement.setAttribute('aria-valuenow', progress);
  }

  if (labelElement) {
    labelElement.textContent = status;
  }
};
/**
 * Reflect embedding-generation progress in the page.
 *
 * Sets the width and `aria-valuenow` of the `embedding-progress` element and
 * the text of the `embedding-status` element; either element may be absent,
 * in which case it is skipped.
 *
 * @param {number} progress - Percentage written as the bar width.
 * @param {string} status - Message shown next to the bar.
 */
window.updateEmbeddingProgress = (progress, status) => {
  const [barElement, labelElement] = ['embedding-progress', 'embedding-status'].map(
    (elementId) => document.getElementById(elementId),
  );

  if (barElement) {
    barElement.style.width = progress + '%';
    barElement.setAttribute('aria-valuenow', progress);
  }

  if (labelElement) {
    labelElement.textContent = status;
  }
};
// Dash clientside callback functions
window.dash_clientside = window.dash_clientside || {};
console.log('🔧 Setting up window.dash_clientside.transformers');
window.dash_clientside.transformers = {
  /**
   * Dash clientside callback: split the text into chunks, embed each chunk
   * with `window.transformersEmbedder`, and return the four callback outputs
   * `[data, statusMessage, statusColor, buttonDisabled]`.
   *
   * Returns `window.dash_clientside.no_update` when there was no click or the
   * text is blank. Errors are reported through the outputs, not thrown.
   */
  async generateEmbeddings(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
    console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });

    const hasInput = Boolean(nClicks) && Boolean(textContent) && textContent.trim().length !== 0;
    if (!hasInput) {
      console.log('⚠️ Early return - missing required parameters');
      return window.dash_clientside.no_update;
    }

    // Every failure shares the same "danger" colour and re-enabled button.
    const dangerResult = (detail, banner) => [{ error: detail }, banner, "danger", false];

    try {
      // Initialize model if needed
      const initResult = await window.transformersEmbedder.initializeModel(modelName);
      if (!initResult.success) {
        return dangerResult(initResult.error, `❌ Model loading error: ${initResult.error}`);
      }

      // Tokenize text based on method
      const trimmedText = textContent.trim();
      const nonEmptyPieces = (separator) =>
        trimmedText
          .split(separator)
          .map((piece) => piece.trim())
          .filter((piece) => piece.length > 0);

      let textChunks;
      if (tokenizationMethod === 'sentence') {
        // Simple sentence splitting - can be enhanced with proper NLP
        textChunks = nonEmptyPieces(/[.!?]+/);
      } else if (tokenizationMethod === 'paragraph') {
        textChunks = nonEmptyPieces(/\n\s*\n/);
      } else if (tokenizationMethod === 'manual') {
        textChunks = nonEmptyPieces('\n');
      } else {
        textChunks = [trimmedText];
      }

      if (textChunks.length === 0) {
        return dangerResult(
          'No valid text chunks found after tokenization',
          '❌ Error: No valid text chunks found after tokenization',
        );
      }

      // Generate embeddings
      const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);
      const countsMatch = Boolean(embeddings) && embeddings.length === textChunks.length;
      if (!countsMatch) {
        return dangerResult(
          'Embedding generation failed - mismatch in text chunks and embeddings',
          '❌ Error: Embedding generation failed',
        );
      }

      // Create documents structure
      const documents = [];
      textChunks.forEach((text, i) => {
        documents.push({
          id: `text_input_${Date.now()}_${i}`,
          text,
          embedding: embeddings[i],
          category: category || "Text Input",
          subcategory: subcategory || "Generated",
          tags: [],
        });
      });

      const payload = { documents, embeddings };
      const banner = `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`;
      return [payload, banner, "success", false];
    } catch (error) {
      console.error('Client-side embedding error:', error);
      return dangerResult(error.message, `❌ Error: ${error.message}`);
    }
  },
};
console.log('✅ Transformers.js client-side setup complete');
console.log('Available:', {
transformersEmbedder: !!window.transformersEmbedder,
dashClientside: !!window.dash_clientside,
transformersModule: !!window.dash_clientside?.transformers,
generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings
});

9
assets/package.json Normal file
View File

@@ -0,0 +1,9 @@
{
"name": "embeddingbuddy-assets",
"version": "1.0.0",
"description": "JavaScript dependencies for EmbeddingBuddy text input functionality",
"dependencies": {
"@huggingface/transformers": "^3.0.0"
},
"type": "module"
}

106
assets/sample-txt.md Normal file
View File

@@ -0,0 +1,106 @@
The sun peeked through the clouds after a drizzly morning.
A gentle breeze rustled the leaves as we walked along the shoreline.
Heavy rains caused flooding in several low-lying neighborhoods.
It was so hot that even the birds sought shade under the palm trees.
By midnight, the temperature had dropped below freezing.
Thunderstorms lit up the sky with flashes of lightning.
A thick fog settled over the city streets at dawn.
The air smelled of ozone after the sudden hailstorm.
I watched the snowflakes drift silently onto the ground.
A double rainbow appeared after the rain shower.
The humidity soared to uncomfortable levels by midday.
Dust devils formed in the dry desert plains.
The barometer readings indicated an approaching front.
A sudden gust of wind knocked over the garden chairs.
Light drizzle turned into a torrential downpour within minutes.
The new smartphone features a foldable display and 5G connectivity.
In the world of AI, transformers have revolutionized natural language processing.
Quantum computing promises to solve problems beyond classical computers' reach.
Blockchain technology is being explored for secure voting systems.
Virtual reality headsets are becoming more affordable and accessible.
The rise of electric vehicles is reshaping the automotive industry.
Cloud computing allows businesses to scale resources dynamically.
Machine learning algorithms can now predict stock market trends with surprising accuracy.
Augmented reality applications are transforming retail experiences.
The Internet of Things connects everyday devices to the web for smarter living.
Cybersecurity threats are evolving, requiring constant vigilance.
3D printing is enabling rapid prototyping and custom manufacturing.
Edge computing reduces latency by processing data closer to the source.
Biometric authentication methods are enhancing security in devices.
Wearable technology is tracking health metrics in real-time.
Artificial intelligence is being used to create realistic deepfakes.
Preheat the oven to 375°F before you start mixing the batter.
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
A pinch of saffron adds a beautiful color and aroma to traditional paella.
If the soup is too salty, add a peeled potato to absorb excess sodium.
Let the bread dough rise for at least an hour in a warm, draft-free spot.
Marinate the chicken overnight in a blend of citrus and spices.
Use a cast-iron skillet to sear the steak on high heat.
Whisk the egg whites until they form stiff peaks.
Fold in the chocolate chips gently to keep the batter airy.
Brush the pastry with an egg wash for a golden finish.
Slow-roast the pork shoulder until it falls off the bone.
Garnish the salad with toasted nuts and fresh herbs.
Deglaze the pan with white wine for a rich sauce.
Simmer the curry paste until the aroma intensifies.
Let the risotto rest before serving to thicken slightly.
He dribbled past two defenders and sank a three-pointer at the buzzer.
The marathon runner kept a steady pace despite the sweltering heat.
Their home team clinched the championship with a last-minute goal.
NASCAR fans cheered as the cars roared around the oval track.
She landed a perfect triple axel at the figure skating championship.
The cyclist pedaled up the steep hill in record time.
He pitched a no-hitter during the high school baseball game.
The quarterback threw a touchdown pass under heavy pressure.
They scored a hat-trick in the hockey final.
The boxer delivered a swift uppercut in the final round.
Surfers caught massive waves at dawn on the Pacific coast.
Fans erupted when the underdog scored the winning goal.
The swimmer broke the national record in the 200m freestyle.
The gymnast executed a flawless routine on the balance beam.
The rugby team celebrated their victory with a traditional haka.
The stock market rallied after positive earnings reports.
Investors are closely watching interest rate changes by the Federal Reserve.
Cryptocurrency prices have been extremely volatile this year.
Diversification is key to managing investment risk effectively.
Inflation rates have reached a 40-year high, impacting consumer spending.
Many companies are adopting ESG criteria to attract socially conscious investors.
The bond market is reacting to geopolitical tensions and supply chain disruptions.
Venture capital funding for startups has surged in the tech sector.
Exchange-traded funds (ETFs) offer a way to invest in diversified portfolios.
The global economy is recovering from the pandemic, but challenges remain.
Central banks are exploring digital currencies to modernize payment systems.
Retail investors are increasingly participating in the stock market through apps.
Hedge funds are using complex algorithms to gain an edge in trading.
Real estate prices have skyrocketed in urban areas due to low inventory.
The startup raised $10 million in its Series A funding round.
The symphony orchestra played a hauntingly beautiful melody.
She strummed her guitar softly, filling the room with a warm sound.
The DJ mixed tracks seamlessly, keeping the crowd dancing all night.
His voice soared during the high notes of the ballad.
The band played an acoustic set in the intimate coffee shop.
Jazz musicians often improvise solos based on the chord changes.
The opera singer hit the high C with perfect pitch.
The choir harmonized beautifully, filling the church with sound.
He composed a symphony that was performed at the concert hall.
The singer-songwriter wrote heartfelt lyrics about love and loss.
The rock band headlined the festival, drawing a massive crowd.
Hip-hop artists use rhythm and rhyme to tell powerful stories.
The violinist played a virtuosic solo that left the audience in awe.
Folk music often reflects the culture and traditions of a community.
The gospel choir lifted spirits with their uplifting performance.
The fall of the Berlin Wall in 1989 marked the end of the Cold War.
Ancient Egypt's pyramids are a testament to their architectural prowess.
Europe's Renaissance period sparked a revival in art and science.
The signing of the Declaration of Independence in 1776 established the United States.
The Industrial Revolution transformed economies and societies worldwide.
Rome was the center of a vast empire that influenced law and governance.
The discovery of the New World by Christopher Columbus in 1492 changed global trade.
The French Revolution in 1789 led to significant political and social change.
World War II was a global conflict that reshaped international relations.
The fall of the Roman Empire in 476 AD marked the beginning of the Middle Ages.
The invention of the printing press revolutionized the spread of knowledge.
The Cold War was characterized by political tension between the U.S. and the Soviet Union.
The ancient Silk Road connected East and West through trade routes.
The signing of the Magna Carta in 1215 established principles of due process.
Exploration during the Age of Discovery expanded European empires across the globe.

View File

@@ -0,0 +1,172 @@
// Simple script to load Transformers.js from a CDN and initialize embedding functionality
// The library is fetched lazily with a dynamic import() the first time embeddings are requested
console.log('🔧 Transformers.js loader starting...');
// Global state
window.transformersLibraryLoaded = false;
window.transformersLibraryLoading = false;
// Function to dynamically load a script
/**
 * Append a `<script type="module">` tag for `src` to the document head.
 *
 * @param {string} src - URL of the script to load.
 * @returns {Promise<void>} Resolves when the script's `load` handler fires;
 *   rejects with an Error naming `src` when its `error` handler fires.
 */
function loadScript(src) {
  return new Promise((resolve, reject) => {
    const tag = document.createElement('script');
    const onSuccess = () => resolve();
    const onFailure = () => reject(new Error(`Failed to load script: ${src}`));

    Object.assign(tag, {
      src,
      type: 'module',
      onload: onSuccess,
      onerror: onFailure,
    });

    document.head.appendChild(tag);
  });
}
// Function to initialize Transformers.js
/**
 * Make sure the Transformers.js library has been imported exactly once.
 *
 * Progress is tracked in `window.transformersLibraryLoaded` and
 * `window.transformersLibraryLoading`; the imported module namespace is
 * stored on `window.transformersLibrary`. If another call is already
 * loading, this one polls every 100 ms until that load finishes.
 *
 * @returns {Promise<boolean>} `true` when the library is available,
 *   `false` when loading failed.
 */
async function initializeTransformers() {
  if (window.transformersLibraryLoaded) {
    console.log('✅ Transformers.js already loaded');
    return true;
  }

  if (window.transformersLibraryLoading) {
    console.log('⏳ Transformers.js already loading, waiting...');
    // Wait for loading to complete
    const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));
    for (;;) {
      if (!window.transformersLibraryLoading) {
        break;
      }
      await pause(100);
    }
    return window.transformersLibraryLoaded;
  }

  window.transformersLibraryLoading = true;
  let succeeded;
  try {
    console.log('📦 Loading Transformers.js from CDN...');
    // Use dynamic import since this is more reliable with ES modules
    const library = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0');
    window.transformersLibrary = library;
    window.transformersLibraryLoaded = true;
    console.log('✅ Transformers.js loaded successfully');
    succeeded = true;
  } catch (error) {
    console.error('❌ Failed to load Transformers.js:', error);
    succeeded = false;
  } finally {
    window.transformersLibraryLoading = false;
  }
  return succeeded;
}
// Simple embeddings class
/**
 * Minimal embedding helper: lazily loads Transformers.js, caches one
 * feature-extraction pipeline per model name, and embeds texts one by one.
 */
class SimpleEmbedder {
  constructor() {
    // Pipeline used by the most recent generateEmbeddings() call.
    this.pipeline = null;
    // Fully loaded pipelines keyed by model name.
    this.modelCache = new Map();
    // In-flight pipeline loads keyed by model name, so concurrent requests
    // for the same model share a single download.
    this.pendingLoads = new Map();
  }

  /**
   * Return the pipeline for `modelName`, creating it at most once even when
   * several calls ask for the same uncached model concurrently.
   *
   * @param {string} modelName - Model identifier passed to `pipeline()`.
   * @returns {Promise<Function>} The feature-extraction pipeline.
   */
  async getPipeline(modelName) {
    if (this.modelCache.has(modelName)) {
      return this.modelCache.get(modelName);
    }

    let pending = this.pendingLoads.get(modelName);
    if (!pending) {
      console.log('🏗️ Creating pipeline for', modelName);
      const { pipeline } = window.transformersLibrary;
      // Promise.resolve().then(...) also turns a synchronous throw from
      // pipeline() into a rejection handled by the try/finally below.
      pending = Promise.resolve().then(() => pipeline('feature-extraction', modelName));
      this.pendingLoads.set(modelName, pending);
    }

    try {
      const extractor = await pending;
      this.modelCache.set(modelName, extractor);
      return extractor;
    } finally {
      // Drop the in-flight entry (success or failure) so a failed load can
      // be retried by a later call.
      if (this.pendingLoads.get(modelName) === pending) {
        this.pendingLoads.delete(modelName);
      }
    }
  }

  /**
   * Embed each text with mean pooling and normalization.
   *
   * @param {string[]} texts - Texts to embed, processed sequentially.
   * @param {string} [modelName='Xenova/all-MiniLM-L6-v2'] - Model to use.
   * @returns {Promise<number[][]>} One embedding per text, in input order.
   * @throws {Error} If Transformers.js cannot be loaded or inference fails.
   */
  async generateEmbeddings(texts, modelName = 'Xenova/all-MiniLM-L6-v2') {
    console.log('🔄 Generating embeddings for', texts.length, 'texts with model', modelName);

    // Ensure Transformers.js is loaded
    if (!window.transformersLibraryLoaded) {
      const loaded = await initializeTransformers();
      if (!loaded) {
        throw new Error('Failed to load Transformers.js');
      }
    }

    // Keep the extractor in a local: `this.pipeline` can be reassigned by a
    // concurrent call for a different model while this call is awaiting.
    const extractor = await this.getPipeline(modelName);
    this.pipeline = extractor;

    // Generate embeddings
    const embeddings = [];
    for (let i = 0; i < texts.length; i++) {
      console.log(`Processing text ${i + 1}/${texts.length}...`);
      const result = await extractor(texts[i], { pooling: 'mean', normalize: true });
      embeddings.push(Array.from(result.data));
    }

    console.log('✅ Generated', embeddings.length, 'embeddings');
    return embeddings;
  }
}
// Create global instance
window.simpleEmbedder = new SimpleEmbedder();
// Set up Dash clientside callbacks
window.dash_clientside = window.dash_clientside || {};
window.dash_clientside.transformers = {
  /**
   * Dash clientside callback: split the text into chunks, embed them with
   * `window.simpleEmbedder`, and return the four callback outputs
   * `[data, statusMessage, statusColor, buttonDisabled]`.
   *
   * Returns `window.dash_clientside.no_update` when there was no click or the
   * text is blank. Errors are reported through the outputs, not thrown.
   */
  async generateEmbeddings(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
    console.log('🚀 Client-side generateEmbeddings called');

    const hasInput = Boolean(nClicks) && Boolean(textContent) && textContent.trim().length !== 0;
    if (!hasInput) {
      console.log('⚠️ Missing required parameters');
      return window.dash_clientside.no_update;
    }

    try {
      // Tokenize text
      const trimmedText = textContent.trim();
      const nonEmptyPieces = (separator) =>
        trimmedText
          .split(separator)
          .map((piece) => piece.trim())
          .filter((piece) => piece.length > 0);

      let textChunks;
      if (tokenizationMethod === 'sentence') {
        textChunks = nonEmptyPieces(/[.!?]+/);
      } else if (tokenizationMethod === 'paragraph') {
        textChunks = nonEmptyPieces(/\n\s*\n/);
      } else if (tokenizationMethod === 'manual') {
        textChunks = nonEmptyPieces('\n');
      } else {
        textChunks = [trimmedText];
      }

      if (textChunks.length === 0) {
        throw new Error('No valid text chunks after tokenization');
      }

      // Generate embeddings
      const embeddings = await window.simpleEmbedder.generateEmbeddings(textChunks, modelName);

      // Create documents
      const documents = [];
      textChunks.forEach((text, i) => {
        documents.push({
          id: `text_input_${Date.now()}_${i}`,
          text,
          embedding: embeddings[i],
          category: category || "Text Input",
          subcategory: subcategory || "Generated",
          tags: [],
        });
      });

      const payload = { documents, embeddings };
      const banner = `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`;
      return [payload, banner, "success", false];
    } catch (error) {
      console.error('❌ Error generating embeddings:', error);
      const failure = { error: error.message };
      return [failure, `❌ Error: ${error.message}`, "danger", false];
    }
  },
};
console.log('✅ Simple Transformers.js setup complete');
console.log('Available functions:', Object.keys(window.dash_clientside.transformers));

View File

@@ -8,7 +8,15 @@ from .ui.callbacks.interactions import InteractionCallbacks
def create_app():
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
import os
# Get the project root directory (two levels above the directory containing this file)
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
assets_path = os.path.join(project_root, "assets")
app = dash.Dash(
__name__, external_stylesheets=[dbc.themes.BOOTSTRAP], assets_folder=assets_path
)
# Allow callbacks to components that are dynamically created in tabs
app.config.suppress_callback_exceptions = True
@@ -20,9 +28,78 @@ def create_app():
VisualizationCallbacks()
InteractionCallbacks()
# Register client-side callback for embedding generation
_register_client_side_callbacks(app)
return app
def _register_client_side_callbacks(app):
"""Register client-side callbacks for browser-based processing.

Registers one clientside callback on ``app``, triggered by the
"generate-embeddings-btn" click. The inline JavaScript delegates to
``window.dash_clientside.transformers.generateEmbeddings`` when that
function exists and otherwise returns an error result for the four outputs.
"""
from dash import Input, Output, State
# Client-side callback for embedding generation
app.clientside_callback(
# The JS function's parameters follow the Input/State order declared below.
# Note that ``batchSize`` is received but is not forwarded to
# generateEmbeddings().
"""
function(nClicks, textContent, modelName, tokenizationMethod, batchSize, category, subcategory) {
if (!nClicks || !textContent || !textContent.trim()) {
return window.dash_clientside.no_update;
}
console.log('🔍 Checking for Transformers.js...');
console.log('window.dash_clientside:', typeof window.dash_clientside);
console.log('window.dash_clientside.transformers:', typeof window.dash_clientside?.transformers);
console.log('generateEmbeddings function:', typeof window.dash_clientside?.transformers?.generateEmbeddings);
if (typeof window.dash_clientside !== 'undefined' &&
typeof window.dash_clientside.transformers !== 'undefined' &&
typeof window.dash_clientside.transformers.generateEmbeddings === 'function') {
console.log('✅ Calling Transformers.js generateEmbeddings...');
return window.dash_clientside.transformers.generateEmbeddings(
nClicks, textContent, modelName, tokenizationMethod, category, subcategory
);
}
// More detailed error information
let errorMsg = '❌ Transformers.js not available. ';
if (typeof window.dash_clientside === 'undefined') {
errorMsg += 'dash_clientside not found.';
} else if (typeof window.dash_clientside.transformers === 'undefined') {
errorMsg += 'transformers module not found.';
} else if (typeof window.dash_clientside.transformers.generateEmbeddings !== 'function') {
errorMsg += 'generateEmbeddings function not found.';
}
console.error(errorMsg);
return [
{ error: 'Transformers.js not loaded. Please refresh the page and try again.' },
errorMsg + ' Please refresh the page.',
'danger',
false
];
}
""",
# Four outputs, matching the four-element array the JavaScript returns:
# result data, status message, status colour, button disabled flag.
[
Output("embeddings-generated-trigger", "data"),
Output("text-input-status-immediate", "children"),
Output("text-input-status-immediate", "color"),
Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
],
# The button click is the only trigger.
[Input("generate-embeddings-btn", "n_clicks")],
# State values are read at click time, in the JS parameter order.
[
State("text-input-area", "value"),
State("model-selection", "value"),
State("tokenization-method", "value"),
State("batch-size", "value"),
State("text-category", "value"),
State("text-subcategory", "value"),
],
prevent_initial_call=True,
)
def run_app(app=None, debug=None, host=None, port=None):
if app is None:
app = create_app()

View File

@@ -79,6 +79,71 @@ class AppSettings:
OPENSEARCH_CONNECTION_TIMEOUT = 30
OPENSEARCH_VERIFY_CERTS = True
# Text Input / Transformers.js Configuration
DEFAULT_EMBEDDING_MODEL = "Xenova/all-mpnet-base-v2"
MAX_TEXT_LENGTH = 50000 # Characters (browser memory limits)
DEFAULT_TOKENIZATION_METHOD = "sentence"
MAX_BATCH_SIZE = 8 # Process in smaller batches for memory management
# Available Transformers.js compatible models
AVAILABLE_MODELS = [
{
"name": "Xenova/all-mpnet-base-v2",
"label": "All-MPNet-Base-v2 (Quality, 768d)",
"description": "Higher quality embeddings with better semantic understanding",
"dimensions": 768,
"size": "109 MB",
"context_length": 512,
"multilingual": False,
"default": True,
},
{
"name": "Xenova/all-MiniLM-L6-v2",
"label": "All-MiniLM-L6-v2 (Fast, 384d)",
"description": "Lightweight model, good for quick testing and general purpose",
"dimensions": 384,
"size": "23 MB",
"context_length": 512,
"multilingual": False,
"default": False,
},
{
"name": "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
"label": "Multilingual MiniLM (50+ languages)",
"description": "Support for multiple languages with good performance",
"dimensions": 384,
"size": "127 MB",
"context_length": 512,
"multilingual": True,
},
{
"name": "Xenova/bge-small-en-v1.5",
"label": "BGE Small English (High quality, 384d)",
"description": "Beijing Academy of AI model with excellent performance on retrieval tasks",
"dimensions": 384,
"size": "67 MB",
"context_length": 512,
"multilingual": False,
},
{
"name": "Xenova/gte-small",
"label": "GTE Small (General Text Embeddings, 384d)",
"description": "Alibaba's general text embedding model, balanced performance",
"dimensions": 384,
"size": "67 MB",
"context_length": 512,
"multilingual": False,
},
]
# Browser compatibility requirements
SUPPORTED_BROWSERS = {
"chrome": ">=88",
"firefox": ">=92",
"safari": ">=15.4",
"edge": ">=88",
}
# Bootstrap Theme
EXTERNAL_STYLESHEETS = [
"https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"

View File

@@ -63,6 +63,90 @@ class DataProcessor:
except Exception as e:
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
def process_client_embeddings(self, embeddings_data: dict) -> ProcessedData:
    """Process embeddings data received from client-side JavaScript.

    Args:
        embeddings_data: Either ``{"error": message}`` or a dict with
            ``"documents"`` (list of dicts of ``Document`` fields) and
            ``"embeddings"`` (list of equal-length float lists), one
            embedding per document. The input is not modified.

    Returns:
        ProcessedData holding the valid documents and an embeddings array
        whose row ``k`` belongs to ``documents[k]``. On any problem the
        result has empty documents/embeddings and ``error`` set; this
        method does not raise.
    """

    def _failure(message: str) -> ProcessedData:
        # Uniform error result: no documents, empty array, message attached.
        return ProcessedData(documents=[], embeddings=np.array([]), error=message)

    try:
        if "error" in embeddings_data:
            return _failure(embeddings_data["error"])

        # Extract documents and embeddings from client data
        documents_data = embeddings_data.get("documents", [])
        embeddings_list = embeddings_data.get("embeddings", [])

        if not documents_data or not embeddings_list:
            return _failure("No documents or embeddings in client data")

        if len(documents_data) != len(embeddings_list):
            return _failure("Mismatch between number of documents and embeddings")

        # Convert embeddings to numpy array first
        try:
            embeddings = np.array(embeddings_list)
        except Exception as e:
            return _failure(f"Error processing embeddings: {str(e)}")
        if embeddings.ndim != 2:
            return _failure("Invalid embedding dimensions")

        # Convert to Document objects with embeddings, remembering which
        # embedding rows were kept so both outputs stay aligned.
        documents = []
        kept_rows = []
        for i, doc_data in enumerate(documents_data):
            try:
                # Work on a copy so the caller's dict is left untouched.
                doc_fields = dict(doc_data)

                # Ensure required fields are present
                if not doc_fields.get("id"):
                    doc_fields["id"] = f"text_input_{i}"

                text = doc_fields.get("text")
                if not isinstance(text, str) or not text.strip():
                    continue  # Skip documents without text

                # Add the embedding to the document fields
                doc_fields["embedding"] = embeddings[i].tolist()
                documents.append(Document(**doc_fields))
                kept_rows.append(i)
            except Exception:
                # Skip invalid documents but continue processing
                continue

        if not documents:
            return _failure("No valid documents found in client data")

        # Select exactly the rows of the documents that were kept. Slicing
        # the first len(documents) rows would pair documents with the wrong
        # embeddings whenever an earlier document was skipped.
        valid_embeddings = embeddings[kept_rows]
        return ProcessedData(documents=documents, embeddings=valid_embeddings)

    except Exception as e:
        return _failure(str(e))
def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
if not documents:
return np.array([])

View File

@@ -1,4 +1,4 @@
from dash import callback, Input, Output, State, no_update
from dash import callback, Input, Output, State, no_update, html
from ...data.processor import DataProcessor
from ...data.sources.opensearch import OpenSearchClient
from ...models.field_mapper import FieldMapper
@@ -87,6 +87,8 @@ class DataProcessingCallbacks:
if active_tab == "opensearch-tab":
return [datasource.create_opensearch_tab()]
elif active_tab == "text-input-tab":
return [datasource.create_text_input_tab()]
else:
return [datasource.create_file_upload_tab()]
@@ -97,6 +99,9 @@ class DataProcessingCallbacks:
# Register collapsible section callbacks
self._register_collapse_callbacks()
# Register text input callbacks
self._register_text_input_callbacks()
def _register_opensearch_callbacks(self, section_type, opensearch_client):
"""Register callbacks for a specific section (data or prompts)."""
@@ -463,6 +468,224 @@ class DataProcessingCallbacks:
return new_state, icon_class
return is_open, "fas fa-chevron-down me-2"
def _register_text_input_callbacks(self):
"""Register callbacks for text input functionality."""
# Text length counter callback
@callback(
Output("text-length-counter", "children"),
Input("text-input-area", "value"),
prevent_initial_call=False,
)
def update_text_length_counter(text_value):
if not text_value:
return "0"
return f"{len(text_value):,}"
# Generate button enable/disable callback
@callback(
[
Output("generate-embeddings-btn", "disabled"),
Output("generation-help", "children"),
Output("generation-help", "color"),
],
[
Input("text-input-area", "value"),
Input("model-selection", "value"),
],
prevent_initial_call=False,
)
def toggle_generate_button(text_value, model_name):
import dash_bootstrap_components as dbc
if not text_value or not text_value.strip():
return (
True,
dbc.Alert(
[
html.I(className="fas fa-info-circle me-2"),
"Enter some text above to enable embedding generation.",
],
color="light",
),
"light",
)
if not model_name:
return (
True,
dbc.Alert(
[
html.I(className="fas fa-exclamation-triangle me-2"),
"Select an embedding model to continue.",
],
color="warning",
),
"warning",
)
text_length = len(text_value.strip())
if text_length > AppSettings.MAX_TEXT_LENGTH:
return (
True,
dbc.Alert(
[
html.I(className="fas fa-exclamation-triangle me-2"),
f"Text too long ({text_length:,} characters). Maximum allowed: {AppSettings.MAX_TEXT_LENGTH:,} characters.",
],
color="danger",
),
"danger",
)
return (
False,
dbc.Alert(
[
html.I(className="fas fa-check-circle me-2"),
f"Ready to generate embeddings for {text_length:,} characters using {model_name}.",
],
color="success",
),
"success",
)
# Clear text callback
@callback(
Output("text-input-area", "value"),
[Input("clear-text-btn", "n_clicks"), Input("load-sample-btn", "n_clicks")],
prevent_initial_call=True,
)
def handle_text_input_actions(clear_clicks, load_clicks):
from dash import ctx
if not ctx.triggered:
return no_update
button_id = ctx.triggered[0]["prop_id"].split(".")[0]
if button_id == "clear-text-btn" and clear_clicks:
return ""
elif button_id == "load-sample-btn" and load_clicks:
return self._load_sample_text()
return no_update
# Model info callback
@callback(
    Output("model-info", "children"),
    Input("model-selection", "value"),
    prevent_initial_call=False,
)
def update_model_info(model_name):
    """Render the description panel for the selected embedding model.

    Args:
        model_name: Value of the model dropdown (may be ``None``).

    Returns:
        A muted prompt when nothing is selected, a details block for a model
        found in ``AppSettings().AVAILABLE_MODELS``, or a muted "not
        available" note when the name matches no configured model.
    """
    if not model_name:
        return html.Span("Please select a model", className="text-muted")

    from ...config.settings import AppSettings

    selected = next(
        (
            entry
            for entry in AppSettings().AVAILABLE_MODELS
            if entry["name"] == model_name
        ),
        None,
    )
    if selected is None:
        return html.Span("Model information not available", className="text-muted")

    multilingual = "Yes" if selected.get("multilingual", False) else "No"
    return html.Div(
        [
            html.Strong(
                f"Dimensions: {selected['dimensions']} | Context Length: {selected['context_length']}"
            ),
            html.Br(),
            html.Span(selected["description"]),
            html.Br(),
            html.Small(
                f"Multilingual: {multilingual} | Size: {selected['size']}",
                className="text-muted",
            ),
        ]
    )
# Process client-side embeddings result callback
@callback(
    [
        Output("processed-data", "data", allow_duplicate=True),
        Output("text-input-status", "children"),
        Output("text-input-status", "color"),
        Output("text-input-status", "style"),
        Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
    ],
    [Input("embeddings-generated-trigger", "data")],
    prevent_initial_call=True,
)
def process_embeddings_result(embeddings_data):
    """Process embeddings generated client-side.

    Args:
        embeddings_data: Payload written to the ``embeddings-generated-trigger``
            store; handed as-is to ``self.processor.process_client_embeddings``.

    Returns:
        A 5-tuple matching the declared outputs: the processed-data payload
        (documents + embeddings, or an ``{"error": ...}`` dict), the status
        message, the status color, the status style, and the generate
        button's ``disabled`` flag. All five are ``no_update`` when the
        store is empty.
    """
    if not embeddings_data:
        return (no_update,) * 5

    result = self.processor.process_client_embeddings(embeddings_data)
    visible = {"display": "block"}

    if result.error:
        payload = {"error": result.error}
        message = f"❌ Error: {result.error}"
        color = "danger"
    else:
        payload = {
            "documents": [self._document_to_dict(doc) for doc in result.documents],
            "embeddings": result.embeddings.tolist(),
        }
        message = f"✅ Generated embeddings for {len(result.documents)} text chunks"
        color = "success"

    # The button is re-enabled in both outcomes so the user can try again.
    return payload, message, color, visible, False
def _load_sample_text(self):
"""Load sample text from assets/sample-txt.md file."""
import os
try:
# Get the project root directory (four levels up from this file)
current_file = os.path.abspath(__file__)
project_root = os.path.dirname(
os.path.dirname(
os.path.dirname(os.path.dirname(os.path.dirname(current_file)))
)
)
sample_file_path = os.path.join(project_root, "assets", "sample-txt.md")
if os.path.exists(sample_file_path):
with open(sample_file_path, "r", encoding="utf-8") as file:
return file.read()
else:
# Fallback sample text if file doesn't exist
return """The sun peeked through the clouds after a drizzly morning.
A gentle breeze rustled the leaves as we walked along the shoreline.
Heavy rains caused flooding in several low-lying neighborhoods.
It was so hot that even the birds sought shade under the palm trees.
By midnight, the temperature had dropped below freezing.
The new smartphone features a foldable display and 5G connectivity.
In the world of AI, transformers have revolutionized natural language processing.
Quantum computing promises to solve problems beyond classical computers' reach.
Blockchain technology is being explored for secure voting systems.
Virtual reality headsets are becoming more affordable and accessible.
Preheat the oven to 375°F before you start mixing the batter.
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
A pinch of saffron adds a beautiful color and aroma to traditional paella.
If the soup is too salty, add a peeled potato to absorb excess sodium.
Let the bread dough rise for at least an hour in a warm, draft-free spot."""
except Exception:
# Return a simple fallback if there's any error
return "This is sample text for testing embedding generation. You can replace this with your own text."
@staticmethod
def _document_to_dict(doc):
return {

View File

@@ -1,11 +1,13 @@
from dash import dcc, html
import dash_bootstrap_components as dbc
from .upload import UploadComponent
from .textinput import TextInputComponent
class DataSourceComponent:
def __init__(self):
    """Create the sub-components this data-source UI delegates to."""
    # One component per input mode: file upload and in-browser text input.
    self.upload_component = UploadComponent()
    self.text_input_component = TextInputComponent()
def create_tabbed_interface(self):
"""Create tabbed interface for different data sources."""
@@ -17,6 +19,7 @@ class DataSourceComponent:
[
dbc.Tab(label="File Upload", tab_id="file-tab"),
dbc.Tab(label="OpenSearch", tab_id="opensearch-tab"),
dbc.Tab(label="Text Input", tab_id="text-input-tab"),
],
id="data-source-tabs",
active_tab="file-tab",
@@ -208,6 +211,10 @@ class DataSourceComponent:
]
)
def create_text_input_tab(self):
    """Build the "Text Input" tab body.

    Returns:
        An ``html.Div`` whose only child is the interface produced by
        ``self.text_input_component.create_text_input_interface()``.
    """
    text_input_ui = self.text_input_component.create_text_input_interface()
    return html.Div([text_input_ui])
def _create_opensearch_section(self, section_type):
"""Create a complete OpenSearch section for either 'data' or 'prompts'."""
section_id = section_type # 'data' or 'prompts'

View File

@@ -0,0 +1,402 @@
"""Text input component for generating embeddings from user text."""
import dash_bootstrap_components as dbc
from dash import dcc, html
from embeddingbuddy.config.settings import AppSettings
class TextInputComponent:
    """Component for text input and embedding generation.

    Builds the layout only; it registers no callbacks. Component ids created
    here (e.g. ``text-input-area``, ``model-selection``,
    ``generate-embeddings-btn``) are the contract with whatever wires up the
    behaviour elsewhere.
    """

    def __init__(self):
        """Load the application settings used for defaults and limits."""
        self.settings = AppSettings()

    def create_text_input_interface(self):
        """Create the complete text input interface with model selection and processing options.

        Returns:
            An ``html.Div`` stacking every section of the tab, separated by
            horizontal rules, followed by the hidden data stores.
        """
        return html.Div(
            [
                # Model selection section
                self._create_model_selection(),
                html.Hr(),
                # Text input section
                self._create_text_input_area(),
                # Text action buttons
                self._create_text_action_buttons(),
                html.Hr(),
                # Processing options
                self._create_processing_options(),
                html.Hr(),
                # Generation controls
                self._create_generation_controls(),
                html.Hr(),
                # Progress indicators
                self._create_progress_indicators(),
                html.Hr(),
                # Status and results
                self._create_status_section(),
                # Hidden components for data flow
                self._create_hidden_components(),
            ],
            className="p-3",
        )

    def _create_model_selection(self):
        """Create model selection dropdown with descriptions.

        One option is built per entry of ``settings.AVAILABLE_MODELS``; the
        entry flagged ``default`` gets a " (Recommended)" suffix. The info
        panel is pre-filled for ``settings.DEFAULT_EMBEDDING_MODEL``.
        """
        model_options = []
        for model in self.settings.AVAILABLE_MODELS:
            label = f"{model['label']} - {model['size']}"
            if model.get("default", False):
                label += " (Recommended)"
            model_options.append({"label": label, "value": model["name"]})

        return html.Div(
            [
                html.H5("Embedding Model", className="mb-3"),
                html.Div(
                    [
                        dcc.Dropdown(
                            id="model-selection",
                            options=model_options,
                            value=self.settings.DEFAULT_EMBEDDING_MODEL,
                            placeholder="Select an embedding model...",
                            className="mb-2",
                        ),
                        dbc.Alert(
                            [
                                html.Div(
                                    id="model-info",
                                    children=self._get_model_description(
                                        self.settings.DEFAULT_EMBEDDING_MODEL
                                    ),
                                )
                            ],
                            color="info",
                            className="small",
                        ),
                    ]
                ),
            ]
        )

    def _create_text_input_area(self):
        """Create text input textarea with character limits.

        The textarea's ``maxLength`` and the helper text both come from
        ``settings.MAX_TEXT_LENGTH``. ``text-length-counter`` starts at "0".
        """
        return html.Div(
            [
                html.H5("Text Input", className="mb-3"),
                dcc.Textarea(
                    id="text-input-area",
                    placeholder="Paste your text here... Each sentence, paragraph, or line will become a separate data point depending on your tokenization method below.",
                    value="",
                    # Dash style dicts use camelCased keys (fontFamily, not
                    # font-family), per the Dash HTML components docs.
                    style={
                        "width": "100%",
                        "height": "300px",
                        "resize": "vertical",
                        "fontFamily": "monospace",
                        "fontSize": "14px",
                    },
                    maxLength=self.settings.MAX_TEXT_LENGTH,
                    className="form-control",
                ),
                html.Small(
                    f"Maximum {self.settings.MAX_TEXT_LENGTH:,} characters. Current: ",
                    className="text-muted",
                ),
                html.Small(
                    id="text-length-counter",
                    children="0",
                    className="text-muted fw-bold",
                ),
                html.Small(" characters", className="text-muted"),
            ]
        )

    def _create_text_action_buttons(self):
        """Create action buttons for text input (Load Sample, Clear)."""
        return html.Div(
            [
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-file-text me-2"),
                                        "Load Sample Text",
                                    ],
                                    id="load-sample-btn",
                                    color="info",
                                    size="sm",
                                    className="w-100",
                                )
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-trash me-2"),
                                        "Clear Text",
                                    ],
                                    id="clear-text-btn",
                                    color="outline-secondary",
                                    size="sm",
                                    className="w-100",
                                )
                            ],
                            md=6,
                        ),
                    ],
                    className="mt-2 mb-3",
                )
            ]
        )

    def _create_processing_options(self):
        """Create tokenization and metadata options.

        First row: text-splitting method and batch size dropdowns, defaulting
        to ``settings.DEFAULT_TOKENIZATION_METHOD`` and
        ``settings.MAX_BATCH_SIZE``. Second row: free-text category and
        subcategory inputs.
        """
        return html.Div(
            [
                html.H5("Processing Options", className="mb-3"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                html.Label(
                                    "Text Splitting Method:", className="form-label"
                                ),
                                dcc.Dropdown(
                                    id="tokenization-method",
                                    options=[
                                        {
                                            "label": "Sentences (split on . ! ?)",
                                            "value": "sentence",
                                        },
                                        {
                                            "label": "Paragraphs (split on double newline)",
                                            "value": "paragraph",
                                        },
                                        {
                                            "label": "Lines (split on single newline)",
                                            "value": "manual",
                                        },
                                        {
                                            "label": "Entire text as one document",
                                            "value": "whole",
                                        },
                                    ],
                                    value=self.settings.DEFAULT_TOKENIZATION_METHOD,
                                    className="mb-3",
                                ),
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                html.Label("Batch Size:", className="form-label"),
                                dcc.Dropdown(
                                    id="batch-size",
                                    options=[
                                        {
                                            "label": "Small batches (4) - Lower memory",
                                            "value": 4,
                                        },
                                        {
                                            "label": "Medium batches (8) - Balanced",
                                            "value": 8,
                                        },
                                        {
                                            "label": "Large batches (16) - Faster",
                                            "value": 16,
                                        },
                                    ],
                                    value=self.settings.MAX_BATCH_SIZE,
                                    className="mb-3",
                                ),
                            ],
                            md=6,
                        ),
                    ]
                ),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                html.Label(
                                    "Category (Optional):", className="form-label"
                                ),
                                dcc.Input(
                                    id="text-category",
                                    type="text",
                                    placeholder="e.g., Notes, Articles, Ideas...",
                                    value="Text Input",
                                    className="form-control mb-3",
                                ),
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                html.Label(
                                    "Subcategory (Optional):", className="form-label"
                                ),
                                dcc.Input(
                                    id="text-subcategory",
                                    type="text",
                                    placeholder="e.g., Meeting Notes, Research...",
                                    value="Generated",
                                    className="form-control mb-3",
                                ),
                            ],
                            md=6,
                        ),
                    ]
                ),
            ]
        )

    def _create_generation_controls(self):
        """Create embedding generation button and controls.

        The button is created disabled; the ``generation-help`` alert carries
        the initial hint shown beneath it.
        """
        return html.Div(
            [
                html.H5("Generate Embeddings", className="mb-3"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-magic me-2"),
                                        "Generate Embeddings",
                                    ],
                                    id="generate-embeddings-btn",
                                    color="primary",
                                    size="lg",
                                    disabled=True,
                                    className="w-100",
                                )
                            ],
                            md=12,
                        ),
                    ]
                ),
                html.Div(
                    [
                        dbc.Alert(
                            [
                                html.I(className="fas fa-info-circle me-2"),
                                "Enter some text above and select a model to enable embedding generation.",
                            ],
                            color="light",
                            className="mt-3",
                            id="generation-help",
                        )
                    ]
                ),
            ]
        )

    def _create_progress_indicators(self):
        """Create progress bars for model loading and embedding generation.

        Both sections are created with ``display: none``.
        """
        return html.Div(
            [
                # Model loading progress
                html.Div(
                    [
                        html.H6("Model Loading Progress", className="mb-2"),
                        dbc.Progress(
                            id="model-loading-progress",
                            value=0,
                            striped=True,
                            animated=True,
                            className="mb-2",
                        ),
                        html.Small(
                            id="model-loading-status",
                            children="No model loading in progress",
                            className="text-muted",
                        ),
                    ],
                    id="model-loading-section",
                    style={"display": "none"},
                ),
                html.Br(),
                # Embedding generation progress
                html.Div(
                    [
                        html.H6("Embedding Generation Progress", className="mb-2"),
                        dbc.Progress(
                            id="embedding-progress",
                            value=0,
                            striped=True,
                            animated=True,
                            className="mb-2",
                        ),
                        html.Small(
                            id="embedding-status",
                            children="No embedding generation in progress",
                            className="text-muted",
                        ),
                    ],
                    id="embedding-progress-section",
                    style={"display": "none"},
                ),
            ]
        )

    def _create_status_section(self):
        """Create status alerts and results preview.

        ``text-input-status-immediate`` is visible from the start;
        ``text-input-status`` is created hidden.
        """
        return html.Div(
            [
                # Immediate status (from client-side)
                dbc.Alert(
                    id="text-input-status-immediate",
                    children="Ready to generate embeddings",
                    color="light",
                    className="mb-3",
                ),
                # Server-side status
                dbc.Alert(
                    id="text-input-status",
                    children="",
                    color="light",
                    className="mb-3",
                    style={"display": "none"},
                ),
                # Results preview
                html.Div(id="embedding-results-preview"),
            ]
        )

    def _create_hidden_components(self):
        """Create hidden components for data flow."""
        return html.Div(
            [
                # Store for embeddings data from client-side
                dcc.Store(id="embeddings-generated-trigger"),
                # Store for tokenization preview
                dcc.Store(id="tokenization-preview-data"),
            ]
        )

    def _get_model_description(self, model_name):
        """Get description for a specific model.

        Args:
            model_name: Value to match against the ``name`` field of each
                entry in ``settings.AVAILABLE_MODELS``.

        Returns:
            An ``html.Div`` with the model's dimensions, context length,
            description, multilingual flag and size, or a muted
            "not available" span when no entry matches.
        """
        for model in self.settings.AVAILABLE_MODELS:
            if model["name"] == model_name:
                return html.Div(
                    [
                        html.Strong(
                            f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}"
                        ),
                        html.Br(),
                        html.Span(model["description"]),
                        html.Br(),
                        html.Small(
                            f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}",
                            className="text-muted",
                        ),
                    ]
                )

        return html.Span("Model information not available", className="text-muted")

View File

@@ -20,6 +20,15 @@ class AppLayout:
dbc.Col(
[
html.H1("EmbeddingBuddy", className="text-center mb-4"),
# Load Transformers.js from CDN
html.Script(
"""
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';
window.transformersPipeline = pipeline;
console.log('✅ Transformers.js pipeline loaded globally');
""",
type="module",
),
],
width=12,
)

View File

@@ -0,0 +1,158 @@
"""Tests for client-side embedding processing functionality."""
import numpy as np
from src.embeddingbuddy.data.processor import DataProcessor
from src.embeddingbuddy.models.schemas import ProcessedData
class TestClientEmbeddingsProcessing:
    """Test client-side embeddings processing functionality.

    Each test feeds a hand-built payload to
    ``DataProcessor.process_client_embeddings`` and inspects the returned
    ``ProcessedData``.
    """

    @staticmethod
    def _document(text, doc_id=None, category="Test", subcategory="Test"):
        """Build one document dict as it appears in the client payload.

        The ``id`` key is included only when ``doc_id`` is given.
        """
        fields = {
            "text": text,
            "category": category,
            "subcategory": subcategory,
            "tags": [],
        }
        if doc_id is None:
            return fields
        return {"id": doc_id, **fields}

    def setup_method(self):
        """Set up test instances."""
        self.processor = DataProcessor()

    def test_process_client_embeddings_success(self):
        """Test successful processing of client-side embeddings data."""
        texts = ["First test document", "Second test document"]
        vectors = [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]
        payload = {
            "documents": [
                self._document(
                    text,
                    doc_id=f"text_input_{index}",
                    category="Text Input",
                    subcategory="Generated",
                )
                for index, text in enumerate(texts)
            ],
            "embeddings": vectors,
        }

        outcome = self.processor.process_client_embeddings(payload)

        assert isinstance(outcome, ProcessedData)
        assert outcome.error is None
        assert len(outcome.documents) == 2
        assert outcome.embeddings.shape == (2, 4)

        # Check document content
        assert outcome.documents[0].text == "First test document"
        assert outcome.documents[1].text == "Second test document"

        # Check embeddings match
        np.testing.assert_array_equal(outcome.embeddings[0], [0.1, 0.2, 0.3, 0.4])
        np.testing.assert_array_equal(outcome.embeddings[1], [0.5, 0.6, 0.7, 0.8])

    def test_process_client_embeddings_with_error(self):
        """Test processing client data with error."""
        payload = {"error": "Transformers.js not loaded"}

        outcome = self.processor.process_client_embeddings(payload)

        assert isinstance(outcome, ProcessedData)
        assert outcome.error == "Transformers.js not loaded"
        assert len(outcome.documents) == 0
        assert outcome.embeddings.size == 0

    def test_process_client_embeddings_missing_data(self):
        """Test processing with missing documents or embeddings."""
        payload = {"documents": []}

        outcome = self.processor.process_client_embeddings(payload)

        assert isinstance(outcome, ProcessedData)
        assert "No documents or embeddings in client data" in outcome.error
        assert len(outcome.documents) == 0

    def test_process_client_embeddings_mismatch_count(self):
        """Test processing with mismatched document and embedding counts."""
        # One document, two embedding rows.
        payload = {
            "documents": [self._document("Test document", doc_id="test")],
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
        }

        outcome = self.processor.process_client_embeddings(payload)

        assert isinstance(outcome, ProcessedData)
        assert "Mismatch between number of documents and embeddings" in outcome.error
        assert len(outcome.documents) == 0

    def test_process_client_embeddings_invalid_document(self):
        """Test processing with invalid document data."""
        payload = {
            "documents": [
                {"text": ""},  # Empty text should be skipped
                self._document("Valid document", doc_id="test2"),
            ],
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
        }

        outcome = self.processor.process_client_embeddings(payload)

        assert isinstance(outcome, ProcessedData)
        assert outcome.error is None
        assert len(outcome.documents) == 1  # Only valid document should be processed
        assert outcome.documents[0].text == "Valid document"

    def test_process_client_embeddings_auto_id_generation(self):
        """Test automatic ID generation for documents without IDs."""
        payload = {
            "documents": [self._document("Document without ID")],
            "embeddings": [[0.1, 0.2, 0.3, 0.4]],
        }

        outcome = self.processor.process_client_embeddings(payload)

        assert isinstance(outcome, ProcessedData)
        assert outcome.error is None
        assert len(outcome.documents) == 1
        assert outcome.documents[0].id.startswith("text_input_")

    def test_process_client_embeddings_invalid_embedding_format(self):
        """Test processing with invalid embedding format."""
        payload = {
            "documents": [self._document("Test document", doc_id="test")],
            "embeddings": 0.5,  # Scalar instead of array
        }

        outcome = self.processor.process_client_embeddings(payload)

        assert isinstance(outcome, ProcessedData)
        assert outcome.error is not None  # Should have some error
        assert len(outcome.documents) == 0