Compare commits
10 Commits
add-os-loa
...
add-docker
Author | SHA1 | Date | |
---|---|---|---|
9c3ff6e799 | |||
781d055e60 | |||
0f5cea2850 | |||
1bd70705e7 | |||
6610b9c196 | |||
f4095cc0cb | |||
cb6ab1bfbe | |||
bced5e07ce | |||
cdaaffd735 | |||
14abc446b7 |
@@ -3,7 +3,8 @@
|
|||||||
"allow": [
|
"allow": [
|
||||||
"Bash(mkdir:*)",
|
"Bash(mkdir:*)",
|
||||||
"Bash(uv run:*)",
|
"Bash(uv run:*)",
|
||||||
"Bash(uv add:*)"
|
"Bash(uv add:*)",
|
||||||
|
"Bash(uv sync:*)"
|
||||||
],
|
],
|
||||||
"deny": [],
|
"deny": [],
|
||||||
"ask": [],
|
"ask": [],
|
||||||
|
76
.dockerignore
Normal file
76
.dockerignore
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
# Build-context exclusions for the EmbeddingBuddy image.
#
# Patterns are matched relative to the build-context root. A bare pattern such
# as `__pycache__/` or `*.pyc` only matches at the top level, so anything that
# must be excluded at any depth (for example under src/ or assets/) carries a
# leading `**/`.

# Python
**/__pycache__/
**/*.py[cod]
**/*$py.class
*.so
.Python
env/
venv/
ENV/
env.bak/
venv.bak/
.venv/

# Testing
**/.pytest_cache/
.coverage
htmlcov/
.tox/
coverage.xml
*.cover

# Development tools
**/.mypy_cache/
**/.ruff_cache/

# IDE
.vscode/
.idea/
**/*.swp
**/*.swo
**/*~

# OS
**/.DS_Store
**/.DS_Store?
**/._*
.Spotlight-V100
.Trashes
**/ehthumbs.db
**/Thumbs.db

# Git
.git/
.gitignore

# Documentation (root level only, so Markdown files under assets/ are kept)
*.md
!README.md

# Docker
Dockerfile*
docker-compose*.yml
.dockerignore

# Data files (may contain sensitive information).
# Root level only on purpose: JSON files in subdirectories such as assets/
# are not matched by these patterns.
*.ndjson
*.ldjson
*.json

# Reports (already covered by *.json above; listed for clarity)
*-report.json
bandit-report.json
safety-report.json

# Screenshots (root level only, so images under assets/ are kept)
*.png
*.jpg
*.jpeg
*.gif

# Logs
*.log

# Temporary files
*.tmp
*.temp
|
5
.gitignore
vendored
5
.gitignore
vendored
@@ -81,4 +81,7 @@ safety-report.json
|
|||||||
pip-audit-report.json
|
pip-audit-report.json
|
||||||
|
|
||||||
# Temporary files
|
# Temporary files
|
||||||
*.tmp
|
*.tmp
|
||||||
|
|
||||||
|
|
||||||
|
examples/extra
|
17
CLAUDE.md
17
CLAUDE.md
@@ -21,8 +21,23 @@ uv sync
|
|||||||
|
|
||||||
**Run the application:**
|
**Run the application:**
|
||||||
|
|
||||||
|
Development mode (with auto-reload):
|
||||||
```bash
|
```bash
|
||||||
uv run python main.py
|
uv run run_dev.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Production mode (with Gunicorn WSGI server):
|
||||||
|
```bash
|
||||||
|
# First install production dependencies
|
||||||
|
uv sync --extra prod
|
||||||
|
|
||||||
|
# Then run in production mode
|
||||||
|
uv run run_prod.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Legacy mode (basic Dash server):
|
||||||
|
```bash
|
||||||
|
uv run main.py
|
||||||
```
|
```
|
||||||
|
|
||||||
The app will be available at http://127.0.0.1:8050
|
The app will be available at http://127.0.0.1:8050
|
||||||
|
73
Dockerfile
Normal file
73
Dockerfile
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# Two-stage Dockerfile for EmbeddingBuddy
#
# Stage 1 (builder) installs the build toolchain and resolves the locked
# dependencies into /app/.venv. Stage 2 (runtime) copies only the virtual
# environment and the application files into a slim image.

# Stage 1: Builder
FROM python:3.11-slim AS builder

# Install system dependencies for building Python packages.
# build-essential already depends on gcc and g++, so they are not listed again.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install uv for dependency management (no pip cache left in the layer)
RUN pip install --no-cache-dir uv

# Set working directory
WORKDIR /app

# Copy dependency files first so the dependency layer below is reused until
# pyproject.toml or uv.lock changes.
COPY pyproject.toml uv.lock ./

# Create the virtual environment and install third-party dependencies
# (including production extras) without the project itself.
RUN uv venv .venv \
    && uv sync --frozen --extra prod --no-install-project

# Copy source code (needed for editable install)
COPY src/ src/
COPY main.py wsgi.py run_prod.py ./
COPY assets/ assets/

# Install the project itself into the existing environment
RUN uv sync --frozen --extra prod

# Stage 2: Runtime
FROM python:3.11-slim AS runtime

# Install runtime dependencies for compiled packages
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Create an unprivileged user with a fixed UID/GID and an application
# directory it owns, so the service does not run as root.
RUN groupadd --system --gid 10001 app \
    && useradd --system --uid 10001 --gid app --home-dir /app --no-create-home app \
    && mkdir -p /app \
    && chown app:app /app

# Set working directory
WORKDIR /app

# Copy virtual environment from builder stage. The path must stay /app/.venv
# because the environment was created at that location in the builder.
COPY --from=builder --chown=app:app /app/.venv /app/.venv

# Copy application files from builder stage
COPY --from=builder --chown=app:app /app/src /app/src
COPY --from=builder --chown=app:app /app/main.py /app/main.py
COPY --from=builder --chown=app:app /app/assets /app/assets
COPY --from=builder --chown=app:app /app/wsgi.py /app/wsgi.py
COPY --from=builder --chown=app:app /app/run_prod.py /app/run_prod.py

# Runtime environment:
# - put the virtual environment first in PATH
# - PYTHONPATH is set outright; the base image does not define it, so
#   appending "$PYTHONPATH" would only add an empty path entry
# - production defaults for the application
ENV PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH=/app/src \
    EMBEDDINGBUDDY_HOST=0.0.0.0 \
    EMBEDDINGBUDDY_PORT=8050 \
    EMBEDDINGBUDDY_DEBUG=false \
    EMBEDDINGBUDDY_ENV=production

# Drop privileges for everything that runs in the container
USER app

# Expose port
EXPOSE 8050

# Health check. urllib raises on connection errors and on HTTP 4xx/5xx
# responses, so an erroring application is reported unhealthy.
# NOTE(review): the probe follows EMBEDDINGBUDDY_PORT on the assumption that
# run_prod.py binds to it - confirm against run_prod.py.
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD python -c "import os, urllib.request; urllib.request.urlopen('http://localhost:' + os.environ.get('EMBEDDINGBUDDY_PORT', '8050') + '/', timeout=5)" || exit 1

# Run application with Gunicorn in production
CMD ["python", "run_prod.py"]
|
66
README.md
66
README.md
@@ -12,7 +12,7 @@ EmbeddingBuddy provides an intuitive web interface for analyzing high-dimensiona
|
|||||||
embedding vectors by applying various dimensionality reduction algorithms and
|
embedding vectors by applying various dimensionality reduction algorithms and
|
||||||
visualizing the results in interactive 2D and 3D plots. The application features
|
visualizing the results in interactive 2D and 3D plots. The application features
|
||||||
a clean, modular architecture that makes it easy to test, maintain, and extend
|
a clean, modular architecture that makes it easy to test, maintain, and extend
|
||||||
with new features. It supports dual dataset visualization, allowing you to compare
|
with new features. It supports dual dataset visualization, allowing you to compare
|
||||||
documents and prompts to understand how queries relate to your content.
|
documents and prompts to understand how queries relate to your content.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
@@ -73,17 +73,77 @@ uv sync
|
|||||||
|
|
||||||
2. **Run the application:**
|
2. **Run the application:**
|
||||||
|
|
||||||
|
**Development mode** (with auto-reload):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv run python main.py
|
uv run run_dev.py
|
||||||
```
|
```
|
||||||
|
|
||||||
3. **Open your browser** to http://127.0.0.1:8050
|
**Production mode** (with Gunicorn WSGI server):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install production dependencies
|
||||||
|
uv sync --extra prod
|
||||||
|
|
||||||
|
# Run in production mode
|
||||||
|
uv run run_prod.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Legacy mode** (basic Dash server):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv run main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Open your browser** to <http://127.0.0.1:8050>
|
||||||
|
|
||||||
4. **Test with sample data**:
|
4. **Test with sample data**:
|
||||||
- Upload `sample_data.ndjson` (documents)
|
- Upload `sample_data.ndjson` (documents)
|
||||||
- Upload `sample_prompts.ndjson` (prompts) to see dual visualization
|
- Upload `sample_prompts.ndjson` (prompts) to see dual visualization
|
||||||
- Use the "Show prompts" toggle to compare how prompts relate to documents
|
- Use the "Show prompts" toggle to compare how prompts relate to documents
|
||||||
|
|
||||||
|
## Docker
|
||||||
|
|
||||||
|
You can also run EmbeddingBuddy using Docker:
|
||||||
|
|
||||||
|
### Basic Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run in the background
|
||||||
|
docker compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
The application will be available at <http://127.0.0.1:8050>
|
||||||
|
|
||||||
|
### With OpenSearch
|
||||||
|
|
||||||
|
To run with OpenSearch for enhanced search capabilities:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run in the background with OpenSearch
|
||||||
|
docker compose --profile opensearch up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
This will start both the EmbeddingBuddy application and an OpenSearch instance.
|
||||||
|
OpenSearch will be available at <http://127.0.0.1:9200>
|
||||||
|
|
||||||
|
### Docker Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop all services
|
||||||
|
docker compose down
|
||||||
|
|
||||||
|
# Stop and remove volumes
|
||||||
|
docker compose down -v
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
docker compose logs embeddingbuddy
|
||||||
|
docker compose logs opensearch
|
||||||
|
|
||||||
|
# Rebuild containers
|
||||||
|
docker compose build
|
||||||
|
```
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
### Project Structure
|
### Project Structure
|
||||||
|
278
assets/embeddings.js
Normal file
278
assets/embeddings.js
Normal file
@@ -0,0 +1,278 @@
|
|||||||
|
// Text input embedding generation using Transformers.js
|
||||||
|
// This module runs entirely in the browser for privacy and performance
|
||||||
|
|
||||||
|
// Page-wide flags describing the Transformers.js load state; both start false.
Object.assign(window, {
    transformersLoading: false,
    transformersLoaded: false,
});
|
||||||
|
|
||||||
|
/**
 * Wraps a Transformers.js feature-extraction pipeline: loads models on demand,
 * caches one extractor per model name, and turns texts into embedding arrays.
 */
class TransformersEmbedder {
    constructor() {
        this.extractor = null;        // extractor for the currently selected model
        this.currentModel = null;     // name of the currently selected model
        this.modelCache = new Map();  // model name -> extractor
        this.isLoading = false;       // true while a model load is in flight
    }

    /**
     * Select `modelName`, loading it through `window.transformers.pipeline`
     * when it is not cached yet.
     *
     * @param {string} [modelName='Xenova/all-MiniLM-L6-v2'] Model identifier.
     * @returns {Promise<{success: boolean, model?: string, error?: string}>}
     *   Never rejects; failures are reported through `success: false`.
     */
    async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') {
        // Cached model: switch to it without loading anything.
        if (this.modelCache.has(modelName)) {
            this.extractor = this.modelCache.get(modelName);
            this.currentModel = modelName;
            return { success: true, model: modelName };
        }

        // Only one load at a time; a second caller is told to back off.
        if (this.isLoading) {
            return { success: false, error: 'Model loading already in progress' };
        }

        this.isLoading = true;

        try {
            // Use globally loaded Transformers.js pipeline
            if (!window.transformers) {
                // Poll every 100 ms, at most 50 times (about 5 seconds), for
                // the pipeline to appear on window.
                let attempts = 0;
                while (!window.transformersPipeline && attempts < 50) {
                    await new Promise(resolve => setTimeout(resolve, 100));
                    attempts++;
                }
                if (!window.transformersPipeline) {
                    throw new Error('Transformers.js pipeline not available. Please refresh the page.');
                }
                window.transformers = { pipeline: window.transformersPipeline };
                window.transformersLoaded = true;
                console.log('✅ Using globally loaded Transformers.js pipeline');
            }

            // Show loading progress to user
            if (window.updateModelLoadingProgress) {
                window.updateModelLoadingProgress(0, `Loading ${modelName}...`);
            }

            this.extractor = await window.transformers.pipeline('feature-extraction', modelName, {
                progress_callback: (data) => {
                    // Only events that carry a numeric progress are forwarded.
                    if (window.updateModelLoadingProgress && data.progress !== undefined) {
                        const progress = Math.round(data.progress);
                        window.updateModelLoadingProgress(progress, data.status || 'Loading...');
                    }
                }
            });

            this.modelCache.set(modelName, this.extractor);
            this.currentModel = modelName;

            if (window.updateModelLoadingProgress) {
                window.updateModelLoadingProgress(100, 'Model loaded successfully');
            }

            return { success: true, model: modelName };
        } catch (error) {
            console.error('Model initialization error:', error);
            return { success: false, error: error.message };
        } finally {
            // Always release the loading flag, whether the load succeeded or failed.
            this.isLoading = false;
        }
    }

    /**
     * Embed every text with the current extractor, processing in batches.
     *
     * @param {string[]} texts Non-empty list of non-blank strings.
     * @param {Object} [options] Extractor options (defaults: `pooling: 'mean'`,
     *   `normalize: true`) plus `batchSize`, a positive integer (default 8).
     *   `batchSize` only controls batching and is not passed to the extractor;
     *   any value that is not a positive integer falls back to 8.
     * @returns {Promise<number[][]>} One embedding array per input text, in order.
     * @throws {Error} If no model is initialized, `texts` is empty, a text is
     *   blank, or the extractor returns a result without `data`.
     */
    async generateEmbeddings(texts, options = {}) {
        if (!this.extractor) {
            throw new Error('Model not initialized. Call initializeModel() first.');
        }

        if (!texts || texts.length === 0) {
            throw new Error('No texts provided for embedding generation.');
        }

        // Separate the batching control from the options meant for the extractor.
        const { batchSize: requestedBatchSize, ...extractorOverrides } = options;
        const extractorOptions = {
            pooling: 'mean',
            normalize: true,
            ...extractorOverrides
        };

        // A zero, negative or fractional step would make the batch loop below
        // never advance correctly, so anything but a positive integer uses 8.
        const batchSize = Number.isInteger(requestedBatchSize) && requestedBatchSize > 0
            ? requestedBatchSize
            : 8;

        const embeddings = [];

        try {
            // Process in batches to avoid memory issues
            for (let i = 0; i < texts.length; i += batchSize) {
                const batch = texts.slice(i, i + batchSize);

                const batchResults = await Promise.all(
                    batch.map(text => {
                        if (!text || text.trim().length === 0) {
                            throw new Error('Empty text found in batch');
                        }
                        return this.extractor(text.trim(), extractorOptions);
                    })
                );

                // Convert tensor output to arrays
                batchResults.forEach((result, idx) => {
                    if (result && result.data) {
                        embeddings.push(Array.from(result.data));
                    } else {
                        throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
                    }
                });

                // Update progress
                const processed = i + batch.length;
                const progress = Math.min(100, (processed / texts.length) * 100);
                if (window.updateEmbeddingProgress) {
                    window.updateEmbeddingProgress(progress, `Processing ${processed}/${texts.length} texts`);
                }
            }

            if (window.updateEmbeddingProgress) {
                window.updateEmbeddingProgress(100, `Generated ${embeddings.length} embeddings successfully`);
            }

            return embeddings;
        } catch (error) {
            // Log for diagnosis, then let the caller decide how to report it.
            console.error('Embedding generation error:', error);
            throw error;
        }
    }
}
|
||||||
|
|
||||||
|
// Publish one shared embedder on window and log that it was created.
window.transformersEmbedder = new TransformersEmbedder();
console.log('📦 TransformersEmbedder instance created');
|
||||||
|
|
||||||
|
// Global progress update functions

/**
 * Reflect model-loading progress in the page.
 * Sets the width and aria-valuenow of #model-loading-progress and the text of
 * #model-loading-status; an element that is not in the DOM is skipped.
 */
window.updateModelLoadingProgress = function(progress, status) {
    const bar = document.getElementById('model-loading-progress');
    const label = document.getElementById('model-loading-status');

    if (bar) {
        bar.style.width = progress + '%';
        bar.setAttribute('aria-valuenow', progress);
    }

    if (label) {
        label.textContent = status;
    }
};
|
||||||
|
|
||||||
|
/**
 * Reflect embedding-generation progress in the page.
 * Sets the width and aria-valuenow of #embedding-progress and the text of
 * #embedding-status; an element that is not in the DOM is skipped.
 */
window.updateEmbeddingProgress = function(progress, status) {
    const bar = document.getElementById('embedding-progress');
    const label = document.getElementById('embedding-status');

    if (bar) {
        bar.style.width = progress + '%';
        bar.setAttribute('aria-valuenow', progress);
    }

    if (label) {
        label.textContent = status;
    }
};
|
||||||
|
|
||||||
|
// Dash clientside callback functions
window.dash_clientside = window.dash_clientside || {};
console.log('🔧 Setting up window.dash_clientside.transformers');
window.dash_clientside.transformers = {
    /**
     * Split the submitted text into chunks, embed each chunk with
     * window.transformersEmbedder, and return a four-element array:
     * [payload, status message, alert colour, false].
     *
     * The payload is `{ documents, embeddings }` on success and `{ error }` on
     * failure. Returns `window.dash_clientside.no_update` when there was no
     * click or the text is blank.
     */
    async generateEmbeddings(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
        console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });

        const hasInput = nClicks && textContent && textContent.trim().length !== 0;
        if (!hasInput) {
            console.log('⚠️ Early return - missing required parameters');
            return window.dash_clientside.no_update;
        }

        // Every failure path returns the same four-slot shape.
        const failure = (detail, banner) => [{ error: detail }, banner, "danger", false];

        try {
            // Initialize model if needed
            const initResult = await window.transformersEmbedder.initializeModel(modelName);
            if (!initResult.success) {
                return failure(initResult.error, `❌ Model loading error: ${initResult.error}`);
            }

            // Tokenize text based on method: each known method maps to the
            // delimiter it splits on; any other method keeps the text whole.
            // Simple sentence splitting - can be enhanced with proper NLP
            const delimiters = new Map([
                ['sentence', /[.!?]+/],
                ['paragraph', /\n\s*\n/],
                ['manual', '\n'],
            ]);
            const trimmedText = textContent.trim();

            let textChunks;
            if (delimiters.has(tokenizationMethod)) {
                const pieces = trimmedText.split(delimiters.get(tokenizationMethod));
                textChunks = pieces
                    .map(piece => piece.trim())
                    .filter(piece => piece.length > 0);
            } else {
                textChunks = [trimmedText];
            }

            if (textChunks.length === 0) {
                return failure(
                    'No valid text chunks found after tokenization',
                    '❌ Error: No valid text chunks found after tokenization'
                );
            }

            // Generate embeddings
            const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);

            if (!embeddings || embeddings.length !== textChunks.length) {
                return failure(
                    'Embedding generation failed - mismatch in text chunks and embeddings',
                    '❌ Error: Embedding generation failed'
                );
            }

            // Create documents structure: one record per chunk, paired with
            // the embedding at the same index.
            const documents = textChunks.map((text, i) => {
                return {
                    id: `text_input_${Date.now()}_${i}`,
                    text: text,
                    embedding: embeddings[i],
                    category: category || "Text Input",
                    subcategory: subcategory || "Generated",
                    tags: []
                };
            });

            const payload = { documents: documents, embeddings: embeddings };
            return [
                payload,
                `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`,
                "success",
                false
            ];
        } catch (error) {
            console.error('Client-side embedding error:', error);
            return failure(error.message, `❌ Error: ${error.message}`);
        }
    }
};
|
||||||
|
|
||||||
|
// Final diagnostics: report which of the globals set up by this script exist.
console.log('✅ Transformers.js client-side setup complete');
console.log('Available:', {
    transformersEmbedder: Boolean(window.transformersEmbedder),
    dashClientside: Boolean(window.dash_clientside),
    transformersModule: Boolean(window.dash_clientside?.transformers),
    generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings
});
|
9
assets/package.json
Normal file
9
assets/package.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"name": "embeddingbuddy-assets",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "JavaScript dependencies for EmbeddingBuddy text input functionality",
|
||||||
|
"dependencies": {
|
||||||
|
"@huggingface/transformers": "^3.0.0"
|
||||||
|
},
|
||||||
|
"type": "module"
|
||||||
|
}
|
106
assets/sample-txt.md
Normal file
106
assets/sample-txt.md
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
The sun peeked through the clouds after a drizzly morning.
|
||||||
|
A gentle breeze rustled the leaves as we walked along the shoreline.
|
||||||
|
Heavy rains caused flooding in several low-lying neighborhoods.
|
||||||
|
It was so hot that even the birds sought shade under the palm trees.
|
||||||
|
By midnight, the temperature had dropped below freezing.
|
||||||
|
Thunderstorms lit up the sky with flashes of lightning.
|
||||||
|
A thick fog settled over the city streets at dawn.
|
||||||
|
The air smelled of ozone after the sudden hailstorm.
|
||||||
|
I watched the snowflakes drift silently onto the ground.
|
||||||
|
A double rainbow appeared after the rain shower.
|
||||||
|
The humidity soared to uncomfortable levels by midday.
|
||||||
|
Dust devils formed in the dry desert plains.
|
||||||
|
The barometer readings indicated an approaching front.
|
||||||
|
A sudden gust of wind knocked over the garden chairs.
|
||||||
|
Light drizzle turned into a torrential downpour within minutes.
|
||||||
|
The new smartphone features a foldable display and 5G connectivity.
|
||||||
|
In the world of AI, transformers have revolutionized natural language processing.
|
||||||
|
Quantum computing promises to solve problems beyond classical computers' reach.
|
||||||
|
Blockchain technology is being explored for secure voting systems.
|
||||||
|
Virtual reality headsets are becoming more affordable and accessible.
|
||||||
|
The rise of electric vehicles is reshaping the automotive industry.
|
||||||
|
Cloud computing allows businesses to scale resources dynamically.
|
||||||
|
Machine learning algorithms can now predict stock market trends with surprising accuracy.
|
||||||
|
Augmented reality applications are transforming retail experiences.
|
||||||
|
The Internet of Things connects everyday devices to the web for smarter living.
|
||||||
|
Cybersecurity threats are evolving, requiring constant vigilance.
|
||||||
|
3D printing is enabling rapid prototyping and custom manufacturing.
|
||||||
|
Edge computing reduces latency by processing data closer to the source.
|
||||||
|
Biometric authentication methods are enhancing security in devices.
|
||||||
|
Wearable technology is tracking health metrics in real-time.
|
||||||
|
Artificial intelligence is being used to create realistic deepfakes.
|
||||||
|
Preheat the oven to 375°F before you start mixing the batter.
|
||||||
|
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
|
||||||
|
A pinch of saffron adds a beautiful color and aroma to traditional paella.
|
||||||
|
If the soup is too salty, add a peeled potato to absorb excess sodium.
|
||||||
|
Let the bread dough rise for at least an hour in a warm, draft-free spot.
|
||||||
|
Marinate the chicken overnight in a blend of citrus and spices.
|
||||||
|
Use a cast-iron skillet to sear the steak on high heat.
|
||||||
|
Whisk the egg whites until they form stiff peaks.
|
||||||
|
Fold in the chocolate chips gently to keep the batter airy.
|
||||||
|
Brush the pastry with an egg wash for a golden finish.
|
||||||
|
Slow-roast the pork shoulder until it falls off the bone.
|
||||||
|
Garnish the salad with toasted nuts and fresh herbs.
|
||||||
|
Deglaze the pan with white wine for a rich sauce.
|
||||||
|
Simmer the curry paste until the aroma intensifies.
|
||||||
|
Let the risotto rest before serving to thicken slightly.
|
||||||
|
He dribbled past two defenders and sank a three-pointer at the buzzer.
|
||||||
|
The marathon runner kept a steady pace despite the sweltering heat.
|
||||||
|
Their home team clinched the championship with a last-minute goal.
|
||||||
|
NASCAR fans cheered as the cars roared around the oval track.
|
||||||
|
She landed a perfect triple axel at the figure skating championship.
|
||||||
|
The cyclist pedaled up the steep hill in record time.
|
||||||
|
He pitched a no-hitter during the high school baseball game.
|
||||||
|
The quarterback threw a touchdown pass under heavy pressure.
|
||||||
|
They scored a hat-trick in the hockey final.
|
||||||
|
The boxer delivered a swift uppercut in the final round.
|
||||||
|
Surfers caught massive waves at dawn on the Pacific coast.
|
||||||
|
Fans erupted when the underdog scored the winning goal.
|
||||||
|
The swimmer broke the national record in the 200m freestyle.
|
||||||
|
The gymnast executed a flawless routine on the balance beam.
|
||||||
|
The rugby team celebrated their victory with a traditional haka.
|
||||||
|
The stock market rallied after positive earnings reports.
|
||||||
|
Investors are closely watching interest rate changes by the Federal Reserve.
|
||||||
|
Cryptocurrency prices have been extremely volatile this year.
|
||||||
|
Diversification is key to managing investment risk effectively.
|
||||||
|
Inflation rates have reached a 40-year high, impacting consumer spending.
|
||||||
|
Many companies are adopting ESG criteria to attract socially conscious investors.
|
||||||
|
The bond market is reacting to geopolitical tensions and supply chain disruptions.
|
||||||
|
Venture capital funding for startups has surged in the tech sector.
|
||||||
|
Exchange-traded funds (ETFs) offer a way to invest in diversified portfolios.
|
||||||
|
The global economy is recovering from the pandemic, but challenges remain.
|
||||||
|
Central banks are exploring digital currencies to modernize payment systems.
|
||||||
|
Retail investors are increasingly participating in the stock market through apps.
|
||||||
|
Hedge funds are using complex algorithms to gain an edge in trading.
|
||||||
|
Real estate prices have skyrocketed in urban areas due to low inventory.
|
||||||
|
The startup raised $10 million in its Series A funding round.
|
||||||
|
The symphony orchestra played a hauntingly beautiful melody.
|
||||||
|
She strummed her guitar softly, filling the room with a warm sound.
|
||||||
|
The DJ mixed tracks seamlessly, keeping the crowd dancing all night.
|
||||||
|
His voice soared during the high notes of the ballad.
|
||||||
|
The band played an acoustic set in the intimate coffee shop.
|
||||||
|
Jazz musicians often improvise solos based on the chord changes.
|
||||||
|
The opera singer hit the high C with perfect pitch.
|
||||||
|
The choir harmonized beautifully, filling the church with sound.
|
||||||
|
He composed a symphony that was performed at the concert hall.
|
||||||
|
The singer-songwriter wrote heartfelt lyrics about love and loss.
|
||||||
|
The rock band headlined the festival, drawing a massive crowd.
|
||||||
|
Hip-hop artists use rhythm and rhyme to tell powerful stories.
|
||||||
|
The violinist played a virtuosic solo that left the audience in awe.
|
||||||
|
Folk music often reflects the culture and traditions of a community.
|
||||||
|
The gospel choir lifted spirits with their uplifting performance.
|
||||||
|
The fall of the Berlin Wall in 1989 marked the end of the Cold War.
|
||||||
|
Ancient Egypt's pyramids are a testament to their architectural prowess.
|
||||||
|
Europe's Renaissance period sparked a revival in art and science.
|
||||||
|
The signing of the Declaration of Independence in 1776 established the United States.
|
||||||
|
The Industrial Revolution transformed economies and societies worldwide.
|
||||||
|
Rome was the center of a vast empire that influenced law and governance.
|
||||||
|
The discovery of the New World by Christopher Columbus in 1492 changed global trade.
|
||||||
|
The French Revolution in 1789 led to significant political and social change.
|
||||||
|
World War II was a global conflict that reshaped international relations.
|
||||||
|
The fall of the Roman Empire in 476 AD marked the beginning of the Middle Ages.
|
||||||
|
The invention of the printing press revolutionized the spread of knowledge.
|
||||||
|
The Cold War was characterized by political tension between the U.S. and the Soviet Union.
|
||||||
|
The ancient Silk Road connected East and West through trade routes.
|
||||||
|
The signing of the Magna Carta in 1215 established principles of due process.
|
||||||
|
Exploration during the Age of Discovery expanded European empires across the globe.
|
172
assets/transformers-loader.js
Normal file
172
assets/transformers-loader.js
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
// Simple script to load Transformers.js from CDN and initialize embedding functionality
|
||||||
|
// This approach uses traditional script loading instead of ES6 modules
|
||||||
|
|
||||||
|
console.log('🔧 Transformers.js loader starting...');

// Global state: whether the library has been loaded, and whether a load is
// currently in flight. Both start false.
Object.assign(window, {
    transformersLibraryLoaded: false,
    transformersLibraryLoading: false,
});
|
||||||
|
|
||||||
|
/**
 * Append a `<script type="module">` for `src` to the document head.
 *
 * @param {string} src URL of the script to load.
 * @returns {Promise<void>} Resolves when the script's load handler fires;
 *   rejects with an Error naming `src` when its error handler fires.
 */
function loadScript(src) {
    return new Promise((resolve, reject) => {
        const element = document.createElement('script');
        Object.assign(element, { src, type: 'module' });

        element.onload = () => resolve();
        element.onerror = () => reject(new Error(`Failed to load script: ${src}`));

        document.head.appendChild(element);
    });
}
|
||||||
|
|
||||||
|
/**
 * Load Transformers.js from the CDN once and store the module on
 * `window.transformersLibrary`.
 *
 * If a load is already in flight, waits for it (polling every 100 ms) and
 * reports its outcome instead of starting a second one.
 *
 * @returns {Promise<boolean>} true when the library is available, false when
 *   loading failed.
 */
async function initializeTransformers() {
    if (window.transformersLibraryLoaded) {
        console.log('✅ Transformers.js already loaded');
        return true;
    }

    if (window.transformersLibraryLoading) {
        console.log('⏳ Transformers.js already loading, waiting...');
        // Wait for loading to complete
        const pause = () => new Promise(resolve => setTimeout(resolve, 100));
        do {
            await pause();
        } while (window.transformersLibraryLoading);
        return window.transformersLibraryLoaded;
    }

    window.transformersLibraryLoading = true;

    let succeeded = false;
    try {
        console.log('📦 Loading Transformers.js from CDN...');

        // Use dynamic import since this is more reliable with ES modules
        window.transformersLibrary = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0');
        window.transformersLibraryLoaded = true;

        console.log('✅ Transformers.js loaded successfully');
        succeeded = true;
    } catch (error) {
        console.error('❌ Failed to load Transformers.js:', error);
    } finally {
        // Release the in-flight flag on success and on failure alike.
        window.transformersLibraryLoading = false;
    }
    return succeeded;
}
|
||||||
|
|
||||||
|
/**
 * Simple embeddings class: creates one feature-extraction pipeline per model
 * name (cached) and embeds texts one at a time.
 */
class SimpleEmbedder {
    constructor() {
        this.pipeline = null;         // most recently selected pipeline
        this.modelCache = new Map();  // model name -> pipeline
    }

    /**
     * Embed each text with the pipeline for `modelName`.
     *
     * @param {string[]} texts Texts to embed.
     * @param {string} [modelName='Xenova/all-MiniLM-L6-v2'] Model identifier.
     * @returns {Promise<number[][]>} One embedding array per text, in order.
     * @throws {Error} If Transformers.js cannot be loaded.
     */
    async generateEmbeddings(texts, modelName = 'Xenova/all-MiniLM-L6-v2') {
        console.log('🔄 Generating embeddings for', texts.length, 'texts with model', modelName);

        // Ensure Transformers.js is loaded
        if (!window.transformersLibraryLoaded) {
            const loaded = await initializeTransformers();
            if (!loaded) {
                throw new Error('Failed to load Transformers.js');
            }
        }

        // Resolve the pipeline into a local variable. Overlapping calls for
        // different models each reassign this.pipeline, so reading it back
        // inside the loop below could embed with the wrong model.
        let activePipeline;
        if (!this.modelCache.has(modelName)) {
            console.log('🏗️ Creating pipeline for', modelName);
            const { pipeline } = window.transformersLibrary;
            activePipeline = await pipeline('feature-extraction', modelName);
            this.modelCache.set(modelName, activePipeline);
        } else {
            activePipeline = this.modelCache.get(modelName);
        }
        // Still exposed on the instance for any code that reads it.
        this.pipeline = activePipeline;

        // Generate embeddings
        const embeddings = [];
        for (let i = 0; i < texts.length; i++) {
            console.log(`Processing text ${i + 1}/${texts.length}...`);
            const result = await activePipeline(texts[i], { pooling: 'mean', normalize: true });
            embeddings.push(Array.from(result.data));
        }

        console.log('✅ Generated', embeddings.length, 'embeddings');
        return embeddings;
    }
}
|
||||||
|
|
||||||
|
// Create global instance
|
||||||
|
window.simpleEmbedder = new SimpleEmbedder();
|
||||||
|
|
||||||
|
// Set up Dash clientside callbacks
|
||||||
|
window.dash_clientside = window.dash_clientside || {};
|
||||||
|
window.dash_clientside.transformers = {
    /**
     * Clientside callback: split the input text into chunks, embed every
     * chunk through window.simpleEmbedder and wrap the result as documents.
     *
     * Returns `no_update` when there is no click or no text. Otherwise it
     * resolves to a four-element array: [payload, status message, alert
     * colour, false]. On failure the payload is `{ error: message }`.
     */
    generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
        console.log('🚀 Client-side generateEmbeddings called');

        const inputsPresent = nClicks && textContent && textContent.trim().length > 0;
        if (!inputsPresent) {
            console.log('⚠️ Missing required parameters');
            return window.dash_clientside.no_update;
        }

        try {
            const cleaned = textContent.trim();

            // Split on a separator, trim every piece and drop the empty ones.
            const nonEmptyPieces = (separator) =>
                cleaned
                    .split(separator)
                    .map(piece => piece.trim())
                    .filter(piece => piece.length > 0);

            // Tokenize text
            let textChunks;
            if (tokenizationMethod === 'sentence') {
                textChunks = nonEmptyPieces(/[.!?]+/);
            } else if (tokenizationMethod === 'paragraph') {
                textChunks = nonEmptyPieces(/\n\s*\n/);
            } else if (tokenizationMethod === 'manual') {
                textChunks = nonEmptyPieces('\n');
            } else {
                // Any other method: the whole text is a single chunk.
                textChunks = [cleaned];
            }

            if (textChunks.length === 0) {
                throw new Error('No valid text chunks after tokenization');
            }

            // Generate embeddings
            const embeddings = await window.simpleEmbedder.generateEmbeddings(textChunks, modelName);

            // Create documents
            const documents = textChunks.map((chunk, index) => {
                return {
                    id: `text_input_${Date.now()}_${index}`,
                    text: chunk,
                    embedding: embeddings[index],
                    category: category || "Text Input",
                    subcategory: subcategory || "Generated",
                    tags: []
                };
            });

            const payload = { documents: documents, embeddings: embeddings };
            const statusMessage = `✅ Generated embeddings for ${documents.length} text chunks using ${modelName}`;
            return [payload, statusMessage, "success", false];

        } catch (error) {
            console.error('❌ Error generating embeddings:', error);
            const failure = { error: error.message };
            return [failure, `❌ Error: ${error.message}`, "danger", false];
        }
    }
};
|
||||||
|
|
||||||
|
console.log('✅ Simple Transformers.js setup complete');
|
||||||
|
console.log('Available functions:', Object.keys(window.dash_clientside.transformers));
|
133
bump_version.py
Executable file
133
bump_version.py
Executable file
@@ -0,0 +1,133 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Version bump script for EmbeddingBuddy.
|
||||||
|
Automatically updates version in pyproject.toml following semantic versioning.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_current_version(pyproject_path: Path) -> str:
    """Extract the current version from pyproject.toml.

    Args:
        pyproject_path: Path to the pyproject.toml file to read.

    Returns:
        The quoted value of the first ``version = "..."`` assignment.

    Raises:
        ValueError: If the file contains no such assignment.
    """
    # TOML files are UTF-8 by specification; do not depend on the locale.
    content = pyproject_path.read_text(encoding="utf-8")
    # Anchor the key to the start of a line so that keys which merely end in
    # "version" (e.g. `target-version`, `python_version`) are not mistaken
    # for the project version.
    match = re.search(r'^[ \t]*version\s*=\s*"([^"]+)"', content, flags=re.MULTILINE)
    if not match:
        raise ValueError("Could not find version in pyproject.toml")
    return match.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_version(version_str: str) -> tuple[int, int, int]:
    """Parse a semantic version string into a (major, minor, patch) tuple.

    Args:
        version_str: Version in ``MAJOR.MINOR.PATCH`` form, e.g. ``"1.2.3"``.

    Returns:
        The three components as integers.

    Raises:
        ValueError: If the string is not exactly three dot-separated numbers.
    """
    # fullmatch rather than match: this function is also used to validate
    # `--set`, so trailing text such as "1.2.3.4" or "1.2.3junk" must be
    # rejected instead of being silently accepted.
    match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)', version_str)
    if not match:
        raise ValueError(f"Invalid version format: {version_str}")
    return int(match.group(1)), int(match.group(2)), int(match.group(3))
|
||||||
|
|
||||||
|
|
||||||
|
def bump_version(current: str, bump_type: str) -> str:
    """Return ``current`` with its major, minor or patch component incremented.

    Components to the right of the bumped one are reset to zero. An
    unrecognised ``bump_type`` raises ``ValueError``.
    """
    major, minor, patch = parse_version(current)

    if bump_type == "patch":
        parts = (major, minor, patch + 1)
    elif bump_type == "minor":
        parts = (major, minor + 1, 0)
    elif bump_type == "major":
        parts = (major + 1, 0, 0)
    else:
        raise ValueError(f"Invalid bump type: {bump_type}")

    return ".".join(map(str, parts))
|
||||||
|
|
||||||
|
|
||||||
|
def update_version_in_file(pyproject_path: Path, new_version: str) -> None:
    """Rewrite the project version in pyproject.toml.

    Only the first ``version = "..."`` assignment that starts a line is
    replaced; every other line is written back unchanged.

    Args:
        pyproject_path: Path to the pyproject.toml file to modify in place.
        new_version: Version string to write between the quotes.

    Raises:
        ValueError: If the file contains no version assignment to replace.
    """
    content = pyproject_path.read_text(encoding="utf-8")
    # The pattern is anchored to the start of a line and limited to a single
    # substitution, so keys such as `target-version` or `python_version` in
    # tool sections are left alone. A function is used as the replacement so
    # that backslashes in new_version are never treated as regex escapes.
    updated_content, replaced = re.subn(
        r'^([ \t]*)version\s*=\s*"[^"]+"',
        lambda m: f'{m.group(1)}version = "{new_version}"',
        content,
        count=1,
        flags=re.MULTILINE,
    )
    if not replaced:
        # Fail loudly instead of rewriting the file without any change.
        raise ValueError("Could not find version in pyproject.toml")
    pyproject_path.write_text(updated_content, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: bump or set the version in pyproject.toml.

    Exactly one of the positional ``bump_type`` (major/minor/patch) or
    ``--set VERSION`` must be given. With ``--dry-run`` the new version is
    printed but the file is not modified. The script exits with status 1 when
    pyproject.toml is missing from the current directory or any step fails.
    """
    parser = argparse.ArgumentParser(
        description="Bump version in pyproject.toml",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python bump_version.py patch # 0.3.0 -> 0.3.1
python bump_version.py minor # 0.3.0 -> 0.4.0
python bump_version.py major # 0.3.0 -> 1.0.0
python bump_version.py --set 1.2.3 # Set specific version

Semantic versioning guide:
- patch: Bug fixes, no API changes
- minor: New features, backward compatible
- major: Breaking changes, not backward compatible
"""
    )

    # `bump_type` and `--set` are alternatives; the group requires one of them.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "bump_type",
        # nargs="?" makes the positional optional so that `--set` can be
        # used without it.
        nargs="?",
        choices=["major", "minor", "patch"],
        help="Type of version bump"
    )
    group.add_argument(
        "--set",
        dest="set_version",
        help="Set specific version (e.g., 1.2.3)"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be changed without making changes"
    )

    args = parser.parse_args()

    # Find pyproject.toml (looked up relative to the current working directory)
    pyproject_path = Path("pyproject.toml")
    if not pyproject_path.exists():
        print("❌ pyproject.toml not found in current directory")
        sys.exit(1)

    try:
        current_version = get_current_version(pyproject_path)
        print(f"📦 Current version: {current_version}")

        if args.set_version:
            # Validate the set version format (raises ValueError if invalid;
            # the parsed tuple itself is not needed)
            parse_version(args.set_version)
            new_version = args.set_version
        else:
            new_version = bump_version(current_version, args.bump_type)

        print(f"🚀 New version: {new_version}")

        if args.dry_run:
            print("🔍 Dry run - no changes made")
        else:
            update_version_in_file(pyproject_path, new_version)
            print("✅ Version updated in pyproject.toml")
            print()
            # The git commands below are only printed as a reminder; nothing
            # is executed.
            print("💡 Next steps:")
            print(" 1. Review changes: git diff")
            print(" 2. Commit changes: git add . && git commit -m 'bump version to {}'".format(new_version))
            print(" 3. Tag release: git tag v{}".format(new_version))

    except ValueError as e:
        # Raised by the helpers for a missing or malformed version.
        print(f"❌ Error: {e}")
        sys.exit(1)
    except Exception as e:
        # Anything else (for example an I/O failure) is reported the same way.
        print(f"❌ Unexpected error: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
69
docker-compose.yml
Normal file
69
docker-compose.yml
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
services:
  # Optional OpenSearch backend; only started when the "opensearch" profile
  # is enabled (e.g. `docker compose --profile opensearch up`).
  opensearch:
    # NOTE(review): ":2" is a floating major tag; consider pinning an exact
    # release for reproducible environments.
    image: opensearchproject/opensearch:2
    container_name: embeddingbuddy-opensearch
    profiles:
      - opensearch
    environment:
      - cluster.name=embeddingbuddy-cluster
      - node.name=embeddingbuddy-node
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
      - "DISABLE_INSTALL_DEMO_CONFIG=true"
      # NOTE(review): the security plugin is disabled while 9200/9600 are
      # published on all host interfaces below. Suitable for local
      # development only.
      - "DISABLE_SECURITY_PLUGIN=true"
    ulimits:
      # Unlimited memlock is needed for bootstrap.memory_lock=true above.
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - opensearch-data:/usr/share/opensearch/data
    ports:
      - "9200:9200"
      - "9600:9600"
    networks:
      - embeddingbuddy
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9200/_cluster/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s

  embeddingbuddy:
    build: .
    container_name: embeddingbuddy-app
    environment:
      - EMBEDDINGBUDDY_HOST=0.0.0.0
      - EMBEDDINGBUDDY_PORT=8050
      - EMBEDDINGBUDDY_DEBUG=false
      - OPENSEARCH_HOST=opensearch
      - OPENSEARCH_PORT=9200
      - OPENSEARCH_SCHEME=http
      - OPENSEARCH_VERIFY_CERTS=false
    ports:
      - "8050:8050"
    networks:
      - embeddingbuddy
    depends_on:
      opensearch:
        condition: service_healthy
        # The app must still start when the opensearch profile is not enabled.
        required: false
    healthcheck:
      # urllib is part of the standard library and raises on connection
      # failures and on HTTP 4xx/5xx responses, so the probe exits non-zero
      # when the app is serving errors. A bare requests.get() returns
      # normally for error statuses and would report "healthy".
      test: ["CMD-SHELL", "python -c 'import urllib.request; urllib.request.urlopen(\"http://localhost:8050/\", timeout=5)'"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    restart: unless-stopped

volumes:
  opensearch-data:
    driver: local

networks:
  embeddingbuddy:
    driver: bridge
|
Binary file not shown.
Before Width: | Height: | Size: 339 KiB After Width: | Height: | Size: 501 KiB |
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "embeddingbuddy"
|
name = "embeddingbuddy"
|
||||||
version = "0.3.0"
|
version = "0.4.0"
|
||||||
description = "A Python Dash application for interactive exploration and visualization of embedding vectors through dimensionality reduction techniques."
|
description = "A Python Dash application for interactive exploration and visualization of embedding vectors through dimensionality reduction techniques."
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
@@ -12,7 +12,6 @@ dependencies = [
|
|||||||
"scikit-learn>=1.3.2",
|
"scikit-learn>=1.3.2",
|
||||||
"dash-bootstrap-components>=1.5.0",
|
"dash-bootstrap-components>=1.5.0",
|
||||||
"umap-learn>=0.5.8",
|
"umap-learn>=0.5.8",
|
||||||
"numba>=0.56.4",
|
|
||||||
"openTSNE>=1.0.0",
|
"openTSNE>=1.0.0",
|
||||||
"mypy>=1.17.1",
|
"mypy>=1.17.1",
|
||||||
"opensearch-py>=3.0.0",
|
"opensearch-py>=3.0.0",
|
||||||
@@ -32,11 +31,14 @@ security = [
|
|||||||
"safety>=2.3.0",
|
"safety>=2.3.0",
|
||||||
"pip-audit>=2.6.0",
|
"pip-audit>=2.6.0",
|
||||||
]
|
]
|
||||||
|
prod = [
|
||||||
|
"gunicorn>=21.2.0",
|
||||||
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"embeddingbuddy[test,lint,security]",
|
"embeddingbuddy[test,lint,security]",
|
||||||
]
|
]
|
||||||
all = [
|
all = [
|
||||||
"embeddingbuddy[test,lint,security]",
|
"embeddingbuddy[test,lint,security,prod]",
|
||||||
]
|
]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
26
run_dev.py
Normal file
26
run_dev.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Development runner with auto-reload enabled.
|
||||||
|
This runs the Dash development server with hot reloading.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from src.embeddingbuddy.app import create_app, run_app
|
||||||
|
|
||||||
|
def main():
    """Start EmbeddingBuddy on the Dash development server with debug enabled."""
    # Force development settings
    os.environ.update(
        {
            "EMBEDDINGBUDDY_ENV": "development",
            "EMBEDDINGBUDDY_DEBUG": "true",
        }
    )

    startup_banner = (
        "🚀 Starting EmbeddingBuddy in development mode...",
        "📁 Auto-reload enabled - changes will trigger restart",
        "🌐 Server will be available at http://127.0.0.1:8050",
        "⏹️ Press Ctrl+C to stop",
    )
    for banner_line in startup_banner:
        print(banner_line)

    # debug=True runs the development server, which includes auto-reload.
    dash_app = create_app()
    run_app(dash_app, debug=True)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
49
run_prod.py
Normal file
49
run_prod.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Production runner using Gunicorn WSGI server.
|
||||||
|
This provides better performance and stability for production deployments.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from src.embeddingbuddy.config.settings import AppSettings
|
||||||
|
|
||||||
|
def main():
    """Launch the app under Gunicorn using the AppSettings GUNICORN_* values.

    Exits with status 0 on Ctrl+C and with status 1 when Gunicorn fails or
    is not installed.
    """
    # Force production settings
    os.environ["EMBEDDINGBUDDY_ENV"] = "production"
    os.environ["EMBEDDINGBUDDY_DEBUG"] = "false"

    settings = AppSettings

    print("🚀 Starting EmbeddingBuddy in production mode...")
    print(f"⚙️ Workers: {settings.GUNICORN_WORKERS}")
    print(f"🌐 Server will be available at http://{settings.GUNICORN_BIND}")
    print("⏹️ Press Ctrl+C to stop")

    # Gunicorn command: flag/value pairs first, WSGI target last.
    gunicorn_flags = (
        ("--workers", str(settings.GUNICORN_WORKERS)),
        ("--bind", settings.GUNICORN_BIND),
        ("--timeout", str(settings.GUNICORN_TIMEOUT)),
        ("--keep-alive", str(settings.GUNICORN_KEEPALIVE)),
        ("--access-logfile", "-"),
        ("--error-logfile", "-"),
        ("--log-level", "info"),
    )
    cmd = ["gunicorn"]
    for flag, value in gunicorn_flags:
        cmd.extend((flag, value))
    cmd.append("wsgi:application")

    try:
        subprocess.run(cmd, check=True)
    except FileNotFoundError:
        # The gunicorn executable could not be found.
        print("❌ Gunicorn not found. Install it with: uv add gunicorn")
        print("💡 Or run in development mode with: python run_dev.py")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running Gunicorn: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\n🛑 Shutting down...")
        sys.exit(0)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@@ -8,7 +8,15 @@ from .ui.callbacks.interactions import InteractionCallbacks
|
|||||||
|
|
||||||
|
|
||||||
def create_app():
|
def create_app():
|
||||||
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
import os
|
||||||
|
|
||||||
|
# Get the project root directory (two levels up from this file)
|
||||||
|
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
|
||||||
|
assets_path = os.path.join(project_root, "assets")
|
||||||
|
|
||||||
|
app = dash.Dash(
|
||||||
|
__name__, external_stylesheets=[dbc.themes.BOOTSTRAP], assets_folder=assets_path
|
||||||
|
)
|
||||||
|
|
||||||
# Allow callbacks to components that are dynamically created in tabs
|
# Allow callbacks to components that are dynamically created in tabs
|
||||||
app.config.suppress_callback_exceptions = True
|
app.config.suppress_callback_exceptions = True
|
||||||
@@ -20,9 +28,78 @@ def create_app():
|
|||||||
VisualizationCallbacks()
|
VisualizationCallbacks()
|
||||||
InteractionCallbacks()
|
InteractionCallbacks()
|
||||||
|
|
||||||
|
# Register client-side callback for embedding generation
|
||||||
|
_register_client_side_callbacks(app)
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
|
||||||
|
|
||||||
|
def _register_client_side_callbacks(app):
|
||||||
|
"""Register client-side callbacks for browser-based processing."""
|
||||||
|
from dash import Input, Output, State
|
||||||
|
|
||||||
|
# Client-side callback for embedding generation
|
||||||
|
app.clientside_callback(
|
||||||
|
"""
|
||||||
|
function(nClicks, textContent, modelName, tokenizationMethod, batchSize, category, subcategory) {
|
||||||
|
if (!nClicks || !textContent || !textContent.trim()) {
|
||||||
|
return window.dash_clientside.no_update;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('🔍 Checking for Transformers.js...');
|
||||||
|
console.log('window.dash_clientside:', typeof window.dash_clientside);
|
||||||
|
console.log('window.dash_clientside.transformers:', typeof window.dash_clientside?.transformers);
|
||||||
|
console.log('generateEmbeddings function:', typeof window.dash_clientside?.transformers?.generateEmbeddings);
|
||||||
|
|
||||||
|
if (typeof window.dash_clientside !== 'undefined' &&
|
||||||
|
typeof window.dash_clientside.transformers !== 'undefined' &&
|
||||||
|
typeof window.dash_clientside.transformers.generateEmbeddings === 'function') {
|
||||||
|
|
||||||
|
console.log('✅ Calling Transformers.js generateEmbeddings...');
|
||||||
|
return window.dash_clientside.transformers.generateEmbeddings(
|
||||||
|
nClicks, textContent, modelName, tokenizationMethod, category, subcategory
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// More detailed error information
|
||||||
|
let errorMsg = '❌ Transformers.js not available. ';
|
||||||
|
if (typeof window.dash_clientside === 'undefined') {
|
||||||
|
errorMsg += 'dash_clientside not found.';
|
||||||
|
} else if (typeof window.dash_clientside.transformers === 'undefined') {
|
||||||
|
errorMsg += 'transformers module not found.';
|
||||||
|
} else if (typeof window.dash_clientside.transformers.generateEmbeddings !== 'function') {
|
||||||
|
errorMsg += 'generateEmbeddings function not found.';
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(errorMsg);
|
||||||
|
|
||||||
|
return [
|
||||||
|
{ error: 'Transformers.js not loaded. Please refresh the page and try again.' },
|
||||||
|
errorMsg + ' Please refresh the page.',
|
||||||
|
'danger',
|
||||||
|
false
|
||||||
|
];
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
[
|
||||||
|
Output("embeddings-generated-trigger", "data"),
|
||||||
|
Output("text-input-status-immediate", "children"),
|
||||||
|
Output("text-input-status-immediate", "color"),
|
||||||
|
Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
|
||||||
|
],
|
||||||
|
[Input("generate-embeddings-btn", "n_clicks")],
|
||||||
|
[
|
||||||
|
State("text-input-area", "value"),
|
||||||
|
State("model-selection", "value"),
|
||||||
|
State("tokenization-method", "value"),
|
||||||
|
State("batch-size", "value"),
|
||||||
|
State("text-category", "value"),
|
||||||
|
State("text-subcategory", "value"),
|
||||||
|
],
|
||||||
|
prevent_initial_call=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def run_app(app=None, debug=None, host=None, port=None):
|
def run_app(app=None, debug=None, host=None, port=None):
|
||||||
if app is None:
|
if app is None:
|
||||||
app = create_app()
|
app = create_app()
|
||||||
|
@@ -73,12 +73,88 @@ class AppSettings:
|
|||||||
HOST = os.getenv("EMBEDDINGBUDDY_HOST", "127.0.0.1")
|
HOST = os.getenv("EMBEDDINGBUDDY_HOST", "127.0.0.1")
|
||||||
PORT = int(os.getenv("EMBEDDINGBUDDY_PORT", "8050"))
|
PORT = int(os.getenv("EMBEDDINGBUDDY_PORT", "8050"))
|
||||||
|
|
||||||
|
# Environment Configuration
|
||||||
|
ENVIRONMENT = os.getenv(
|
||||||
|
"EMBEDDINGBUDDY_ENV", "development"
|
||||||
|
) # development, production
|
||||||
|
|
||||||
|
# WSGI Server Configuration (for production)
|
||||||
|
GUNICORN_WORKERS = int(os.getenv("GUNICORN_WORKERS", "4"))
|
||||||
|
GUNICORN_BIND = os.getenv("GUNICORN_BIND", f"{HOST}:{PORT}")
|
||||||
|
GUNICORN_TIMEOUT = int(os.getenv("GUNICORN_TIMEOUT", "120"))
|
||||||
|
GUNICORN_KEEPALIVE = int(os.getenv("GUNICORN_KEEPALIVE", "5"))
|
||||||
|
|
||||||
# OpenSearch Configuration
|
# OpenSearch Configuration
|
||||||
OPENSEARCH_DEFAULT_SIZE = 100
|
OPENSEARCH_DEFAULT_SIZE = 100
|
||||||
OPENSEARCH_SAMPLE_SIZE = 5
|
OPENSEARCH_SAMPLE_SIZE = 5
|
||||||
OPENSEARCH_CONNECTION_TIMEOUT = 30
|
OPENSEARCH_CONNECTION_TIMEOUT = 30
|
||||||
OPENSEARCH_VERIFY_CERTS = True
|
OPENSEARCH_VERIFY_CERTS = True
|
||||||
|
|
||||||
|
# Text Input / Transformers.js Configuration
|
||||||
|
DEFAULT_EMBEDDING_MODEL = "Xenova/all-mpnet-base-v2"
|
||||||
|
MAX_TEXT_LENGTH = 50000 # Characters (browser memory limits)
|
||||||
|
DEFAULT_TOKENIZATION_METHOD = "sentence"
|
||||||
|
MAX_BATCH_SIZE = 8 # Process in smaller batches for memory management
|
||||||
|
|
||||||
|
# Available Transformers.js compatible models
|
||||||
|
AVAILABLE_MODELS = [
|
||||||
|
{
|
||||||
|
"name": "Xenova/all-mpnet-base-v2",
|
||||||
|
"label": "All-MPNet-Base-v2 (Quality, 768d)",
|
||||||
|
"description": "Higher quality embeddings with better semantic understanding",
|
||||||
|
"dimensions": 768,
|
||||||
|
"size": "109 MB",
|
||||||
|
"context_length": 512,
|
||||||
|
"multilingual": False,
|
||||||
|
"default": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Xenova/all-MiniLM-L6-v2",
|
||||||
|
"label": "All-MiniLM-L6-v2 (Fast, 384d)",
|
||||||
|
"description": "Lightweight model, good for quick testing and general purpose",
|
||||||
|
"dimensions": 384,
|
||||||
|
"size": "23 MB",
|
||||||
|
"context_length": 512,
|
||||||
|
"multilingual": False,
|
||||||
|
"default": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
|
||||||
|
"label": "Multilingual MiniLM (50+ languages)",
|
||||||
|
"description": "Support for multiple languages with good performance",
|
||||||
|
"dimensions": 384,
|
||||||
|
"size": "127 MB",
|
||||||
|
"context_length": 512,
|
||||||
|
"multilingual": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Xenova/bge-small-en-v1.5",
|
||||||
|
"label": "BGE Small English (High quality, 384d)",
|
||||||
|
"description": "Beijing Academy of AI model with excellent performance on retrieval tasks",
|
||||||
|
"dimensions": 384,
|
||||||
|
"size": "67 MB",
|
||||||
|
"context_length": 512,
|
||||||
|
"multilingual": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Xenova/gte-small",
|
||||||
|
"label": "GTE Small (General Text Embeddings, 384d)",
|
||||||
|
"description": "Alibaba's general text embedding model, balanced performance",
|
||||||
|
"dimensions": 384,
|
||||||
|
"size": "67 MB",
|
||||||
|
"context_length": 512,
|
||||||
|
"multilingual": False,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Browser compatibility requirements
|
||||||
|
SUPPORTED_BROWSERS = {
|
||||||
|
"chrome": ">=88",
|
||||||
|
"firefox": ">=92",
|
||||||
|
"safari": ">=15.4",
|
||||||
|
"edge": ">=88",
|
||||||
|
}
|
||||||
|
|
||||||
# Bootstrap Theme
|
# Bootstrap Theme
|
||||||
EXTERNAL_STYLESHEETS = [
|
EXTERNAL_STYLESHEETS = [
|
||||||
"https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
|
"https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
|
||||||
|
@@ -63,6 +63,90 @@ class DataProcessor:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
|
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
|
||||||
|
|
||||||
|
def process_client_embeddings(self, embeddings_data: dict) -> ProcessedData:
    """Process embeddings data received from client-side JavaScript.

    Args:
        embeddings_data: Either ``{"error": message}`` or a dict with two
            parallel lists, ``"documents"`` (dicts of document fields) and
            ``"embeddings"`` (one vector per document).

    Returns:
        A ProcessedData whose ``documents`` and ``embeddings`` rows correspond
        one-to-one. On any failure the result has empty documents, an empty
        array and ``error`` set; this method does not raise.
    """
    try:
        if "error" in embeddings_data:
            return ProcessedData(
                documents=[],
                embeddings=np.array([]),
                error=embeddings_data["error"],
            )

        # Extract documents and embeddings from client data
        documents_data = embeddings_data.get("documents", [])
        embeddings_list = embeddings_data.get("embeddings", [])

        if not documents_data or not embeddings_list:
            return ProcessedData(
                documents=[],
                embeddings=np.array([]),
                error="No documents or embeddings in client data",
            )

        if len(documents_data) != len(embeddings_list):
            return ProcessedData(
                documents=[],
                embeddings=np.array([]),
                error="Mismatch between number of documents and embeddings",
            )

        # Convert embeddings to numpy array first
        try:
            embeddings = np.array(embeddings_list)

            # One row per document is required.
            if embeddings.ndim != 2:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="Invalid embedding dimensions",
                )

        except Exception as e:
            return ProcessedData(
                documents=[],
                embeddings=np.array([]),
                error=f"Error processing embeddings: {str(e)}",
            )

        # Convert to Document objects, remembering which embedding row each
        # accepted document came from.
        documents = []
        kept_rows = []
        for i, doc_data in enumerate(documents_data):
            try:
                # Skip if we don't have a corresponding embedding
                if i >= len(embeddings):
                    continue

                # Skip documents without text
                text = doc_data.get("text")
                if not text or not text.strip():
                    continue

                # Work on a copy so the caller's payload is not mutated.
                fields = dict(doc_data)
                if not fields.get("id"):
                    fields["id"] = f"text_input_{i}"
                fields["embedding"] = embeddings[i].tolist()

                documents.append(Document(**fields))
                kept_rows.append(i)
            except Exception:
                # Skip invalid documents but continue processing
                continue

        if not documents:
            return ProcessedData(
                documents=[],
                embeddings=np.array([]),
                error="No valid documents found in client data",
            )

        # Select exactly the rows of the accepted documents. Taking the first
        # len(documents) rows would pair documents with the wrong vectors
        # whenever an earlier document was skipped.
        valid_embeddings = embeddings[kept_rows]

        return ProcessedData(documents=documents, embeddings=valid_embeddings)

    except Exception as e:
        return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
|
||||||
|
|
||||||
def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
|
def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
|
||||||
if not documents:
|
if not documents:
|
||||||
return np.array([])
|
return np.array([])
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
from dash import callback, Input, Output, State, no_update
|
from dash import callback, Input, Output, State, no_update, html
|
||||||
from ...data.processor import DataProcessor
|
from ...data.processor import DataProcessor
|
||||||
from ...data.sources.opensearch import OpenSearchClient
|
from ...data.sources.opensearch import OpenSearchClient
|
||||||
from ...models.field_mapper import FieldMapper
|
from ...models.field_mapper import FieldMapper
|
||||||
@@ -87,6 +87,8 @@ class DataProcessingCallbacks:
|
|||||||
|
|
||||||
if active_tab == "opensearch-tab":
|
if active_tab == "opensearch-tab":
|
||||||
return [datasource.create_opensearch_tab()]
|
return [datasource.create_opensearch_tab()]
|
||||||
|
elif active_tab == "text-input-tab":
|
||||||
|
return [datasource.create_text_input_tab()]
|
||||||
else:
|
else:
|
||||||
return [datasource.create_file_upload_tab()]
|
return [datasource.create_file_upload_tab()]
|
||||||
|
|
||||||
@@ -97,6 +99,9 @@ class DataProcessingCallbacks:
|
|||||||
# Register collapsible section callbacks
|
# Register collapsible section callbacks
|
||||||
self._register_collapse_callbacks()
|
self._register_collapse_callbacks()
|
||||||
|
|
||||||
|
# Register text input callbacks
|
||||||
|
self._register_text_input_callbacks()
|
||||||
|
|
||||||
def _register_opensearch_callbacks(self, section_type, opensearch_client):
|
def _register_opensearch_callbacks(self, section_type, opensearch_client):
|
||||||
"""Register callbacks for a specific section (data or prompts)."""
|
"""Register callbacks for a specific section (data or prompts)."""
|
||||||
|
|
||||||
@@ -463,6 +468,224 @@ class DataProcessingCallbacks:
|
|||||||
return new_state, icon_class
|
return new_state, icon_class
|
||||||
return is_open, "fas fa-chevron-down me-2"
|
return is_open, "fas fa-chevron-down me-2"
|
||||||
|
|
||||||
|
def _register_text_input_callbacks(self):
|
||||||
|
"""Register callbacks for text input functionality."""
|
||||||
|
|
||||||
|
# Text length counter callback
|
||||||
|
@callback(
|
||||||
|
Output("text-length-counter", "children"),
|
||||||
|
Input("text-input-area", "value"),
|
||||||
|
prevent_initial_call=False,
|
||||||
|
)
|
||||||
|
def update_text_length_counter(text_value):
|
||||||
|
if not text_value:
|
||||||
|
return "0"
|
||||||
|
return f"{len(text_value):,}"
|
||||||
|
|
||||||
|
# Generate button enable/disable callback
|
||||||
|
@callback(
|
||||||
|
[
|
||||||
|
Output("generate-embeddings-btn", "disabled"),
|
||||||
|
Output("generation-help", "children"),
|
||||||
|
Output("generation-help", "color"),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
Input("text-input-area", "value"),
|
||||||
|
Input("model-selection", "value"),
|
||||||
|
],
|
||||||
|
prevent_initial_call=False,
|
||||||
|
)
|
||||||
|
def toggle_generate_button(text_value, model_name):
|
||||||
|
import dash_bootstrap_components as dbc
|
||||||
|
|
||||||
|
if not text_value or not text_value.strip():
|
||||||
|
return (
|
||||||
|
True,
|
||||||
|
dbc.Alert(
|
||||||
|
[
|
||||||
|
html.I(className="fas fa-info-circle me-2"),
|
||||||
|
"Enter some text above to enable embedding generation.",
|
||||||
|
],
|
||||||
|
color="light",
|
||||||
|
),
|
||||||
|
"light",
|
||||||
|
)
|
||||||
|
|
||||||
|
if not model_name:
|
||||||
|
return (
|
||||||
|
True,
|
||||||
|
dbc.Alert(
|
||||||
|
[
|
||||||
|
html.I(className="fas fa-exclamation-triangle me-2"),
|
||||||
|
"Select an embedding model to continue.",
|
||||||
|
],
|
||||||
|
color="warning",
|
||||||
|
),
|
||||||
|
"warning",
|
||||||
|
)
|
||||||
|
|
||||||
|
text_length = len(text_value.strip())
|
||||||
|
if text_length > AppSettings.MAX_TEXT_LENGTH:
|
||||||
|
return (
|
||||||
|
True,
|
||||||
|
dbc.Alert(
|
||||||
|
[
|
||||||
|
html.I(className="fas fa-exclamation-triangle me-2"),
|
||||||
|
f"Text too long ({text_length:,} characters). Maximum allowed: {AppSettings.MAX_TEXT_LENGTH:,} characters.",
|
||||||
|
],
|
||||||
|
color="danger",
|
||||||
|
),
|
||||||
|
"danger",
|
||||||
|
)
|
||||||
|
|
||||||
|
return (
|
||||||
|
False,
|
||||||
|
dbc.Alert(
|
||||||
|
[
|
||||||
|
html.I(className="fas fa-check-circle me-2"),
|
||||||
|
f"Ready to generate embeddings for {text_length:,} characters using {model_name}.",
|
||||||
|
],
|
||||||
|
color="success",
|
||||||
|
),
|
||||||
|
"success",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Clear text callback
|
||||||
|
@callback(
    Output("text-input-area", "value"),
    [Input("clear-text-btn", "n_clicks"), Input("load-sample-btn", "n_clicks")],
    prevent_initial_call=True,
)
def handle_text_input_actions(clear_clicks, load_clicks):
    """Clear the textarea or fill it with sample text, depending on the button.

    Args:
        clear_clicks: ``n_clicks`` of ``clear-text-btn``.
        load_clicks: ``n_clicks`` of ``load-sample-btn``.

    Returns:
        ``""`` for a clear click, the sample text for a load click, and
        ``no_update`` when nothing relevant triggered the callback.
    """
    from dash import ctx

    fired = ctx.triggered
    if not fired:
        return no_update

    # prop_id has the form "<component-id>.<property>"; keep the id part.
    source_id = fired[0]["prop_id"].split(".")[0]

    if source_id == "clear-text-btn":
        return "" if clear_clicks else no_update
    if source_id == "load-sample-btn":
        return self._load_sample_text() if load_clicks else no_update
    return no_update
|
||||||
|
|
||||||
|
# Model info callback
|
||||||
|
@callback(
    Output("model-info", "children"),
    Input("model-selection", "value"),
    prevent_initial_call=False,
)
def update_model_info(model_name):
    """Render the description panel for the selected embedding model.

    Args:
        model_name: Current value of the ``model-selection`` component.

    Returns:
        A muted ``html.Span`` when no model is selected or the name is not
        found in ``AppSettings().AVAILABLE_MODELS``; otherwise an
        ``html.Div`` summarising the matching entry.
    """
    if not model_name:
        return html.Span("Please select a model", className="text-muted")

    from ...config.settings import AppSettings

    catalog = AppSettings().AVAILABLE_MODELS
    # First catalog entry whose "name" equals the selected value, if any.
    entry = next((item for item in catalog if item["name"] == model_name), None)
    if entry is None:
        return html.Span("Model information not available", className="text-muted")

    dimensions = entry["dimensions"]
    context_length = entry["context_length"]
    headline = html.Strong(
        f"Dimensions: {dimensions} | Context Length: {context_length}"
    )
    description = html.Span(entry["description"])
    multilingual = "Yes" if entry.get("multilingual", False) else "No"
    footnote = html.Small(
        f"Multilingual: {multilingual} | Size: {entry['size']}",
        className="text-muted",
    )
    return html.Div([headline, html.Br(), description, html.Br(), footnote])
|
||||||
|
|
||||||
|
# Process client-side embeddings result callback
|
||||||
|
@callback(
    [
        Output("processed-data", "data", allow_duplicate=True),
        Output("text-input-status", "children"),
        Output("text-input-status", "color"),
        Output("text-input-status", "style"),
        Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
    ],
    [Input("embeddings-generated-trigger", "data")],
    prevent_initial_call=True,
)
def process_embeddings_result(embeddings_data):
    """Turn the client-side embeddings payload into app data and a status.

    Args:
        embeddings_data: Value of the ``embeddings-generated-trigger`` store.

    Returns:
        A 5-tuple matching the declared outputs: the ``processed-data``
        payload, the status text, the status color, the status style and
        the disabled flag of the generate button. All five are
        ``no_update`` when the store is empty.
    """
    if not embeddings_data:
        return (no_update,) * 5

    outcome = self.processor.process_client_embeddings(embeddings_data)
    shown = {"display": "block"}

    failure = outcome.error
    if failure:
        # Surface the processor error both in the data store and the alert.
        return {"error": failure}, f"❌ Error: {failure}", "danger", shown, False

    payload = {
        "documents": [self._document_to_dict(item) for item in outcome.documents],
        "embeddings": outcome.embeddings.tolist(),
    }
    summary = f"✅ Generated embeddings for {len(outcome.documents)} text chunks"
    return payload, summary, "success", shown, False
|
||||||
|
|
||||||
|
def _load_sample_text(self):
|
||||||
|
"""Load sample text from assets/sample-txt.md file."""
|
||||||
|
import os
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get the project root directory (four levels up from this file)
|
||||||
|
current_file = os.path.abspath(__file__)
|
||||||
|
project_root = os.path.dirname(
|
||||||
|
os.path.dirname(
|
||||||
|
os.path.dirname(os.path.dirname(os.path.dirname(current_file)))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
sample_file_path = os.path.join(project_root, "assets", "sample-txt.md")
|
||||||
|
|
||||||
|
if os.path.exists(sample_file_path):
|
||||||
|
with open(sample_file_path, "r", encoding="utf-8") as file:
|
||||||
|
return file.read()
|
||||||
|
else:
|
||||||
|
# Fallback sample text if file doesn't exist
|
||||||
|
return """The sun peeked through the clouds after a drizzly morning.
|
||||||
|
A gentle breeze rustled the leaves as we walked along the shoreline.
|
||||||
|
Heavy rains caused flooding in several low-lying neighborhoods.
|
||||||
|
It was so hot that even the birds sought shade under the palm trees.
|
||||||
|
By midnight, the temperature had dropped below freezing.
|
||||||
|
|
||||||
|
The new smartphone features a foldable display and 5G connectivity.
|
||||||
|
In the world of AI, transformers have revolutionized natural language processing.
|
||||||
|
Quantum computing promises to solve problems beyond classical computers' reach.
|
||||||
|
Blockchain technology is being explored for secure voting systems.
|
||||||
|
Virtual reality headsets are becoming more affordable and accessible.
|
||||||
|
|
||||||
|
Preheat the oven to 375°F before you start mixing the batter.
|
||||||
|
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
|
||||||
|
A pinch of saffron adds a beautiful color and aroma to traditional paella.
|
||||||
|
If the soup is too salty, add a peeled potato to absorb excess sodium.
|
||||||
|
Let the bread dough rise for at least an hour in a warm, draft-free spot."""
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
# Return a simple fallback if there's any error
|
||||||
|
return "This is sample text for testing embedding generation. You can replace this with your own text."
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _document_to_dict(doc):
|
def _document_to_dict(doc):
|
||||||
return {
|
return {
|
||||||
|
@@ -1,11 +1,13 @@
|
|||||||
from dash import dcc, html
|
from dash import dcc, html
|
||||||
import dash_bootstrap_components as dbc
|
import dash_bootstrap_components as dbc
|
||||||
from .upload import UploadComponent
|
from .upload import UploadComponent
|
||||||
|
from .textinput import TextInputComponent
|
||||||
|
|
||||||
|
|
||||||
class DataSourceComponent:
|
class DataSourceComponent:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.upload_component = UploadComponent()
|
self.upload_component = UploadComponent()
|
||||||
|
self.text_input_component = TextInputComponent()
|
||||||
|
|
||||||
def create_tabbed_interface(self):
|
def create_tabbed_interface(self):
|
||||||
"""Create tabbed interface for different data sources."""
|
"""Create tabbed interface for different data sources."""
|
||||||
@@ -17,6 +19,7 @@ class DataSourceComponent:
|
|||||||
[
|
[
|
||||||
dbc.Tab(label="File Upload", tab_id="file-tab"),
|
dbc.Tab(label="File Upload", tab_id="file-tab"),
|
||||||
dbc.Tab(label="OpenSearch", tab_id="opensearch-tab"),
|
dbc.Tab(label="OpenSearch", tab_id="opensearch-tab"),
|
||||||
|
dbc.Tab(label="Text Input", tab_id="text-input-tab"),
|
||||||
],
|
],
|
||||||
id="data-source-tabs",
|
id="data-source-tabs",
|
||||||
active_tab="file-tab",
|
active_tab="file-tab",
|
||||||
@@ -208,6 +211,10 @@ class DataSourceComponent:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def create_text_input_tab(self):
    """Return the text-input tab content wrapped in a single ``html.Div``."""
    text_input_panel = self.text_input_component.create_text_input_interface()
    return html.Div([text_input_panel])
|
||||||
|
|
||||||
def _create_opensearch_section(self, section_type):
|
def _create_opensearch_section(self, section_type):
|
||||||
"""Create a complete OpenSearch section for either 'data' or 'prompts'."""
|
"""Create a complete OpenSearch section for either 'data' or 'prompts'."""
|
||||||
section_id = section_type # 'data' or 'prompts'
|
section_id = section_type # 'data' or 'prompts'
|
||||||
|
402
src/embeddingbuddy/ui/components/textinput.py
Normal file
402
src/embeddingbuddy/ui/components/textinput.py
Normal file
@@ -0,0 +1,402 @@
|
|||||||
|
"""Text input component for generating embeddings from user text."""
|
||||||
|
|
||||||
|
import dash_bootstrap_components as dbc
|
||||||
|
from dash import dcc, html
|
||||||
|
|
||||||
|
from embeddingbuddy.config.settings import AppSettings
|
||||||
|
|
||||||
|
|
||||||
|
class TextInputComponent:
|
||||||
|
"""Component for text input and embedding generation."""
|
||||||
|
|
||||||
|
def __init__(self):
    """Create the component and attach a fresh ``AppSettings`` instance."""
    app_settings = AppSettings()
    self.settings = app_settings
|
||||||
|
|
||||||
|
def create_text_input_interface(self):
    """Assemble the full text-input panel.

    Returns:
        An ``html.Div`` (class ``p-3``) containing, in order: model
        selection, the textarea, its action buttons, processing options,
        generation controls, progress indicators, the status section and
        the hidden stores, with horizontal rules between groups.
    """
    sections = [
        self._create_model_selection(),
        html.Hr(),
        # Textarea and its Load Sample / Clear buttons share one group.
        self._create_text_input_area(),
        self._create_text_action_buttons(),
        html.Hr(),
        self._create_processing_options(),
        html.Hr(),
        self._create_generation_controls(),
        html.Hr(),
        self._create_progress_indicators(),
        html.Hr(),
        # Status alerts are followed directly by the hidden data stores.
        self._create_status_section(),
        self._create_hidden_components(),
    ]
    return html.Div(sections, className="p-3")
|
||||||
|
|
||||||
|
def _create_model_selection(self):
    """Build the model dropdown and the info alert beneath it.

    Returns:
        An ``html.Div`` with a heading, the ``model-selection`` dropdown
        (one option per entry of ``settings.AVAILABLE_MODELS``, preselected
        to ``settings.DEFAULT_EMBEDDING_MODEL``) and an info alert holding
        the ``model-info`` container.
    """

    def option_label(entry):
        # Entries flagged as default get a "(Recommended)" suffix.
        suffix = " (Recommended)" if entry.get("default", False) else ""
        return f"{entry['label']} - {entry['size']}{suffix}"

    model_options = [
        {"label": option_label(entry), "value": entry["name"]}
        for entry in self.settings.AVAILABLE_MODELS
    ]
    default_model = self.settings.DEFAULT_EMBEDDING_MODEL

    selector = dcc.Dropdown(
        id="model-selection",
        options=model_options,
        value=default_model,
        placeholder="Select an embedding model...",
        className="mb-2",
    )
    info_panel = dbc.Alert(
        [
            html.Div(
                id="model-info",
                children=self._get_model_description(default_model),
            )
        ],
        color="info",
        className="small",
    )

    return html.Div(
        [
            html.H5("Embedding Model", className="mb-3"),
            html.Div([selector, info_panel]),
        ]
    )
|
||||||
|
|
||||||
|
def _create_text_input_area(self):
    """Build the textarea and its character-count caption.

    Returns:
        An ``html.Div`` with a heading, the ``text-input-area`` textarea
        (capped at ``settings.MAX_TEXT_LENGTH`` characters) and a caption
        whose running count lives in ``text-length-counter``.
    """
    max_length = self.settings.MAX_TEXT_LENGTH

    textarea = dcc.Textarea(
        id="text-input-area",
        placeholder="Paste your text here... Each sentence, paragraph, or line will become a separate data point depending on your tokenization method below.",
        value="",
        # Dash style dictionaries take camelCased CSS property names
        # (fontFamily / fontSize), not the hyphenated CSS spelling.
        style={
            "width": "100%",
            "height": "300px",
            "resize": "vertical",
            "fontFamily": "monospace",
            "fontSize": "14px",
        },
        maxLength=max_length,
        className="form-control",
    )

    return html.Div(
        [
            html.H5("Text Input", className="mb-3"),
            textarea,
            html.Small(
                f"Maximum {max_length:,} characters. Current: ",
                className="text-muted",
            ),
            html.Small(
                id="text-length-counter",
                children="0",
                className="text-muted fw-bold",
            ),
            html.Small(" characters", className="text-muted"),
        ]
    )
|
||||||
|
|
||||||
|
def _create_text_action_buttons(self):
    """Build the "Load Sample Text" and "Clear Text" buttons.

    Returns:
        An ``html.Div`` holding one row with two half-width columns: the
        ``load-sample-btn`` button and the ``clear-text-btn`` button.
    """
    load_button = dbc.Button(
        [html.I(className="fas fa-file-text me-2"), "Load Sample Text"],
        id="load-sample-btn",
        color="info",
        size="sm",
        className="w-100",
    )
    clear_button = dbc.Button(
        [html.I(className="fas fa-trash me-2"), "Clear Text"],
        id="clear-text-btn",
        # dbc.Button takes a Bootstrap colour name in `color` and selects
        # the outlined variant through `outline`; "outline-secondary" is not
        # a recognised colour name.
        color="secondary",
        outline=True,
        size="sm",
        className="w-100",
    )

    return html.Div(
        [
            dbc.Row(
                [
                    dbc.Col([load_button], md=6),
                    dbc.Col([clear_button], md=6),
                ],
                className="mt-2 mb-3",
            )
        ]
    )
|
||||||
|
|
||||||
|
def _create_processing_options(self):
    """Build the splitting-method, batch-size and category controls.

    Returns:
        An ``html.Div`` with a heading and two rows of half-width columns:
        the ``tokenization-method`` and ``batch-size`` dropdowns, then the
        ``text-category`` and ``text-subcategory`` text inputs.
    """
    splitting_choices = [
        {"label": "Sentences (split on . ! ?)", "value": "sentence"},
        {"label": "Paragraphs (split on double newline)", "value": "paragraph"},
        {"label": "Lines (split on single newline)", "value": "manual"},
        {"label": "Entire text as one document", "value": "whole"},
    ]
    batch_choices = [
        {"label": "Small batches (4) - Lower memory", "value": 4},
        {"label": "Medium batches (8) - Balanced", "value": 8},
        {"label": "Large batches (16) - Faster", "value": 16},
    ]

    def labelled_column(caption, control):
        # Every option is a caption stacked on its control, half-width on md+.
        return dbc.Col(
            [html.Label(caption, className="form-label"), control],
            md=6,
        )

    splitting_column = labelled_column(
        "Text Splitting Method:",
        dcc.Dropdown(
            id="tokenization-method",
            options=splitting_choices,
            value=self.settings.DEFAULT_TOKENIZATION_METHOD,
            className="mb-3",
        ),
    )
    batch_column = labelled_column(
        "Batch Size:",
        dcc.Dropdown(
            id="batch-size",
            options=batch_choices,
            value=self.settings.MAX_BATCH_SIZE,
            className="mb-3",
        ),
    )
    category_column = labelled_column(
        "Category (Optional):",
        dcc.Input(
            id="text-category",
            type="text",
            placeholder="e.g., Notes, Articles, Ideas...",
            value="Text Input",
            className="form-control mb-3",
        ),
    )
    subcategory_column = labelled_column(
        "Subcategory (Optional):",
        dcc.Input(
            id="text-subcategory",
            type="text",
            placeholder="e.g., Meeting Notes, Research...",
            value="Generated",
            className="form-control mb-3",
        ),
    )

    return html.Div(
        [
            html.H5("Processing Options", className="mb-3"),
            dbc.Row([splitting_column, batch_column]),
            dbc.Row([category_column, subcategory_column]),
        ]
    )
|
||||||
|
|
||||||
|
def _create_generation_controls(self):
    """Build the generate button and the help alert shown under it.

    Returns:
        An ``html.Div`` with a heading, a full-width row containing the
        initially disabled ``generate-embeddings-btn`` button, and the
        ``generation-help`` alert.
    """
    generate_button = dbc.Button(
        [html.I(className="fas fa-magic me-2"), "Generate Embeddings"],
        id="generate-embeddings-btn",
        color="primary",
        size="lg",
        disabled=True,
        className="w-100",
    )
    help_alert = dbc.Alert(
        [
            html.I(className="fas fa-info-circle me-2"),
            "Enter some text above and select a model to enable embedding generation.",
        ],
        color="light",
        className="mt-3",
        id="generation-help",
    )

    return html.Div(
        [
            html.H5("Generate Embeddings", className="mb-3"),
            dbc.Row([dbc.Col([generate_button], md=12)]),
            html.Div([help_alert]),
        ]
    )
|
||||||
|
|
||||||
|
def _create_progress_indicators(self):
    """Build the two initially hidden progress sections.

    Returns:
        An ``html.Div`` containing the ``model-loading-section`` block, a
        line break and the ``embedding-progress-section`` block. Each block
        has a title, a striped animated progress bar at 0 and a status line.
    """

    def progress_section(title, bar_id, status_id, idle_text, section_id):
        # Both sections share this layout and start with display: none.
        return html.Div(
            [
                html.H6(title, className="mb-2"),
                dbc.Progress(
                    id=bar_id,
                    value=0,
                    striped=True,
                    animated=True,
                    className="mb-2",
                ),
                html.Small(id=status_id, children=idle_text, className="text-muted"),
            ],
            id=section_id,
            style={"display": "none"},
        )

    model_loading = progress_section(
        "Model Loading Progress",
        "model-loading-progress",
        "model-loading-status",
        "No model loading in progress",
        "model-loading-section",
    )
    embedding_generation = progress_section(
        "Embedding Generation Progress",
        "embedding-progress",
        "embedding-status",
        "No embedding generation in progress",
        "embedding-progress-section",
    )
    return html.Div([model_loading, html.Br(), embedding_generation])
|
||||||
|
|
||||||
|
def _create_status_section(self):
    """Build the status alerts and the results-preview container.

    Returns:
        An ``html.Div`` with the visible ``text-input-status-immediate``
        alert, the initially hidden ``text-input-status`` alert and an
        empty ``embedding-results-preview`` div.
    """
    # Alert that starts visible with a "ready" message.
    immediate_status = dbc.Alert(
        id="text-input-status-immediate",
        children="Ready to generate embeddings",
        color="light",
        className="mb-3",
    )
    # Alert that starts empty and hidden.
    server_status = dbc.Alert(
        id="text-input-status",
        children="",
        color="light",
        className="mb-3",
        style={"display": "none"},
    )
    results_preview = html.Div(id="embedding-results-preview")

    return html.Div([immediate_status, server_status, results_preview])
|
||||||
|
|
||||||
|
def _create_hidden_components(self):
    """Build the ``dcc.Store`` components used to pass data around.

    Returns:
        An ``html.Div`` holding the ``embeddings-generated-trigger`` and
        ``tokenization-preview-data`` stores, in that order.
    """
    store_ids = ("embeddings-generated-trigger", "tokenization-preview-data")
    return html.Div([dcc.Store(id=store_id) for store_id in store_ids])
|
||||||
|
|
||||||
|
def _get_model_description(self, model_name):
    """Render the description of one model from ``settings.AVAILABLE_MODELS``.

    Args:
        model_name: Value compared against each entry's ``"name"`` key.

    Returns:
        An ``html.Div`` summarising the first matching entry, or a muted
        ``html.Span`` when no entry matches.
    """
    entry = next(
        (item for item in self.settings.AVAILABLE_MODELS if item["name"] == model_name),
        None,
    )
    if entry is None:
        return html.Span("Model information not available", className="text-muted")

    dimensions = entry["dimensions"]
    context_length = entry["context_length"]
    headline = html.Strong(
        f"Dimensions: {dimensions} | Context Length: {context_length}"
    )
    description = html.Span(entry["description"])
    multilingual = "Yes" if entry.get("multilingual", False) else "No"
    footnote = html.Small(
        f"Multilingual: {multilingual} | Size: {entry['size']}",
        className="text-muted",
    )
    return html.Div([headline, html.Br(), description, html.Br(), footnote])
|
@@ -20,6 +20,15 @@ class AppLayout:
|
|||||||
dbc.Col(
|
dbc.Col(
|
||||||
[
|
[
|
||||||
html.H1("EmbeddingBuddy", className="text-center mb-4"),
|
html.H1("EmbeddingBuddy", className="text-center mb-4"),
|
||||||
|
# Load Transformers.js from CDN
|
||||||
|
html.Script(
|
||||||
|
"""
|
||||||
|
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';
|
||||||
|
window.transformersPipeline = pipeline;
|
||||||
|
console.log('✅ Transformers.js pipeline loaded globally');
|
||||||
|
""",
|
||||||
|
type="module",
|
||||||
|
),
|
||||||
],
|
],
|
||||||
width=12,
|
width=12,
|
||||||
)
|
)
|
||||||
|
158
tests/test_client_embeddings.py
Normal file
158
tests/test_client_embeddings.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
"""Tests for client-side embedding processing functionality."""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from src.embeddingbuddy.data.processor import DataProcessor
|
||||||
|
from src.embeddingbuddy.models.schemas import ProcessedData
|
||||||
|
|
||||||
|
|
||||||
|
class TestClientEmbeddingsProcessing:
    """Tests for ``DataProcessor.process_client_embeddings``."""

    @staticmethod
    def _doc(text, doc_id=None, category="Test", subcategory="Test"):
        """Build a client document dict; the "id" key is omitted when doc_id is None."""
        document = {} if doc_id is None else {"id": doc_id}
        document.update(
            {
                "text": text,
                "category": category,
                "subcategory": subcategory,
                "tags": [],
            }
        )
        return document

    @staticmethod
    def _two_vectors():
        """Return a fresh pair of 4-dimensional embedding rows."""
        return [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]

    def setup_method(self):
        """Create a new processor before every test."""
        self.processor = DataProcessor()

    def test_process_client_embeddings_success(self):
        """Two valid documents with two embeddings are processed unchanged."""
        payload = {
            "documents": [
                self._doc("First test document", "text_input_0", "Text Input", "Generated"),
                self._doc("Second test document", "text_input_1", "Text Input", "Generated"),
            ],
            "embeddings": self._two_vectors(),
        }

        result = self.processor.process_client_embeddings(payload)

        assert isinstance(result, ProcessedData)
        assert result.error is None
        assert len(result.documents) == 2
        assert result.embeddings.shape == (2, 4)

        # Document text is carried over in order.
        assert result.documents[0].text == "First test document"
        assert result.documents[1].text == "Second test document"

        # Embedding rows are carried over in order.
        np.testing.assert_array_equal(result.embeddings[0], [0.1, 0.2, 0.3, 0.4])
        np.testing.assert_array_equal(result.embeddings[1], [0.5, 0.6, 0.7, 0.8])

    def test_process_client_embeddings_with_error(self):
        """An "error" key in the payload is propagated to the result."""
        payload = {"error": "Transformers.js not loaded"}

        result = self.processor.process_client_embeddings(payload)

        assert isinstance(result, ProcessedData)
        assert result.error == "Transformers.js not loaded"
        assert len(result.documents) == 0
        assert result.embeddings.size == 0

    def test_process_client_embeddings_missing_data(self):
        """A payload without usable documents/embeddings reports an error."""
        payload = {"documents": []}

        result = self.processor.process_client_embeddings(payload)

        assert isinstance(result, ProcessedData)
        assert "No documents or embeddings in client data" in result.error
        assert len(result.documents) == 0

    def test_process_client_embeddings_mismatch_count(self):
        """One document with two embeddings is rejected as a mismatch."""
        payload = {
            "documents": [self._doc("Test document", "test")],
            "embeddings": self._two_vectors(),
        }

        result = self.processor.process_client_embeddings(payload)

        assert isinstance(result, ProcessedData)
        assert "Mismatch between number of documents and embeddings" in result.error
        assert len(result.documents) == 0

    def test_process_client_embeddings_invalid_document(self):
        """A document with empty text is skipped; the valid one is kept."""
        payload = {
            "documents": [
                {"text": ""},  # Empty text should be skipped
                self._doc("Valid document", "test2"),
            ],
            "embeddings": self._two_vectors(),
        }

        result = self.processor.process_client_embeddings(payload)

        assert isinstance(result, ProcessedData)
        assert result.error is None
        assert len(result.documents) == 1  # Only valid document should be processed
        assert result.documents[0].text == "Valid document"

    def test_process_client_embeddings_auto_id_generation(self):
        """A document without an "id" gets one starting with "text_input_"."""
        payload = {
            "documents": [self._doc("Document without ID")],
            "embeddings": [[0.1, 0.2, 0.3, 0.4]],
        }

        result = self.processor.process_client_embeddings(payload)

        assert isinstance(result, ProcessedData)
        assert result.error is None
        assert len(result.documents) == 1
        assert result.documents[0].id.startswith("text_input_")

    def test_process_client_embeddings_invalid_embedding_format(self):
        """A scalar in place of the embeddings array yields an error result."""
        payload = {
            "documents": [self._doc("Test document", "test")],
            "embeddings": 0.5,  # Scalar instead of array
        }

        result = self.processor.process_client_embeddings(payload)

        assert isinstance(result, ProcessedData)
        assert result.error is not None  # Should have some error
        assert len(result.documents) == 0
|
25
uv.lock
generated
25
uv.lock
generated
@@ -412,13 +412,12 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "embeddingbuddy"
|
name = "embeddingbuddy"
|
||||||
version = "0.3.0"
|
version = "0.4.0"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "dash" },
|
{ name = "dash" },
|
||||||
{ name = "dash-bootstrap-components" },
|
{ name = "dash-bootstrap-components" },
|
||||||
{ name = "mypy" },
|
{ name = "mypy" },
|
||||||
{ name = "numba" },
|
|
||||||
{ name = "numpy" },
|
{ name = "numpy" },
|
||||||
{ name = "opensearch-py" },
|
{ name = "opensearch-py" },
|
||||||
{ name = "opentsne" },
|
{ name = "opentsne" },
|
||||||
@@ -431,6 +430,7 @@ dependencies = [
|
|||||||
[package.optional-dependencies]
|
[package.optional-dependencies]
|
||||||
all = [
|
all = [
|
||||||
{ name = "bandit" },
|
{ name = "bandit" },
|
||||||
|
{ name = "gunicorn" },
|
||||||
{ name = "mypy" },
|
{ name = "mypy" },
|
||||||
{ name = "pip-audit" },
|
{ name = "pip-audit" },
|
||||||
{ name = "pytest" },
|
{ name = "pytest" },
|
||||||
@@ -451,6 +451,9 @@ lint = [
|
|||||||
{ name = "mypy" },
|
{ name = "mypy" },
|
||||||
{ name = "ruff" },
|
{ name = "ruff" },
|
||||||
]
|
]
|
||||||
|
prod = [
|
||||||
|
{ name = "gunicorn" },
|
||||||
|
]
|
||||||
security = [
|
security = [
|
||||||
{ name = "bandit" },
|
{ name = "bandit" },
|
||||||
{ name = "pip-audit" },
|
{ name = "pip-audit" },
|
||||||
@@ -466,11 +469,11 @@ requires-dist = [
|
|||||||
{ name = "bandit", extras = ["toml"], marker = "extra == 'security'", specifier = ">=1.7.5" },
|
{ name = "bandit", extras = ["toml"], marker = "extra == 'security'", specifier = ">=1.7.5" },
|
||||||
{ name = "dash", specifier = ">=2.17.1" },
|
{ name = "dash", specifier = ">=2.17.1" },
|
||||||
{ name = "dash-bootstrap-components", specifier = ">=1.5.0" },
|
{ name = "dash-bootstrap-components", specifier = ">=1.5.0" },
|
||||||
{ name = "embeddingbuddy", extras = ["test", "lint", "security"], marker = "extra == 'all'" },
|
|
||||||
{ name = "embeddingbuddy", extras = ["test", "lint", "security"], marker = "extra == 'dev'" },
|
{ name = "embeddingbuddy", extras = ["test", "lint", "security"], marker = "extra == 'dev'" },
|
||||||
|
{ name = "embeddingbuddy", extras = ["test", "lint", "security", "prod"], marker = "extra == 'all'" },
|
||||||
|
{ name = "gunicorn", marker = "extra == 'prod'", specifier = ">=21.2.0" },
|
||||||
{ name = "mypy", specifier = ">=1.17.1" },
|
{ name = "mypy", specifier = ">=1.17.1" },
|
||||||
{ name = "mypy", marker = "extra == 'lint'", specifier = ">=1.5.0" },
|
{ name = "mypy", marker = "extra == 'lint'", specifier = ">=1.5.0" },
|
||||||
{ name = "numba", specifier = ">=0.56.4" },
|
|
||||||
{ name = "numpy", specifier = ">=1.24.4" },
|
{ name = "numpy", specifier = ">=1.24.4" },
|
||||||
{ name = "opensearch-py", specifier = ">=3.0.0" },
|
{ name = "opensearch-py", specifier = ">=3.0.0" },
|
||||||
{ name = "opentsne", specifier = ">=1.0.0" },
|
{ name = "opentsne", specifier = ">=1.0.0" },
|
||||||
@@ -484,7 +487,7 @@ requires-dist = [
|
|||||||
{ name = "scikit-learn", specifier = ">=1.3.2" },
|
{ name = "scikit-learn", specifier = ">=1.3.2" },
|
||||||
{ name = "umap-learn", specifier = ">=0.5.8" },
|
{ name = "umap-learn", specifier = ">=0.5.8" },
|
||||||
]
|
]
|
||||||
provides-extras = ["test", "lint", "security", "dev", "all"]
|
provides-extras = ["test", "lint", "security", "prod", "dev", "all"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "events"
|
name = "events"
|
||||||
@@ -520,6 +523,18 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/3d/68/9d4508e893976286d2ead7f8f571314af6c2037af34853a30fd769c02e9d/flask-3.1.1-py3-none-any.whl", hash = "sha256:07aae2bb5eaf77993ef57e357491839f5fd9f4dc281593a81a9e4d79a24f295c", size = 103305, upload-time = "2025-05-13T15:01:15.591Z" },
|
{ url = "https://files.pythonhosted.org/packages/3d/68/9d4508e893976286d2ead7f8f571314af6c2037af34853a30fd769c02e9d/flask-3.1.1-py3-none-any.whl", hash = "sha256:07aae2bb5eaf77993ef57e357491839f5fd9f4dc281593a81a9e4d79a24f295c", size = 103305, upload-time = "2025-05-13T15:01:15.591Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "gunicorn"
|
||||||
|
version = "23.0.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "packaging" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/34/72/9614c465dc206155d93eff0ca20d42e1e35afc533971379482de953521a4/gunicorn-23.0.0.tar.gz", hash = "sha256:f014447a0101dc57e294f6c18ca6b40227a4c90e9bdb586042628030cba004ec", size = 375031, upload-time = "2024-08-10T20:25:27.378Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/cb/7d/6dac2a6e1eba33ee43f318edbed4ff29151a49b5d37f080aad1e6469bca4/gunicorn-23.0.0-py3-none-any.whl", hash = "sha256:ec400d38950de4dfd418cff8328b2c8faed0edb0d517d3394e457c317908ca4d", size = 85029, upload-time = "2024-08-10T20:25:24.996Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "h11"
|
name = "h11"
|
||||||
version = "0.16.0"
|
version = "0.16.0"
|
||||||
|
20
wsgi.py
Normal file
20
wsgi.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
"""
|
||||||
|
WSGI entry point for production deployment.
|
||||||
|
Use this with a production WSGI server like Gunicorn.
|
||||||
|
"""
|
||||||
|
from src.embeddingbuddy.app import create_app

# Build the application once at import time. It is exposed under both
# `application` and `app` so a WSGI server can be pointed at either name.
# NOTE(review): if create_app() returns a dash.Dash object rather than a
# Flask app, the WSGI callable may need to be `application.server` - confirm
# against create_app() and the installed Dash version.
app = application = create_app()

if __name__ == "__main__":
    # Direct execution is for local testing only; production goes through
    # a WSGI server.
    from src.embeddingbuddy.config.settings import AppSettings

    run_options = {
        "host": AppSettings.HOST,
        "port": AppSettings.PORT,
        "debug": False,
    }
    application.run(**run_options)
|
Reference in New Issue
Block a user