Compare commits
1 commit
v0.8.1 ... 450f6b23e0
@@ -3,13 +3,9 @@
    "allow": [
      "Bash(mkdir:*)",
      "Bash(uv run:*)",
      "Bash(uv add:*)",
      "Bash(uv sync:*)",
      "Bash(tree:*)",
      "WebFetch(domain:www.dash-bootstrap-components.com)"
      "Bash(uv add:*)"
    ],
    "deny": [],
    "ask": [],
    "defaultMode": "acceptEdits"
    "ask": []
  }
}

@@ -1,76 +0,0 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
env.bak/
venv.bak/
.venv/

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
coverage.xml
*.cover

# Development tools
.mypy_cache/
.ruff_cache/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Git
.git/
.gitignore

# Documentation
*.md
!README.md

# Docker
Dockerfile*
docker-compose*.yml
.dockerignore

# Data files (may contain sensitive information)
*.ndjson
*.ldjson
*.json

# Reports
*-report.json
bandit-report.json
safety-report.json

# Screenshots
*.png
*.jpg
*.jpeg
*.gif

# Logs
*.log

# Temporary files
*.tmp
*.temp

@@ -1,52 +0,0 @@
name: Bump Version and Release

on:
  workflow_dispatch:
    inputs:
      bump_type:
        description: 'Version bump type'
        required: true
        type: choice
        options:
          - patch
          - minor
          - major

jobs:
  bump-and-release:
    runs-on: ubuntu-latest
    permissions:
      contents: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITEA_TOKEN }}
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Bump version
        id: bump
        run: |
          python bump_version.py ${{ github.event.inputs.bump_type }}
          NEW_VERSION=$(grep -oP 'version = "\K[^"]+' pyproject.toml)
          echo "version=$NEW_VERSION" >> $GITHUB_OUTPUT
          echo "tag=v$NEW_VERSION" >> $GITHUB_OUTPUT

      - name: Commit and tag
        run: |
          git config user.name "gitea-actions[bot]"
          git config user.email "gitea-actions[bot]@users.noreply.gitea.io"
          git add pyproject.toml
          git commit -m "bump version to v${{ steps.bump.outputs.version }}"
          git tag v${{ steps.bump.outputs.version }}

      - name: Push changes
        run: |
          git push origin main
          git push origin v${{ steps.bump.outputs.version }}

@@ -26,10 +26,12 @@ jobs:
        run: uv python install 3.11

      - name: Install dependencies
        run: uv sync --extra test
        run: uv sync

      - name: Run full test suite
        run: uv run pytest tests/ -v --cov=src/embeddingbuddy --cov-report=term-missing
        run: |
          uv add pytest-cov
          uv run pytest tests/ -v --cov=src/embeddingbuddy --cov-report=term-missing

  build-and-release:
    runs-on: ubuntu-latest
@@ -66,20 +68,27 @@ jobs:
          echo "## Installation" >> release-notes.md
          echo "" >> release-notes.md
          echo '```bash' >> release-notes.md
          echo 'pip install embeddingbuddy' >> release-notes.md
          echo 'embeddingbuddy serve' >> release-notes.md
          echo 'uv sync' >> release-notes.md
          echo 'uv run python main.py' >> release-notes.md
          echo '```' >> release-notes.md

      - name: Create Release
        uses: akkuman/gitea-release-action@v1
        uses: actions/create-release@v1
        env:
          NODE_OPTIONS: '--experimental-fetch'
          GITHUB_TOKEN: ${{ secrets.GITEA_TOKEN }}
        with:
          token: ${{ secrets.GITEA_TOKEN }}
          tag_name: ${{ github.ref_name || github.event.inputs.version }}
          release_name: Release ${{ github.ref_name || github.event.inputs.version }}
          body_path: release-notes.md
          draft: false
          prerelease: false
          files: |-
            dist/*

      - name: Upload Release Assets
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITEA_TOKEN }}
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: dist/
          asset_name: embeddingbuddy-dist
          asset_content_type: application/zip

@@ -25,18 +25,23 @@ jobs:
        run: uv python install 3.11

      - name: Install dependencies
        run: uv sync --extra security
        run: uv sync

      - name: Add security tools
        run: |
          uv add bandit[toml]
          uv add safety

      - name: Run bandit security linter
        run: uv run bandit -r src/ -f json -o bandit-report.json
        continue-on-error: true

      - name: Run safety vulnerability check
        run: uv run safety check --json --save-json safety-report.json
        run: uv run safety check --json --output safety-report.json
        continue-on-error: true

      - name: Upload security reports
        uses: actions/upload-artifact@v3
        uses: actions/upload-artifact@v4
        with:
          name: security-reports
          path: |
@@ -59,12 +64,13 @@ jobs:

      - name: Check for dependency vulnerabilities
        run: |
          uv sync --extra security
          uv sync
          uv add pip-audit
          uv run pip-audit --format=json --output=pip-audit-report.json
        continue-on-error: true

      - name: Upload dependency audit report
        uses: actions/upload-artifact@v3
        uses: actions/upload-artifact@v4
        with:
          name: dependency-audit
          path: pip-audit-report.json

@@ -2,20 +2,16 @@ name: Test Suite

on:
  push:
    branches:
      - "main"
      - "develop"
    branches: ["*"]
  pull_request:
    branches:
      - "main"
  workflow_dispatch:
    branches: ["main", "master"]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]
        python-version: ["3.11", "3.12"]

    steps:
      - name: Checkout code
@@ -30,13 +26,15 @@ jobs:
        run: uv python install ${{ matrix.python-version }}

      - name: Install dependencies
        run: uv sync --extra test
        run: uv sync

      - name: Run tests with pytest
        run: uv run pytest tests/ -v --tb=short

      - name: Run tests with coverage
        run: uv run pytest tests/ --cov=src/embeddingbuddy --cov-report=term-missing --cov-report=xml
        run: |
          uv add pytest-cov
          uv run pytest tests/ --cov=src/embeddingbuddy --cov-report=term-missing --cov-report=xml

      - name: Upload coverage reports
        uses: codecov/codecov-action@v4
@@ -60,7 +58,12 @@ jobs:
        run: uv python install 3.11

      - name: Install dependencies
        run: uv sync --extra lint
        run: uv sync

      - name: Add linting tools
        run: |
          uv add ruff
          uv add mypy

      - name: Run ruff linter
        run: uv run ruff check src/ tests/
@@ -68,9 +71,8 @@ jobs:
      - name: Run ruff formatter check
        run: uv run ruff format --check src/ tests/

      # TODO fix this it throws errors
      # - name: Run mypy type checker
      #   run: uv run mypy src/embeddingbuddy/ --ignore-missing-imports
      - name: Run mypy type checker
        run: uv run mypy src/embeddingbuddy/ --ignore-missing-imports

  build:
    runs-on: ubuntu-latest
@@ -98,7 +100,7 @@ jobs:
          uv run python -c "from src.embeddingbuddy.app import create_app; app = create_app(); print('✅ Package builds and imports successfully')"

      - name: Upload build artifacts
        uses: actions/upload-artifact@v3
        uses: actions/upload-artifact@v4
        with:
          name: dist-files
          path: dist/

.github/workflows/docker-release.yml (vendored, 54 lines)

@@ -1,54 +0,0 @@
name: Docker Release

on:
  push:
    tags:
      - 'v[0-9]+.[0-9]+.[0-9]+'
  workflow_dispatch:

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=ref,event=tag
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=semver,pattern={{major}}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

.github/workflows/pypi-release.yml (vendored, 33 lines)

@@ -1,33 +0,0 @@
name: PyPI Release

on:
  push:
    tags:
      - 'v[0-9]+.[0-9]+.[0-9]+'
  workflow_dispatch:

jobs:
  pypi-publish:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write  # For trusted publishing

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Build package
        run: |
          uv build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

.gitignore (vendored, 5 lines)

@@ -81,7 +81,4 @@ safety-report.json
pip-audit-report.json

# Temporary files
*.tmp


examples/extra
*.tmp

CLAUDE.md (100 lines)

@@ -21,49 +21,18 @@ uv sync

**Run the application:**

Using the CLI (recommended):

```bash
# Production mode (no debug, no auto-reload)
embeddingbuddy serve

# Development mode (debug + auto-reload on code changes)
embeddingbuddy serve --dev

# Debug logging only (no auto-reload)
embeddingbuddy serve --debug

# With custom host/port
embeddingbuddy serve --host 0.0.0.0 --port 8080
uv run python main.py
```

The app will be available at <http://127.0.0.1:8050> by default
The app will be available at http://127.0.0.1:8050

**Run tests:**

```bash
uv sync --extra test
uv run pytest tests/ -v
```

**Development tools:**

```bash
# Install all dev dependencies
uv sync --extra dev

# Linting and formatting
uv run ruff check src/ tests/
uv run ruff format src/ tests/

# Type checking
uv run mypy src/embeddingbuddy/

# Security scanning
uv run bandit -r src/
uv run safety check
```

**Test with sample data:**
Use the included `sample_data.ndjson` and `sample_prompts.ndjson` files for testing the application functionality.

@@ -73,7 +42,7 @@ Use the included `sample_data.ndjson` and `sample_prompts.ndjson` files for test

The application follows a modular architecture with clear separation of concerns:

```text
```
src/embeddingbuddy/
├── app.py          # Main application entry point and factory
├── main.py         # Application runner
@@ -103,32 +72,27 @@ src/embeddingbuddy/
### Key Components

**Data Layer:**

- `data/parser.py` - NDJSON parsing with error handling
- `data/processor.py` - Data transformation and combination logic
- `models/schemas.py` - Dataclasses for type safety and validation

**Algorithm Layer:**

- `models/reducers.py` - Modular dimensionality reduction with factory pattern (see the sketch after this section)
- Supports PCA, t-SNE (openTSNE), and UMAP algorithms
- Abstract base class for easy extension

**Visualization Layer:**

- `visualization/plots.py` - Plot factory with single and dual plot support
- `visualization/colors.py` - Color mapping and grayscale conversion utilities
- Plotly-based 2D/3D scatter plots with interactive features

**UI Layer:**

- `ui/layout.py` - Main application layout composition
- `ui/components/` - Reusable, testable UI components
- `ui/callbacks/` - Organized callbacks grouped by functionality
- Bootstrap-styled sidebar with controls and large visualization area

**Configuration:**

- `config/settings.py` - Centralized settings with environment variable support
- Plot styling, marker configurations, and app-wide constants

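For orientation, here is a minimal sketch of the reducer factory described above. The class and function names are illustrative assumptions, not the actual `models/reducers.py` API:

```python
# Hedged sketch of the reducer factory pattern; names are assumptions.
from abc import ABC, abstractmethod

import numpy as np
from sklearn.decomposition import PCA


class Reducer(ABC):
    """Abstract base class that each reduction algorithm implements."""

    @abstractmethod
    def fit_transform(self, embeddings: np.ndarray) -> np.ndarray: ...


class PCAReducer(Reducer):
    def __init__(self, n_components: int = 3):
        self.n_components = n_components

    def fit_transform(self, embeddings: np.ndarray) -> np.ndarray:
        # Project high-dimensional embeddings onto n_components axes
        return PCA(n_components=self.n_components).fit_transform(embeddings)


def create_reducer(method: str, n_components: int = 3) -> Reducer:
    # t-SNE and UMAP implementations would register themselves here too
    registry = {"pca": PCAReducer}
    return registry[method](n_components)
```

Under this shape, adding an algorithm only requires a subclass and a registry entry.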
@@ -148,19 +112,16 @@ Optional fields: `id`, `category`, `subcategory`, `tags`

The refactored callback system is organized by functionality:

**Data Processing (`ui/callbacks/data_processing.py`):**

- File upload handling
- NDJSON parsing and validation
- Data storage in dcc.Store components

**Visualization (`ui/callbacks/visualization.py`):**

- Dimensionality reduction pipeline
- Plot generation and updates
- Method/parameter change handling

**Interactions (`ui/callbacks/interactions.py`):**

- Point click handling and detail display
- Reset functionality
- User interaction management

@@ -170,18 +131,15 @@ The refactored callback system is organized by functionality:

The modular design enables comprehensive testing:

**Unit Tests:**

- `tests/test_data_processing.py` - Parser and processor logic
- `tests/test_reducers.py` - Dimensionality reduction algorithms
- `tests/test_visualization.py` - Plot creation and color mapping

**Integration Tests:**

- End-to-end data pipeline testing
- Component integration verification

**Key Testing Benefits:**

- Fast test execution (milliseconds vs seconds)
- Isolated component testing
- Easy mocking and fixture creation

@@ -198,52 +156,6 @@ Uses modern Python stack with uv for dependency management:

- **Testing:** pytest for test framework
- **Dev Tools:** uv for package management

## CI/CD and Release Management

### Repository Setup

This project uses a **dual-repository workflow**:

- **Primary repository:** Gitea instance at `git.hawt.cloud` (read-write)
- **Mirror repository:** GitHub (read-only mirror)

### Workflow Organization

**Gitea Workflows (`.gitea/workflows/`):**

- **`bump-and-release.yml`** - Manual version bumping workflow
  - Runs `bump_version.py` to update version in `pyproject.toml`
  - Commits changes and creates git tag
  - Pushes to Gitea (main branch + tag)
  - Triggered manually via workflow_dispatch with choice of patch/minor/major bump
- **`release.yml`** - Automated release creation
  - Triggered when version tags are pushed
  - Runs tests, builds packages
  - Creates Gitea release with artifacts
- **`test.yml`** - Test suite execution
- **`security.yml`** - Security scanning

**GitHub Workflows (`.github/workflows/`):**

- **`docker-release.yml`** - Builds and publishes Docker images
- **`pypi-release.yml`** - Publishes packages to PyPI
- These workflows are read-only (no git commits/pushes) and create artifacts only

### Release Process

1. Run manual bump workflow on Gitea: **Actions → Bump Version and Release**
2. Select version bump type (patch/minor/major)
3. Workflow commits version change and pushes tag to Gitea
4. Tag push triggers `release.yml` on Gitea (creates release)
5. GitHub mirror receives tag and triggers artifact builds (Docker, PyPI)

### Version Management

Use `bump_version.py` for version updates:

```bash
python bump_version.py patch  # 0.3.0 -> 0.3.1
python bump_version.py minor  # 0.3.0 -> 0.4.0
python bump_version.py major  # 0.3.0 -> 1.0.0
```

## Development Guidelines

**When adding new features:**

@@ -255,16 +167,14 @@ python bump_version.py major   # 0.3.0 -> 1.0.0

5. **Tests** - Write tests for all new functionality

**Code Organization Principles:**

- Single responsibility principle
- Clear module boundaries
- Clear module boundaries
- Testable, isolated components
- Configuration over hardcoding
- Error handling at appropriate layers

**Testing Requirements:**

- Unit tests for all core logic
- Integration tests for data flow
- Component tests for UI elements
- Maintain high test coverage
- Maintain high test coverage

Dockerfile (84 lines)

@@ -1,84 +0,0 @@
# Two-stage Dockerfile for EmbeddingBuddy
# Stage 1: Builder
FROM python:3.11-slim as builder

# Create non-root user early in builder stage
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Install system dependencies for building Python packages
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install uv for dependency management
RUN pip install uv

# Set working directory
WORKDIR /app

# Copy dependency files
COPY pyproject.toml uv.lock ./

# Copy source code (needed for editable install)
COPY src/ src/
COPY assets/ assets/

# Change ownership of source files before building (lighter I/O)
RUN chown -R appuser:appuser /app

# Create and set permissions for appuser home directory (needed for uv cache)
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser

# Switch to non-root user before building
USER appuser

# Create virtual environment and install dependencies (including production extras)
RUN uv venv .venv
RUN uv sync --frozen --extra prod

# Stage 2: Runtime
FROM python:3.11-slim as runtime

# Create non-root user in runtime stage
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Install runtime dependencies for compiled packages
RUN apt-get update && apt-get install -y \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Set working directory and change ownership (small directory)
WORKDIR /app
RUN chown appuser:appuser /app

# Copy files from builder with correct ownership
COPY --from=builder --chown=appuser:appuser /app/.venv /app/.venv
COPY --from=builder --chown=appuser:appuser /app/src /app/src
COPY --from=builder --chown=appuser:appuser /app/assets /app/assets

# Switch to non-root user
USER appuser

# Make sure the virtual environment is in PATH
ENV PATH="/app/.venv/bin:$PATH"

# Set Python path
ENV PYTHONPATH="/app/src:$PYTHONPATH"

# Environment variables for production
ENV EMBEDDINGBUDDY_HOST=0.0.0.0
ENV EMBEDDINGBUDDY_PORT=8050
ENV EMBEDDINGBUDDY_DEBUG=false
ENV EMBEDDINGBUDDY_ENV=production

# Expose port
EXPOSE 8050

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:8050/', timeout=5)" || exit 1

# Run application in production mode (no debug, no auto-reload)
CMD ["embeddingbuddy", "serve"]

LICENSE (21 lines)

@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2025 Austin Godber - EmbeddingBuddy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (190 lines)

@@ -12,7 +12,7 @@ EmbeddingBuddy provides an intuitive web interface for analyzing high-dimensional
embedding vectors by applying various dimensionality reduction algorithms and
visualizing the results in interactive 2D and 3D plots. The application features
a clean, modular architecture that makes it easy to test, maintain, and extend
with new features. It supports dual dataset visualization, allowing you to compare
with new features. It supports dual dataset visualization, allowing you to compare
documents and prompts to understand how queries relate to your content.

## Features
@@ -28,61 +28,6 @@ documents and prompts to understand how queries relate to your content.
- **Sidebar layout** with controls on left, large visualization area on right
- **Real-time visualization** optimized for small to medium datasets

## Network Dependency

**Note:** The application loads the Transformers.js library (v3.0.0) from `cdn.jsdelivr.net` for client-side embedding generation. This requires an active internet connection and sends requests to a third-party CDN. The application will function without internet if you only use the file upload features for pre-computed embeddings.

## Quick Start

### Installation

**Option 1: Install with uv (recommended)**

```bash
# Install as a CLI tool (no need to clone the repo)
uv tool install embeddingbuddy

# Run the application
embeddingbuddy serve
```

**Option 2: Install with pip/pipx**

```bash
# Install with pipx (isolated environment)
pipx install embeddingbuddy

# Or install with pip
pip install embeddingbuddy

# Run the application
embeddingbuddy
```

**Option 3: Run with Docker**

```bash
# Pull and run the Docker image
docker run -p 8050:8050 ghcr.io/godber/embedding-buddy:latest
```

The application will be available at <http://127.0.0.1:8050>

### Using the Application

1. **Open your browser** to <http://127.0.0.1:8050>
2. **Upload your data**:
   - Drag and drop an NDJSON file containing embeddings (see Data Format below)
   - Optionally upload a second file with prompts to compare against documents
3. **Choose visualization settings**:
   - Select dimensionality reduction method (PCA, t-SNE, or UMAP)
   - Choose 2D or 3D visualization
   - Pick color coding (by category, subcategory, or tags)
4. **Explore**:
   - Click points to view full content
   - Toggle prompt visibility
   - Rotate and zoom 3D plots

## Data Format

EmbeddingBuddy accepts newline-delimited JSON (NDJSON) files for both documents
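For reference, a single NDJSON record looks like the following. This example is taken from the bundled sample data elsewhere in this compare (8-dimensional vector, truncated text), not a required schema:

```json
{"id": "doc_001", "embedding": [0.2, -0.1, 0.8, 0.3, -0.5, 0.7, 0.1, -0.3], "text": "Machine learning algorithms are transforming healthcare...", "category": "technology", "subcategory": "healthcare", "tags": ["ai", "medicine", "prediction"]}
```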
@@ -129,106 +74,40 @@ uv sync

2. **Run the application:**

```bash
# Production mode (no debug, no auto-reload)
embeddingbuddy serve

# Development mode (debug + auto-reload on code changes)
embeddingbuddy serve --dev

# Debug logging only (no auto-reload)
embeddingbuddy serve --debug

# Custom host/port
embeddingbuddy serve --host 0.0.0.0 --port 8080
uv run python main.py
```

3. **Open your browser** to <http://127.0.0.1:8050>
3. **Open your browser** to http://127.0.0.1:8050

4. **Test with sample data**:
   - Upload `sample_data.ndjson` (documents)
   - Upload `sample_prompts.ndjson` (prompts) to see dual visualization
   - Use the "Show prompts" toggle to compare how prompts relate to documents

## Docker

You can also run EmbeddingBuddy using Docker:

### Basic Usage

```bash
# Run in the background
docker compose up -d
```

The application will be available at <http://127.0.0.1:8050>

### With OpenSearch

To run with OpenSearch for enhanced search capabilities:

```bash
# Run in the background with OpenSearch
docker compose --profile opensearch up -d
```

This will start both the EmbeddingBuddy application and an OpenSearch instance.
OpenSearch will be available at <http://127.0.0.1:9200>

### Docker Commands

```bash
# Stop all services
docker compose down

# Stop and remove volumes
docker compose down -v

# View logs
docker compose logs embeddingbuddy
docker compose logs opensearch

# Rebuild containers
docker compose build
```

## Development

### Project Structure

The application follows a modular architecture for improved maintainability and testability:

```text
```
src/embeddingbuddy/
├── app.py          # Main application entry point and factory
├── config/         # Configuration management
│   └── settings.py # Centralized app settings
├── data/           # Data parsing and processing
│   ├── parser.py   # NDJSON parsing logic
│   ├── processor.py # Data transformation utilities
│   └── sources/    # Data source integrations
│       └── opensearch.py # OpenSearch data source
├── models/         # Data schemas and algorithms
│   ├── schemas.py  # Pydantic data models
│   ├── reducers.py # Dimensionality reduction algorithms
│   └── field_mapper.py # Field mapping utilities
├── visualization/  # Plot creation and styling
│   ├── plots.py    # Plot factory and creation logic
│   └── colors.py   # Color mapping utilities
├── ui/             # User interface components
│   ├── layout.py   # Main application layout
│   ├── components/ # Reusable UI components
│   │   ├── sidebar.py # Sidebar component
│   │   ├── upload.py # Upload components
│   │   ├── textinput.py # Text input components
│   │   └── datasource.py # Data source components
│   └── callbacks/  # Organized callback functions
│       ├── data_processing.py # Data upload/processing callbacks
│       ├── visualization.py # Plot update callbacks
│       └── interactions.py # User interaction callbacks
└── utils/          # Utility functions

# CLI entry point
embeddingbuddy serve # Main CLI command to start the server
├── config/         # Configuration management
│   └── settings.py # Centralized app settings
├── data/           # Data parsing and processing
│   ├── parser.py   # NDJSON parsing logic
│   └── processor.py # Data transformation utilities
├── models/         # Data schemas and algorithms
│   ├── schemas.py  # Pydantic data models
│   └── reducers.py # Dimensionality reduction algorithms
├── visualization/  # Plot creation and styling
│   ├── plots.py    # Plot factory and creation logic
│   └── colors.py   # Color mapping utilities
├── ui/             # User interface components
│   ├── layout.py   # Main application layout
│   ├── components/ # Reusable UI components
│   └── callbacks/  # Organized callback functions
└── utils/          # Utility functions
```

### Testing
@@ -236,8 +115,8 @@ embeddingbuddy serve # Main CLI command to start the server

Run the test suite to verify functionality:

```bash
# Install test dependencies
uv sync --extra test
# Install pytest
uv add pytest

# Run all tests
uv run pytest tests/ -v
@@ -249,31 +128,6 @@ uv run pytest tests/test_data_processing.py -v
uv run pytest tests/ --cov=src/embeddingbuddy
```

### Development Tools

Install development dependencies for linting, type checking, and security:

```bash
# Install all dev dependencies
uv sync --extra dev

# Or install specific groups
uv sync --extra test      # Testing tools
uv sync --extra lint      # Linting and formatting
uv sync --extra security  # Security scanning tools

# Run linting
uv run ruff check src/ tests/
uv run ruff format src/ tests/

# Run type checking
uv run mypy src/embeddingbuddy/

# Run security scans
uv run bandit -r src/
uv run safety check
```

### Adding New Features

The modular architecture makes it easy to extend functionality:

app.py (new file, 515 lines)

@@ -0,0 +1,515 @@
import json
import uuid
from io import StringIO
import base64

import dash
from dash import dcc, html, Input, Output, State, callback
import dash_bootstrap_components as dbc
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import umap
from openTSNE import TSNE


app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

def parse_ndjson(contents):
    """Parse NDJSON content and return list of documents."""
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    text_content = decoded.decode('utf-8')

    documents = []
    for line in text_content.strip().split('\n'):
        if line.strip():
            doc = json.loads(line)
            if 'id' not in doc:
                doc['id'] = str(uuid.uuid4())
            documents.append(doc)
    return documents

def apply_dimensionality_reduction(embeddings, method='pca', n_components=3):
    """Apply dimensionality reduction to embeddings."""
    if method == 'pca':
        reducer = PCA(n_components=n_components)
        reduced = reducer.fit_transform(embeddings)
        variance_explained = reducer.explained_variance_ratio_
        return reduced, variance_explained
    elif method == 'tsne':
        reducer = TSNE(n_components=n_components, random_state=42)
        reduced = reducer.fit(embeddings)
        return reduced, None
    elif method == 'umap':
        reducer = umap.UMAP(n_components=n_components, random_state=42)
        reduced = reducer.fit_transform(embeddings)
        return reduced, None
    else:
        raise ValueError(f"Unknown method: {method}")

def create_color_mapping(documents, color_by):
    """Create color mapping for documents based on specified field."""
    if color_by == 'category':
        values = [doc.get('category', 'Unknown') for doc in documents]
    elif color_by == 'subcategory':
        values = [doc.get('subcategory', 'Unknown') for doc in documents]
    elif color_by == 'tags':
        values = [', '.join(doc.get('tags', [])) if doc.get('tags') else 'No tags' for doc in documents]
    else:
        values = ['All'] * len(documents)

    return values

def create_plot(df, dimensions='3d', color_by='category', method='PCA'):
    """Create plotly scatter plot."""
    color_values = create_color_mapping(df.to_dict('records'), color_by)

    # Truncate text for hover display
    df_display = df.copy()
    df_display['text_preview'] = df_display['text'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

    # Include all metadata fields in hover
    hover_fields = ['id', 'text_preview', 'category', 'subcategory']
    # Add tags as a string for hover
    df_display['tags_str'] = df_display['tags'].apply(lambda x: ', '.join(x) if x else 'None')
    hover_fields.append('tags_str')

    if dimensions == '3d':
        fig = px.scatter_3d(
            df_display, x='dim_1', y='dim_2', z='dim_3',
            color=color_values,
            hover_data=hover_fields,
            title=f'3D Embedding Visualization - {method} (colored by {color_by})'
        )
        fig.update_traces(marker=dict(size=5))
    else:
        fig = px.scatter(
            df_display, x='dim_1', y='dim_2',
            color=color_values,
            hover_data=hover_fields,
            title=f'2D Embedding Visualization - {method} (colored by {color_by})'
        )
        fig.update_traces(marker=dict(size=8))

    fig.update_layout(
        height=None,  # Let CSS height control this
        autosize=True,
        margin=dict(l=0, r=0, t=50, b=0)
    )
    return fig

def create_dual_plot(doc_df, prompt_df, dimensions='3d', color_by='category', method='PCA', show_prompts=None):
    """Create plotly scatter plot with separate traces for documents and prompts."""

    # Create the base figure
    fig = go.Figure()

    # Helper function to convert colors to grayscale
    def to_grayscale_hex(color_str):
        """Convert a color to grayscale while maintaining some distinction."""
        import plotly.colors as pc
        # Try to get RGB values from the color
        try:
            if color_str.startswith('#'):
                # Hex color
                rgb = tuple(int(color_str[i:i+2], 16) for i in (1, 3, 5))
            else:
                # Named color or other format - convert through plotly
                rgb = pc.hex_to_rgb(pc.convert_colors_to_same_type([color_str], colortype='hex')[0][0])

            # Convert to grayscale using luminance formula, but keep some color
            gray_value = int(0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2])
            # Make it a bit more gray but not completely
            gray_rgb = (gray_value * 0.7 + rgb[0] * 0.3,
                        gray_value * 0.7 + rgb[1] * 0.3,
                        gray_value * 0.7 + rgb[2] * 0.3)
            return f'rgb({int(gray_rgb[0])},{int(gray_rgb[1])},{int(gray_rgb[2])})'
        except:
            return 'rgb(128,128,128)'  # fallback gray

    # Create document plot using plotly express for consistent colors
    doc_color_values = create_color_mapping(doc_df.to_dict('records'), color_by)
    doc_df_display = doc_df.copy()
    doc_df_display['text_preview'] = doc_df_display['text'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)
    doc_df_display['tags_str'] = doc_df_display['tags'].apply(lambda x: ', '.join(x) if x else 'None')

    hover_fields = ['id', 'text_preview', 'category', 'subcategory', 'tags_str']

    # Create documents plot to get the color mapping
    if dimensions == '3d':
        doc_fig = px.scatter_3d(
            doc_df_display, x='dim_1', y='dim_2', z='dim_3',
            color=doc_color_values,
            hover_data=hover_fields
        )
    else:
        doc_fig = px.scatter(
            doc_df_display, x='dim_1', y='dim_2',
            color=doc_color_values,
            hover_data=hover_fields
        )

    # Add document traces to main figure
    for trace in doc_fig.data:
        trace.name = f'Documents - {trace.name}'
        if dimensions == '3d':
            trace.marker.size = 5
            trace.marker.symbol = 'circle'
        else:
            trace.marker.size = 8
            trace.marker.symbol = 'circle'
        trace.marker.opacity = 1.0
        fig.add_trace(trace)

    # Add prompt traces if they exist
    if prompt_df is not None and show_prompts and 'show' in show_prompts:
        prompt_color_values = create_color_mapping(prompt_df.to_dict('records'), color_by)
        prompt_df_display = prompt_df.copy()
        prompt_df_display['text_preview'] = prompt_df_display['text'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)
        prompt_df_display['tags_str'] = prompt_df_display['tags'].apply(lambda x: ', '.join(x) if x else 'None')

        # Create prompts plot to get consistent color grouping
        if dimensions == '3d':
            prompt_fig = px.scatter_3d(
                prompt_df_display, x='dim_1', y='dim_2', z='dim_3',
                color=prompt_color_values,
                hover_data=hover_fields
            )
        else:
            prompt_fig = px.scatter(
                prompt_df_display, x='dim_1', y='dim_2',
                color=prompt_color_values,
                hover_data=hover_fields
            )

        # Add prompt traces with grayed colors
        for trace in prompt_fig.data:
            # Convert the color to grayscale
            original_color = trace.marker.color
            if hasattr(trace.marker, 'color') and isinstance(trace.marker.color, str):
                trace.marker.color = to_grayscale_hex(trace.marker.color)

            trace.name = f'Prompts - {trace.name}'
            if dimensions == '3d':
                trace.marker.size = 6
                trace.marker.symbol = 'diamond'
            else:
                trace.marker.size = 10
                trace.marker.symbol = 'diamond'
            trace.marker.opacity = 0.8
            fig.add_trace(trace)

    title = f'{dimensions.upper()} Embedding Visualization - {method} (colored by {color_by})'
    fig.update_layout(
        title=title,
        height=None,
        autosize=True,
        margin=dict(l=0, r=0, t=50, b=0)
    )

    return fig

# Layout
app.layout = dbc.Container([
    dbc.Row([
        dbc.Col([
            html.H1("EmbeddingBuddy", className="text-center mb-4"),
        ], width=12)
    ]),

    dbc.Row([
        # Left sidebar with controls
        dbc.Col([
            html.H5("Upload Data", className="mb-3"),
            dcc.Upload(
                id='upload-data',
                children=html.Div([
                    'Drag and Drop or ',
                    html.A('Select Files')
                ]),
                style={
                    'width': '100%',
                    'height': '60px',
                    'lineHeight': '60px',
                    'borderWidth': '1px',
                    'borderStyle': 'dashed',
                    'borderRadius': '5px',
                    'textAlign': 'center',
                    'margin-bottom': '20px'
                },
                multiple=False
            ),

            dcc.Upload(
                id='upload-prompts',
                children=html.Div([
                    'Drag and Drop Prompts or ',
                    html.A('Select Files')
                ]),
                style={
                    'width': '100%',
                    'height': '60px',
                    'lineHeight': '60px',
                    'borderWidth': '1px',
                    'borderStyle': 'dashed',
                    'borderRadius': '5px',
                    'textAlign': 'center',
                    'margin-bottom': '20px',
                    'borderColor': '#28a745'
                },
                multiple=False
            ),

            dbc.Button(
                "Reset All Data",
                id='reset-button',
                color='danger',
                outline=True,
                size='sm',
                className='mb-3',
                style={'width': '100%'}
            ),

            html.H5("Visualization Controls", className="mb-3"),

            dbc.Label("Method:"),
            dcc.Dropdown(
                id='method-dropdown',
                options=[
                    {'label': 'PCA', 'value': 'pca'},
                    {'label': 't-SNE', 'value': 'tsne'},
                    {'label': 'UMAP', 'value': 'umap'}
                ],
                value='pca',
                style={'margin-bottom': '15px'}
            ),

            dbc.Label("Color by:"),
            dcc.Dropdown(
                id='color-dropdown',
                options=[
                    {'label': 'Category', 'value': 'category'},
                    {'label': 'Subcategory', 'value': 'subcategory'},
                    {'label': 'Tags', 'value': 'tags'}
                ],
                value='category',
                style={'margin-bottom': '15px'}
            ),

            dbc.Label("Dimensions:"),
            dcc.RadioItems(
                id='dimension-toggle',
                options=[
                    {'label': '2D', 'value': '2d'},
                    {'label': '3D', 'value': '3d'}
                ],
                value='3d',
                style={'margin-bottom': '20px'}
            ),

            dbc.Label("Show Prompts:"),
            dcc.Checklist(
                id='show-prompts-toggle',
                options=[{'label': 'Show prompts on plot', 'value': 'show'}],
                value=['show'],
                style={'margin-bottom': '20px'}
            ),

            html.H5("Point Details", className="mb-3"),
            html.Div(id='point-details', children="Click on a point to see details")

        ], width=3, style={'padding-right': '20px'}),

        # Main visualization area
        dbc.Col([
            dcc.Graph(
                id='embedding-plot',
                style={'height': '85vh', 'width': '100%'},
                config={'responsive': True, 'displayModeBar': True}
            )
        ], width=9)
    ]),

    dcc.Store(id='processed-data'),
    dcc.Store(id='processed-prompts')
], fluid=True)

@callback(
    Output('processed-data', 'data'),
    Input('upload-data', 'contents'),
    State('upload-data', 'filename')
)
def process_uploaded_file(contents, filename):
    if contents is None:
        return None

    try:
        documents = parse_ndjson(contents)
        embeddings = np.array([doc['embedding'] for doc in documents])

        # Store original embeddings and documents
        return {
            'documents': documents,
            'embeddings': embeddings.tolist()
        }
    except Exception as e:
        return {'error': str(e)}

@callback(
    Output('processed-prompts', 'data'),
    Input('upload-prompts', 'contents'),
    State('upload-prompts', 'filename')
)
def process_uploaded_prompts(contents, filename):
    if contents is None:
        return None

    try:
        prompts = parse_ndjson(contents)
        embeddings = np.array([prompt['embedding'] for prompt in prompts])

        # Store original embeddings and prompts
        return {
            'prompts': prompts,
            'embeddings': embeddings.tolist()
        }
    except Exception as e:
        return {'error': str(e)}

@callback(
    Output('embedding-plot', 'figure'),
    [Input('processed-data', 'data'),
     Input('processed-prompts', 'data'),
     Input('method-dropdown', 'value'),
     Input('color-dropdown', 'value'),
     Input('dimension-toggle', 'value'),
     Input('show-prompts-toggle', 'value')]
)
def update_plot(data, prompts_data, method, color_by, dimensions, show_prompts):
    if not data or 'error' in data:
        return go.Figure().add_annotation(
            text="Upload a valid NDJSON file to see visualization",
            xref="paper", yref="paper",
            x=0.5, y=0.5, xanchor='center', yanchor='middle',
            showarrow=False, font=dict(size=16)
        )

    # Prepare embeddings for dimensionality reduction
    doc_embeddings = np.array(data['embeddings'])
    all_embeddings = doc_embeddings
    has_prompts = prompts_data and 'error' not in prompts_data and prompts_data.get('prompts')

    if has_prompts:
        prompt_embeddings = np.array(prompts_data['embeddings'])
        all_embeddings = np.vstack([doc_embeddings, prompt_embeddings])

    n_components = 3 if dimensions == '3d' else 2

    # Apply dimensionality reduction to combined data
    reduced, variance_explained = apply_dimensionality_reduction(
        all_embeddings, method=method, n_components=n_components
    )

    # Split reduced embeddings back
    doc_reduced = reduced[:len(doc_embeddings)]
    prompt_reduced = reduced[len(doc_embeddings):] if has_prompts else None

    # Create dataframes
    doc_df_data = []
    for i, doc in enumerate(data['documents']):
        row = {
            'id': doc['id'],
            'text': doc['text'],
            'category': doc.get('category', 'Unknown'),
            'subcategory': doc.get('subcategory', 'Unknown'),
            'tags': doc.get('tags', []),
            'dim_1': doc_reduced[i, 0],
            'dim_2': doc_reduced[i, 1],
            'type': 'document'
        }
        if dimensions == '3d':
            row['dim_3'] = doc_reduced[i, 2]
        doc_df_data.append(row)

    doc_df = pd.DataFrame(doc_df_data)

    prompt_df = None
    if has_prompts and prompt_reduced is not None:
        prompt_df_data = []
        for i, prompt in enumerate(prompts_data['prompts']):
            row = {
                'id': prompt['id'],
                'text': prompt['text'],
                'category': prompt.get('category', 'Unknown'),
                'subcategory': prompt.get('subcategory', 'Unknown'),
                'tags': prompt.get('tags', []),
                'dim_1': prompt_reduced[i, 0],
                'dim_2': prompt_reduced[i, 1],
                'type': 'prompt'
            }
            if dimensions == '3d':
                row['dim_3'] = prompt_reduced[i, 2]
            prompt_df_data.append(row)

        prompt_df = pd.DataFrame(prompt_df_data)

    return create_dual_plot(doc_df, prompt_df, dimensions, color_by, method.upper(), show_prompts)

@callback(
    Output('point-details', 'children'),
    Input('embedding-plot', 'clickData'),
    [State('processed-data', 'data'),
     State('processed-prompts', 'data')]
)
def display_click_data(clickData, data, prompts_data):
    if not clickData or not data:
        return "Click on a point to see details"

    # Get point info from click
    point_data = clickData['points'][0]
    trace_name = point_data.get('fullData', {}).get('name', 'Documents')

    if 'pointIndex' in point_data:
        point_index = point_data['pointIndex']
    elif 'pointNumber' in point_data:
        point_index = point_data['pointNumber']
    else:
        return "Could not identify clicked point"

    # Determine which dataset this point belongs to
    if trace_name == 'Prompts' and prompts_data and 'prompts' in prompts_data:
        item = prompts_data['prompts'][point_index]
        item_type = 'Prompt'
    else:
        item = data['documents'][point_index]
        item_type = 'Document'

    return dbc.Card([
        dbc.CardBody([
            html.H5(f"{item_type}: {item['id']}", className="card-title"),
            html.P(f"Text: {item['text']}", className="card-text"),
            html.P(f"Category: {item.get('category', 'Unknown')}", className="card-text"),
            html.P(f"Subcategory: {item.get('subcategory', 'Unknown')}", className="card-text"),
            html.P(f"Tags: {', '.join(item.get('tags', [])) if item.get('tags') else 'None'}", className="card-text"),
            html.P(f"Type: {item_type}", className="card-text text-muted")
        ])
    ])

@callback(
    [Output('processed-data', 'data', allow_duplicate=True),
     Output('processed-prompts', 'data', allow_duplicate=True),
     Output('point-details', 'children', allow_duplicate=True)],
    Input('reset-button', 'n_clicks'),
    prevent_initial_call=True
)
def reset_data(n_clicks):
    if n_clicks is None or n_clicks == 0:
        return dash.no_update, dash.no_update, dash.no_update

    return None, None, "Click on a point to see details"

if __name__ == '__main__':
    app.run(debug=True)

bump_version.py (133 lines)

@@ -1,133 +0,0 @@
#!/usr/bin/env python3
"""
Version bump script for EmbeddingBuddy.
Automatically updates version in pyproject.toml following semantic versioning.
"""
import argparse
import re
import sys
from pathlib import Path


def get_current_version(pyproject_path: Path) -> str:
    """Extract current version from pyproject.toml."""
    content = pyproject_path.read_text()
    match = re.search(r'version\s*=\s*"([^"]+)"', content)
    if not match:
        raise ValueError("Could not find version in pyproject.toml")
    return match.group(1)


def parse_version(version_str: str) -> tuple[int, int, int]:
    """Parse semantic version string into major, minor, patch tuple."""
    match = re.match(r'(\d+)\.(\d+)\.(\d+)', version_str)
    if not match:
        raise ValueError(f"Invalid version format: {version_str}")
    return int(match.group(1)), int(match.group(2)), int(match.group(3))


def bump_version(current: str, bump_type: str) -> str:
    """Bump version based on type (major, minor, patch)."""
    major, minor, patch = parse_version(current)

    if bump_type == "major":
        return f"{major + 1}.0.0"
    elif bump_type == "minor":
        return f"{major}.{minor + 1}.0"
    elif bump_type == "patch":
        return f"{major}.{minor}.{patch + 1}"
    else:
        raise ValueError(f"Invalid bump type: {bump_type}")


def update_version_in_file(pyproject_path: Path, new_version: str) -> None:
    """Update version in pyproject.toml file."""
    content = pyproject_path.read_text()
    updated_content = re.sub(
        r'version\s*=\s*"[^"]+"',
        f'version = "{new_version}"',
        content
    )
    pyproject_path.write_text(updated_content)


def main():
    """Main version bump function."""
    parser = argparse.ArgumentParser(
        description="Bump version in pyproject.toml",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python bump_version.py patch   # 0.3.0 -> 0.3.1
  python bump_version.py minor   # 0.3.0 -> 0.4.0
  python bump_version.py major   # 0.3.0 -> 1.0.0
  python bump_version.py --set 1.2.3  # Set specific version

Semantic versioning guide:
  - patch: Bug fixes, no API changes
  - minor: New features, backward compatible
  - major: Breaking changes, not backward compatible
"""
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "bump_type",
        nargs="?",
        choices=["major", "minor", "patch"],
        help="Type of version bump"
    )
    group.add_argument(
        "--set",
        dest="set_version",
        help="Set specific version (e.g., 1.2.3)"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be changed without making changes"
    )

    args = parser.parse_args()

    # Find pyproject.toml
    pyproject_path = Path("pyproject.toml")
    if not pyproject_path.exists():
        print("❌ pyproject.toml not found in current directory")
        sys.exit(1)

    try:
        current_version = get_current_version(pyproject_path)
        print(f"📦 Current version: {current_version}")

        if args.set_version:
            # Validate the set version format
            parse_version(args.set_version)
            new_version = args.set_version
        else:
            new_version = bump_version(current_version, args.bump_type)

        print(f"🚀 New version: {new_version}")

        if args.dry_run:
            print("🔍 Dry run - no changes made")
        else:
            update_version_in_file(pyproject_path, new_version)
            print("✅ Version updated in pyproject.toml")
            print()
            print("💡 Next steps:")
            print("  1. Review changes: git diff")
            print("  2. Commit changes: git add . && git commit -m 'bump version to {}'".format(new_version))
            print("  3. Tag release: git tag v{}".format(new_version))

    except ValueError as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

@@ -1,69 +0,0 @@
services:
  opensearch:
    image: opensearchproject/opensearch:2
    container_name: embeddingbuddy-opensearch
    profiles:
      - opensearch
    environment:
      - cluster.name=embeddingbuddy-cluster
      - node.name=embeddingbuddy-node
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
      - "DISABLE_INSTALL_DEMO_CONFIG=true"
      - "DISABLE_SECURITY_PLUGIN=true"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - opensearch-data:/usr/share/opensearch/data
    ports:
      - "9200:9200"
      - "9600:9600"
    networks:
      - embeddingbuddy
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9200/_cluster/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s

  embeddingbuddy:
    build: .
    container_name: embeddingbuddy-app
    environment:
      - EMBEDDINGBUDDY_HOST=0.0.0.0
      - EMBEDDINGBUDDY_PORT=8050
      - EMBEDDINGBUDDY_DEBUG=false
      - OPENSEARCH_HOST=opensearch
      - OPENSEARCH_PORT=9200
      - OPENSEARCH_SCHEME=http
      - OPENSEARCH_VERIFY_CERTS=false
    ports:
      - "8050:8050"
    networks:
      - embeddingbuddy
    depends_on:
      opensearch:
        condition: service_healthy
        required: false
    healthcheck:
      test: ["CMD-SHELL", "python -c 'import requests; requests.get(\"http://localhost:8050/\", timeout=5)'"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    restart: unless-stopped

volumes:
  opensearch-data:
    driver: local

networks:
  embeddingbuddy:
    driver: bridge

Binary file not shown. (Image changed: 844 KiB before, 339 KiB after.)

@@ -1,157 +0,0 @@
# Elasticsearch/OpenSearch Sample Data

This directory contains sample data files in Elasticsearch bulk index format for testing the OpenSearch integration in EmbeddingBuddy.

## Files

### Original NDJSON Files

- `sample_data.ndjson` - Original sample documents in EmbeddingBuddy format
- `sample_prompts.ndjson` - Original sample prompts in EmbeddingBuddy format

### Elasticsearch Bulk Files

- `sample_data_es_bulk.ndjson` - Documents in ES bulk format (index: "embeddings")
- `sample_prompts_es_bulk.ndjson` - Prompts in ES bulk format (index: "prompts")

## Usage

### 1. Index the data using curl

```bash
# Index main documents
curl -X POST "localhost:9200/_bulk" \
  -H "Content-Type: application/x-ndjson" \
  --data-binary @sample_data_es_bulk.ndjson

# Index prompts
curl -X POST "localhost:9200/_bulk" \
  -H "Content-Type: application/x-ndjson" \
  --data-binary @sample_prompts_es_bulk.ndjson
```

### 2. Create proper mappings (recommended)

First create the indices with a proper `knn_vector` mapping:

```bash
# Create embeddings index with knn_vector mapping
curl -X PUT "localhost:9200/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "settings": {
      "index.knn": true
    },
    "mappings": {
      "properties": {
        "id": {"type": "keyword"},
        "embedding": {
          "type": "knn_vector",
          "dimension": 8,
          "method": {
            "engine": "lucene",
            "space_type": "cosinesimil",
            "name": "hnsw",
            "parameters": {}
          }
        },
        "text": {"type": "text"},
        "category": {"type": "keyword"},
        "subcategory": {"type": "keyword"},
        "tags": {"type": "keyword"}
      }
    }
  }'

# Create prompts index with the same knn_vector mapping
curl -X PUT "localhost:9200/prompts" \
  -H "Content-Type: application/json" \
  -d '{
    "settings": {
      "index.knn": true
    },
    "mappings": {
      "properties": {
        "id": {"type": "keyword"},
        "embedding": {
          "type": "knn_vector",
          "dimension": 8,
          "method": {
            "engine": "lucene",
            "space_type": "cosinesimil",
            "name": "hnsw",
            "parameters": {}
          }
        },
        "text": {"type": "text"},
        "category": {"type": "keyword"},
        "subcategory": {"type": "keyword"},
        "tags": {"type": "keyword"}
      }
    }
  }'
```

Then index the data using the bulk files above.
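The same bulk indexing can be done from Python with the `opensearch-py` client that the project already depends on; a minimal sketch, assuming the local unsecured node from the setup above:

```python
from opensearchpy import OpenSearch

client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}], use_ssl=False)

# The _bulk endpoint accepts the raw NDJSON payload as a single string.
for path in ["sample_data_es_bulk.ndjson", "sample_prompts_es_bulk.ndjson"]:
    with open(path) as f:
        response = client.bulk(body=f.read())
    print(path, "errors:", response["errors"])
```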
### 3. Test in EmbeddingBuddy

#### For "embeddings" index

- **OpenSearch URL**: `http://localhost:9200`
- **Index Name**: `embeddings`
- **Field Mapping**:
  - Embedding Field: `embedding`
  - Text Field: `text`
  - ID Field: `id`
  - Category Field: `category`
  - Subcategory Field: `subcategory`
  - Tags Field: `tags`

#### For "embeddings-dense" index (alternative field names)

- **OpenSearch URL**: `http://localhost:9200`
- **Index Name**: `embeddings-dense`
- **Field Mapping**:
  - Embedding Field: `vector`
  - Text Field: `content`
  - ID Field: `doc_id`
  - Category Field: `type`
  - Subcategory Field: `subtopic`
  - Tags Field: `keywords`

## Data Structure

### Original Format (from NDJSON files)

```json
{
  "id": "doc_001",
  "embedding": [0.2, -0.1, 0.8, 0.3, -0.5, 0.7, 0.1, -0.3],
  "text": "Machine learning algorithms are transforming healthcare...",
  "category": "technology",
  "subcategory": "healthcare",
  "tags": ["ai", "medicine", "prediction"]
}
```

### ES Bulk Format

```json
{"index": {"_index": "embeddings", "_id": "doc_001"}}
{"id": "doc_001", "embedding": [...], "text": "...", "category": "...", ...}
```

### Alternative Field Names (dense vector format)

```json
{"index": {"_index": "embeddings-dense", "_id": "doc_001"}}
{"doc_id": "doc_001", "vector": [...], "content": "...", "type": "...", ...}
```

## Notes

- All embedding vectors are 8-dimensional for these sample files
- The alternative format demonstrates how EmbeddingBuddy's field mapping handles different field names
- For production use, you may want larger embedding dimensions (e.g., 384, 768, 1536)
- The `dense_vector` (Elasticsearch) / `knn_vector` (OpenSearch) field type enables vector similarity search
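To see the `knn_vector` mapping in action outside EmbeddingBuddy, a minimal similarity-search sketch with `opensearch-py`, reusing the 8-dimensional vector of `doc_001` as the query vector:

```python
from opensearchpy import OpenSearch

client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}], use_ssl=False)

# k-NN query against the "embedding" field defined in the mapping above.
query = {
    "size": 3,
    "query": {
        "knn": {
            "embedding": {
                "vector": [0.2, -0.1, 0.8, 0.3, -0.5, 0.7, 0.1, -0.3],
                "k": 3,
            }
        }
    },
}
for hit in client.search(index="embeddings", body=query)["hits"]["hits"]:
    print(round(hit["_score"], 3), hit["_source"]["text"])
```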
@@ -1,2 +0,0 @@
��������{"id": "doc_001", "embedding": [0.1, -0.3, 0.7, 0.2], "text": "Binary junk at start"}
{"id": "doc_002", "embedding": [0.5, 0.1, -0.2, 0.8], "text": "Normal line"}�������
@@ -1,6 +0,0 @@
{"id": "doc_001", "embedding": [0.1, -0.3, 0.7, 0.2], "text": "First line"}

{"id": "doc_002", "embedding": [0.5, 0.1, -0.2, 0.8], "text": "After empty line"}


{"id": "doc_003", "embedding": [0.3, 0.4, 0.1, -0.1], "text": "After multiple empty lines"}
@@ -1,4 +0,0 @@
{"id": "doc_001", "embedding": [0.1, -0.3, 0.7, 0.2], "text": "4D embedding"}
{"id": "doc_002", "embedding": [0.5, 0.1, -0.2], "text": "3D embedding"}
{"id": "doc_003", "embedding": [0.3, 0.4, 0.1, -0.1, 0.8], "text": "5D embedding"}
{"id": "doc_004", "embedding": [0.2, 0.1], "text": "2D embedding"}
@@ -1,8 +0,0 @@
{"id": "doc_001", "embedding": "not_an_array", "text": "Embedding as string"}
{"id": "doc_002", "embedding": [0.1, "text", 0.7, 0.2], "text": "Mixed types in embedding"}
{"id": "doc_003", "embedding": [], "text": "Empty embedding array"}
{"id": "doc_004", "embedding": [0.1], "text": "Single dimension embedding"}
{"id": "doc_005", "embedding": null, "text": "Null embedding"}
{"id": "doc_006", "embedding": [0.1, 0.2, null, 0.4], "text": "Null value in embedding"}
{"id": "doc_007", "embedding": [0.1, 0.2, "NaN", 0.4], "text": "String NaN in embedding"}
{"id": "doc_008", "embedding": [0.1, 0.2, Infinity, 0.4], "text": "Infinity in embedding"}
@@ -1,5 +0,0 @@
{"id": "doc_001", "embedding": [0.1, -0.3, 0.7, "text": "Valid line"}
{"id": "doc_002", "embedding": [0.5, 0.1, -0.2, 0.8], "text": "Missing closing brace"
{"id": "doc_003" "embedding": [0.3, 0.4, 0.1, -0.1], "text": "Missing colon after id"}
{id: "doc_004", "embedding": [0.2, 0.1, 0.3, 0.4], "text": "Unquoted key"}
{"id": "doc_005", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "Valid line again"}
@@ -1,3 +0,0 @@
{"id": "doc_001", "text": "Sample text without embedding field", "category": "test"}
{"id": "doc_002", "text": "Another text without embedding", "category": "test"}
{"id": "doc_003", "text": "Third text missing embedding", "category": "test"}
@@ -1,3 +0,0 @@
{"id": "doc_001", "embedding": [0.1, -0.3, 0.7, 0.2], "category": "test"}
{"id": "doc_002", "embedding": [0.5, 0.1, -0.2, 0.8], "category": "test"}
{"id": "doc_003", "embedding": [0.3, 0.4, 0.1, -0.1], "category": "test"}
@@ -1,4 +0,0 @@
[
  {"id": "doc_001", "embedding": [0.1, -0.3, 0.7, 0.2], "text": "Regular JSON array"},
  {"id": "doc_002", "embedding": [0.5, 0.1, -0.2, 0.8], "text": "Instead of NDJSON"}
]
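Taken together, these deleted fixtures cover the failure modes an NDJSON loader has to survive: leading or trailing binary junk, blank lines, ragged embedding dimensions, non-numeric values, malformed JSON, missing fields, and a plain JSON array in place of NDJSON. A minimal tolerant reader in the spirit of what they exercise (a sketch, not EmbeddingBuddy's actual parser):

```python
import json
import math

def read_ndjson(path):
    """Yield well-formed documents, skipping lines that fail to parse or validate."""
    with open(path, encoding="utf-8", errors="replace") as f:
        for raw in f:
            line = raw.strip().strip("\ufffd")  # drop blank lines and junk bytes
            if not line:
                continue
            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                continue  # malformed JSON: skip the line rather than abort
            emb = doc.get("embedding")
            # Require a non-empty list of finite numbers (rejects null, "NaN", Infinity).
            if isinstance(emb, list) and emb and all(
                isinstance(x, (int, float)) and math.isfinite(x) for x in emb
            ):
                yield doc
```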
@@ -1,40 +0,0 @@
{"index": {"_index": "embeddings", "_id": "doc_001"}}
{"id": "doc_001", "embedding": [0.2, -0.1, 0.8, 0.3, -0.5, 0.7, 0.1, -0.3], "text": "Machine learning algorithms are transforming healthcare by enabling predictive analytics and personalized medicine.", "category": "technology", "subcategory": "healthcare", "tags": ["ai", "medicine", "prediction"]}
{"index": {"_index": "embeddings", "_id": "doc_002"}}
{"id": "doc_002", "embedding": [0.1, 0.4, -0.2, 0.6, 0.3, -0.4, 0.8, 0.2], "text": "Climate change poses significant challenges to global food security and agricultural sustainability.", "category": "environment", "subcategory": "agriculture", "tags": ["climate", "food", "sustainability"]}
{"index": {"_index": "embeddings", "_id": "doc_003"}}
{"id": "doc_003", "embedding": [-0.3, 0.7, 0.1, -0.2, 0.9, 0.4, -0.1, 0.5], "text": "The rise of electric vehicles is reshaping the automotive industry and urban transportation systems.", "category": "technology", "subcategory": "automotive", "tags": ["electric", "transport", "urban"]}
{"index": {"_index": "embeddings", "_id": "doc_004"}}
{"id": "doc_004", "embedding": [0.5, -0.6, 0.3, 0.8, -0.2, 0.1, 0.7, -0.4], "text": "Renewable energy sources like solar and wind are becoming increasingly cost-competitive with fossil fuels.", "category": "environment", "subcategory": "energy", "tags": ["renewable", "solar", "wind"]}
{"index": {"_index": "embeddings", "_id": "doc_005"}}
{"id": "doc_005", "embedding": [0.8, 0.2, -0.5, 0.1, 0.6, -0.3, 0.4, 0.9], "text": "Financial markets are experiencing volatility due to geopolitical tensions and inflation concerns.", "category": "finance", "subcategory": "markets", "tags": ["volatility", "inflation", "geopolitics"]}
{"index": {"_index": "embeddings", "_id": "doc_006"}}
{"id": "doc_006", "embedding": [-0.1, 0.5, 0.7, -0.4, 0.2, 0.8, -0.6, 0.3], "text": "Quantum computing research is advancing rapidly with potential applications in cryptography and drug discovery.", "category": "technology", "subcategory": "research", "tags": ["quantum", "cryptography", "research"]}
{"index": {"_index": "embeddings", "_id": "doc_007"}}
{"id": "doc_007", "embedding": [0.4, -0.3, 0.6, 0.7, -0.8, 0.2, 0.5, -0.1], "text": "Ocean pollution from plastic waste is threatening marine ecosystems and biodiversity worldwide.", "category": "environment", "subcategory": "marine", "tags": ["pollution", "plastic", "marine"]}
{"index": {"_index": "embeddings", "_id": "doc_008"}}
{"id": "doc_008", "embedding": [0.3, 0.8, -0.2, 0.5, 0.1, -0.7, 0.6, 0.4], "text": "Artificial intelligence is revolutionizing customer service through chatbots and automated support systems.", "category": "technology", "subcategory": "customer_service", "tags": ["ai", "chatbots", "automation"]}
{"index": {"_index": "embeddings", "_id": "doc_009"}}
{"id": "doc_009", "embedding": [-0.5, 0.3, 0.9, -0.1, 0.7, 0.4, -0.2, 0.8], "text": "Global supply chains are being redesigned for resilience after pandemic-related disruptions.", "category": "business", "subcategory": "logistics", "tags": ["supply_chain", "pandemic", "resilience"]}
{"index": {"_index": "embeddings", "_id": "doc_010"}}
{"id": "doc_010", "embedding": [0.7, -0.4, 0.2, 0.9, -0.3, 0.6, 0.1, -0.8], "text": "Space exploration missions are expanding our understanding of the solar system and potential for life.", "category": "science", "subcategory": "space", "tags": ["space", "exploration", "life"]}
{"index": {"_index": "embeddings", "_id": "doc_011"}}
{"id": "doc_011", "embedding": [-0.2, 0.6, 0.4, -0.7, 0.8, 0.3, -0.5, 0.1], "text": "Cryptocurrency adoption is growing among institutional investors despite regulatory uncertainties.", "category": "finance", "subcategory": "crypto", "tags": ["cryptocurrency", "institutional", "regulation"]}
{"index": {"_index": "embeddings", "_id": "doc_012"}}
{"id": "doc_012", "embedding": [0.6, 0.1, -0.8, 0.4, 0.5, -0.2, 0.9, -0.3], "text": "Remote work technologies are transforming traditional office environments and work-life balance.", "category": "technology", "subcategory": "workplace", "tags": ["remote", "work", "balance"]}
{"index": {"_index": "embeddings", "_id": "doc_013"}}
{"id": "doc_013", "embedding": [0.1, -0.7, 0.5, 0.8, -0.4, 0.3, 0.2, 0.6], "text": "Gene therapy breakthroughs are offering new hope for treating previously incurable genetic diseases.", "category": "science", "subcategory": "medicine", "tags": ["gene_therapy", "genetics", "medicine"]}
{"index": {"_index": "embeddings", "_id": "doc_014"}}
{"id": "doc_014", "embedding": [-0.4, 0.2, 0.7, -0.1, 0.9, -0.6, 0.3, 0.5], "text": "Urban planning is evolving to create more sustainable and livable cities for growing populations.", "category": "environment", "subcategory": "urban", "tags": ["urban_planning", "sustainability", "cities"]}
{"index": {"_index": "embeddings", "_id": "doc_015"}}
{"id": "doc_015", "embedding": [0.9, -0.1, 0.3, 0.6, -0.5, 0.8, -0.2, 0.4], "text": "Social media platforms are implementing new policies to combat misinformation and protect user privacy.", "category": "technology", "subcategory": "social_media", "tags": ["social_media", "misinformation", "privacy"]}
{"index": {"_index": "embeddings", "_id": "doc_016"}}
{"id": "doc_016", "embedding": [-0.3, 0.8, -0.1, 0.4, 0.7, -0.5, 0.6, -0.9], "text": "Educational technology is personalizing learning experiences and improving student outcomes.", "category": "education", "subcategory": "technology", "tags": ["education", "personalization", "technology"]}
{"index": {"_index": "embeddings", "_id": "doc_017"}}
{"id": "doc_017", "embedding": [0.5, 0.3, -0.6, 0.2, 0.8, 0.1, -0.4, 0.7], "text": "Biodiversity conservation efforts are critical for maintaining ecosystem balance and preventing species extinction.", "category": "environment", "subcategory": "conservation", "tags": ["biodiversity", "conservation", "extinction"]}
{"index": {"_index": "embeddings", "_id": "doc_018"}}
{"id": "doc_018", "embedding": [0.2, -0.8, 0.4, 0.7, -0.1, 0.5, 0.9, -0.3], "text": "Healthcare systems are adopting telemedicine to improve access and reduce costs for patients.", "category": "technology", "subcategory": "healthcare", "tags": ["telemedicine", "healthcare", "access"]}
{"index": {"_index": "embeddings", "_id": "doc_019"}}
{"id": "doc_019", "embedding": [-0.7, 0.4, 0.8, -0.2, 0.3, 0.6, -0.1, 0.9], "text": "Autonomous vehicles are being tested extensively with promises of safer and more efficient transportation.", "category": "technology", "subcategory": "automotive", "tags": ["autonomous", "safety", "efficiency"]}
{"index": {"_index": "embeddings", "_id": "doc_020"}}
{"id": "doc_020", "embedding": [0.4, 0.7, -0.3, 0.9, -0.6, 0.2, 0.5, -0.1], "text": "Mental health awareness is increasing with new approaches to therapy and workplace wellness programs.", "category": "health", "subcategory": "mental", "tags": ["mental_health", "therapy", "wellness"]}
@@ -1,20 +0,0 @@
{"index": {"_index": "prompts", "_id": "prompt_001"}}
{"id": "prompt_001", "embedding": [0.15, -0.28, 0.65, 0.42, -0.11, 0.33, 0.78, -0.52], "text": "Find articles about machine learning applications", "category": "search", "subcategory": "technology", "tags": ["AI", "research"]}
{"index": {"_index": "prompts", "_id": "prompt_002"}}
{"id": "prompt_002", "embedding": [0.72, 0.18, -0.35, 0.51, 0.09, -0.44, 0.27, 0.63], "text": "Show me product reviews for smartphones", "category": "search", "subcategory": "product", "tags": ["mobile", "reviews"]}
{"index": {"_index": "prompts", "_id": "prompt_003"}}
{"id": "prompt_003", "embedding": [-0.21, 0.59, 0.34, -0.67, 0.45, 0.12, -0.38, 0.76], "text": "What are the latest political developments?", "category": "search", "subcategory": "news", "tags": ["politics", "current events"]}
{"index": {"_index": "prompts", "_id": "prompt_004"}}
{"id": "prompt_004", "embedding": [0.48, -0.15, 0.72, 0.31, -0.58, 0.24, 0.67, -0.39], "text": "Summarize recent tech industry trends", "category": "analysis", "subcategory": "technology", "tags": ["tech", "trends", "summary"]}
{"index": {"_index": "prompts", "_id": "prompt_005"}}
{"id": "prompt_005", "embedding": [-0.33, 0.47, -0.62, 0.28, 0.71, -0.18, 0.54, 0.35], "text": "Compare different smartphone models", "category": "analysis", "subcategory": "product", "tags": ["comparison", "mobile", "evaluation"]}
{"index": {"_index": "prompts", "_id": "prompt_006"}}
{"id": "prompt_006", "embedding": [0.64, 0.21, 0.39, -0.45, 0.13, 0.58, -0.27, 0.74], "text": "Analyze voter sentiment on recent policies", "category": "analysis", "subcategory": "politics", "tags": ["sentiment", "politics", "analysis"]}
{"index": {"_index": "prompts", "_id": "prompt_007"}}
{"id": "prompt_007", "embedding": [0.29, -0.43, 0.56, 0.68, -0.22, 0.37, 0.14, -0.61], "text": "Generate a summary of machine learning research", "category": "generation", "subcategory": "technology", "tags": ["AI", "research", "summary"]}
{"index": {"_index": "prompts", "_id": "prompt_008"}}
{"id": "prompt_008", "embedding": [-0.17, 0.52, -0.48, 0.36, 0.74, -0.29, 0.61, 0.18], "text": "Create a product recommendation report", "category": "generation", "subcategory": "product", "tags": ["recommendation", "report", "analysis"]}
{"index": {"_index": "prompts", "_id": "prompt_009"}}
{"id": "prompt_009", "embedding": [0.55, 0.08, 0.41, -0.37, 0.26, 0.69, -0.14, 0.58], "text": "Write a news brief on election updates", "category": "generation", "subcategory": "news", "tags": ["election", "news", "brief"]}
{"index": {"_index": "prompts", "_id": "prompt_010"}}
{"id": "prompt_010", "embedding": [0.23, -0.59, 0.47, 0.61, -0.35, 0.18, 0.72, -0.26], "text": "Explain how neural networks work", "category": "explanation", "subcategory": "technology", "tags": ["AI", "education", "neural networks"]}
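The two bulk fixtures above are just the original NDJSON documents with an action line inserted before each record. A sketch of a converter that would produce them (the `to_bulk` helper is hypothetical, not part of the repository):

```python
import json

def to_bulk(src_path, dst_path, index_name):
    # Hypothetical helper: prefix each document with an ES/OpenSearch bulk
    # action line keyed by the document's own "id" field.
    with open(src_path) as src, open(dst_path, "w") as dst:
        for line in src:
            line = line.strip()
            if not line:
                continue
            doc = json.loads(line)
            dst.write(json.dumps({"index": {"_index": index_name, "_id": doc["id"]}}) + "\n")
            dst.write(line + "\n")

to_bulk("sample_data.ndjson", "sample_data_es_bulk.ndjson", "embeddings")
to_bulk("sample_prompts.ndjson", "sample_prompts_es_bulk.ndjson", "prompts")
```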
main.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from src.embeddingbuddy.app import create_app, run_app


def main():
    app = create_app()
    run_app(app)


if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
[project]
name = "embeddingbuddy"
version = "0.8.1"
version = "0.2.0"
description = "A Python Dash application for interactive exploration and visualization of embedding vectors through dimensionality reduction techniques."
readme = "README.md"
requires-python = ">=3.11"
@@ -12,37 +12,9 @@ dependencies = [
    "scikit-learn>=1.3.2",
    "dash-bootstrap-components>=1.5.0",
    "umap-learn>=0.5.8",
    "numba>=0.56.4",
    "openTSNE>=1.0.0",
    "mypy>=1.17.1",
    "opensearch-py>=3.0.0",
]

[project.scripts]
embeddingbuddy = "embeddingbuddy.cli:main"
embeddingbuddy-serve = "embeddingbuddy.app:serve"

[project.optional-dependencies]
test = [
    "pytest>=8.4.1",
    "pytest-cov>=4.1.0",
]
lint = [
    "ruff>=0.1.0",
    "mypy>=1.5.0",
]
security = [
    "bandit[toml]>=1.7.5",
    "safety>=2.3.0",
    "pip-audit>=2.6.0",
]
prod = [
    "gunicorn>=21.2.0",
]
dev = [
    "embeddingbuddy[test,lint,security]",
]
all = [
    "embeddingbuddy[test,lint,security,prod]",
]

[build-system]
@@ -54,6 +26,3 @@ where = ["src"]

[tool.setuptools.package-dir]
"" = "src"

[tool.setuptools.package-data]
embeddingbuddy = ["assets/**/*"]

@@ -1,3 +1,3 @@
"""EmbeddingBuddy - Interactive exploration and visualization of embedding vectors."""

__version__ = "0.1.0"
__version__ = "0.1.0"
@@ -1,216 +1,39 @@
"""
EmbeddingBuddy application factory and server functions.

This module contains the main application creation logic with imports
moved inside functions to avoid loading heavy dependencies at module level.
"""
import dash
import dash_bootstrap_components as dbc
from .config.settings import AppSettings
from .ui.layout import AppLayout
from .ui.callbacks.data_processing import DataProcessingCallbacks
from .ui.callbacks.visualization import VisualizationCallbacks
from .ui.callbacks.interactions import InteractionCallbacks


def create_app():
    """Create and configure the Dash application instance."""
    import os
    import dash
    import dash_bootstrap_components as dbc
    from .ui.layout import AppLayout
    from .ui.callbacks.data_processing import DataProcessingCallbacks
    from .ui.callbacks.visualization import VisualizationCallbacks
    from .ui.callbacks.interactions import InteractionCallbacks

    # Get the assets directory relative to this module
    module_dir = os.path.dirname(__file__)
    assets_path = os.path.join(module_dir, "assets")

    app = dash.Dash(
        __name__,
        title="EmbeddingBuddy",
        external_stylesheets=[
            dbc.themes.BOOTSTRAP,
        ],
        assets_folder=assets_path,
        meta_tags=[
            {
                "name": "description",
                "content": "Interactive embedding visualization tool for exploring high-dimensional vectors through dimensionality reduction techniques like PCA, t-SNE, and UMAP.",
            },
            {"name": "author", "content": "EmbeddingBuddy"},
            {
                "name": "keywords",
                "content": "embeddings, visualization, dimensionality reduction, PCA, t-SNE, UMAP, machine learning, data science",
            },
            {"name": "viewport", "content": "width=device-width, initial-scale=1.0"},
            {
                "property": "og:title",
                "content": "EmbeddingBuddy - Interactive Embedding Visualization",
            },
            {
                "property": "og:description",
                "content": "Explore and visualize embedding vectors through interactive 2D/3D plots with multiple dimensionality reduction techniques.",
            },
            {"property": "og:type", "content": "website"},
        ],
        __name__,
        external_stylesheets=[dbc.themes.BOOTSTRAP]
    )

    # Allow callbacks to components that are dynamically created in tabs
    app.config.suppress_callback_exceptions = True

    layout_manager = AppLayout()
    app.layout = layout_manager.create_layout()

    DataProcessingCallbacks()
    VisualizationCallbacks()
    InteractionCallbacks()

    # Register client-side callback for embedding generation
    _register_client_side_callbacks(app)

    return app


def _register_client_side_callbacks(app):
    """Register client-side callbacks for browser-based processing."""
    from dash import Input, Output, State

    # Client-side callback for embedding generation
    app.clientside_callback(
        """
        function(nClicks, textContent, modelName, tokenizationMethod, batchSize, category, subcategory) {
            if (!nClicks || !textContent || !textContent.trim()) {
                return window.dash_clientside.no_update;
            }

            console.log('🔍 Checking for Transformers.js...');
            console.log('window.dash_clientside:', typeof window.dash_clientside);
            console.log('window.dash_clientside.transformers:', typeof window.dash_clientside?.transformers);
            console.log('generateEmbeddings function:', typeof window.dash_clientside?.transformers?.generateEmbeddings);

            if (typeof window.dash_clientside !== 'undefined' &&
                typeof window.dash_clientside.transformers !== 'undefined' &&
                typeof window.dash_clientside.transformers.generateEmbeddings === 'function') {

                console.log('✅ Calling Transformers.js generateEmbeddings...');
                return window.dash_clientside.transformers.generateEmbeddings(
                    nClicks, textContent, modelName, tokenizationMethod, category, subcategory
                );
            }

            // More detailed error information
            let errorMsg = '❌ Transformers.js not available. ';
            if (typeof window.dash_clientside === 'undefined') {
                errorMsg += 'dash_clientside not found.';
            } else if (typeof window.dash_clientside.transformers === 'undefined') {
                errorMsg += 'transformers module not found.';
            } else if (typeof window.dash_clientside.transformers.generateEmbeddings !== 'function') {
                errorMsg += 'generateEmbeddings function not found.';
            }

            console.error(errorMsg);

            return [
                { error: 'Transformers.js not loaded. Please refresh the page and try again.' },
                false
            ];
        }
        """,
        [
            Output("embeddings-generated-trigger", "data"),
            Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
        ],
        [Input("generate-embeddings-btn", "n_clicks")],
        [
            State("text-input-area", "value"),
            State("model-selection", "value"),
            State("tokenization-method", "value"),
            State("batch-size", "value"),
            State("text-category", "value"),
            State("text-subcategory", "value"),
        ],
        prevent_initial_call=True,
    )


def run_app(app=None, debug=None, host=None, port=None):
    """Run the Dash application with specified settings."""
    from .config.settings import AppSettings

    if app is None:
        app = create_app()

    app.run(
        debug=debug if debug is not None else AppSettings.DEBUG,
        host=host if host is not None else AppSettings.HOST,
        port=port if port is not None else AppSettings.PORT,
        port=port if port is not None else AppSettings.PORT
    )


def serve(host=None, port=None, dev=False, debug=False):
    """Start the EmbeddingBuddy web server.

    Args:
        host: Host to bind to (default: 127.0.0.1)
        port: Port to bind to (default: 8050)
        dev: Development mode - enable debug logging and auto-reload (default: False)
        debug: Enable debug logging only, no auto-reload (default: False)
    """
    import os
    from .config.settings import AppSettings

    # Determine actual values to use
    actual_host = host if host is not None else AppSettings.HOST
    actual_port = port if port is not None else AppSettings.PORT

    # Determine mode
    # --dev takes precedence and enables both debug and auto-reload
    # --debug enables only debug logging
    # No flags = production mode (no debug, no auto-reload)
    use_reloader = dev
    use_debug = dev or debug

    # Only print startup messages in main process (not in Flask reloader)
    if not os.environ.get("WERKZEUG_RUN_MAIN"):
        from importlib.metadata import version

        try:
            pkg_version = version("embeddingbuddy")
        except Exception:
            pkg_version = "unknown"

        mode = "development" if dev else ("debug" if debug else "production")
        print(f"Starting EmbeddingBuddy v{pkg_version} in {mode} mode...")
        print("Loading dependencies (this may take a few seconds)...")
        print(f"Server will start at http://{actual_host}:{actual_port}")
        if use_reloader:
            print("Auto-reload enabled - server will restart on code changes")

if __name__ == '__main__':
    app = create_app()

    # Suppress Flask development server warning in production mode
    if not use_debug and not use_reloader:
        import warnings
        import logging

        # Suppress the werkzeug warning
        warnings.filterwarnings("ignore", message=".*development server.*")

        # Set werkzeug logger to ERROR level to suppress the warning
        werkzeug_logger = logging.getLogger("werkzeug")
        werkzeug_logger.setLevel(logging.ERROR)

    # Use Flask's built-in server with appropriate settings
    app.run(
        debug=use_debug, host=actual_host, port=actual_port, use_reloader=use_reloader
    )


def main():
    """Legacy entry point - redirects to cli module.

    This is kept for backward compatibility but the main CLI
    is now in embeddingbuddy.cli for faster startup.
    """
    from .cli import main as cli_main

    cli_main()


if __name__ == "__main__":
    main()
    run_app(app)
@@ -1,17 +0,0 @@
/* CSS override for transparent hover boxes in Plotly plots */

/* Make hover boxes transparent while preserving text readability */
.hovertext {
    fill-opacity: 0.8 !important;
    stroke-opacity: 1 !important;
}

/* Alternative selector for different Plotly versions */
g.hovertext > path {
    opacity: 0.8 !important;
}

/* Ensure text remains fully visible */
.hovertext text {
    opacity: 1 !important;
}
@@ -1,225 +0,0 @@
// Text input embedding generation using Transformers.js
// This module runs entirely in the browser for privacy and performance

// Global flags to track initialization
window.transformersLoading = false;
window.transformersLoaded = false;

class TransformersEmbedder {
    constructor() {
        this.extractor = null;
        this.currentModel = null;
        this.modelCache = new Map();
        this.isLoading = false;
    }

    async initializeModel(modelName = 'Xenova/all-MiniLM-L6-v2') {
        try {
            if (this.modelCache.has(modelName)) {
                this.extractor = this.modelCache.get(modelName);
                this.currentModel = modelName;
                return { success: true, model: modelName };
            }

            if (this.isLoading) {
                return { success: false, error: 'Model loading already in progress' };
            }

            this.isLoading = true;

            // Use globally loaded Transformers.js pipeline
            if (!window.transformers) {
                if (!window.transformersPipeline) {
                    // Wait for the pipeline to load
                    let attempts = 0;
                    while (!window.transformersPipeline && attempts < 50) { // Wait up to 5 seconds
                        await new Promise(resolve => setTimeout(resolve, 100));
                        attempts++;
                    }
                    if (!window.transformersPipeline) {
                        throw new Error('Transformers.js pipeline not available. Please refresh the page.');
                    }
                }
                window.transformers = { pipeline: window.transformersPipeline };
                window.transformersLoaded = true;
                console.log('✅ Using globally loaded Transformers.js pipeline');
            }

            this.extractor = await window.transformers.pipeline('feature-extraction', modelName);

            this.modelCache.set(modelName, this.extractor);
            this.currentModel = modelName;
            this.isLoading = false;

            return { success: true, model: modelName };
        } catch (error) {
            this.isLoading = false;
            console.error('Model initialization error:', error);
            return { success: false, error: error.message };
        }
    }

    async generateEmbeddings(texts, options = {}) {
        if (!this.extractor) {
            throw new Error('Model not initialized. Call initializeModel() first.');
        }

        if (!texts || texts.length === 0) {
            throw new Error('No texts provided for embedding generation.');
        }

        const embeddings = [];
        const defaultOptions = {
            pooling: 'mean',
            normalize: true,
            ...options
        };

        // Process in batches to avoid memory issues
        const batchSize = options.batchSize || 8;

        try {
            for (let i = 0; i < texts.length; i += batchSize) {
                const batch = texts.slice(i, i + batchSize);

                const batchResults = await Promise.all(
                    batch.map(text => {
                        if (!text || text.trim().length === 0) {
                            throw new Error('Empty text found in batch');
                        }
                        return this.extractor(text.trim(), defaultOptions);
                    })
                );

                // Convert tensor output to arrays
                batchResults.forEach((result, idx) => {
                    if (result && result.data) {
                        embeddings.push(Array.from(result.data));
                    } else {
                        throw new Error(`Invalid embedding result for text: ${batch[idx]}`);
                    }
                });
            }

            return embeddings;
        } catch (error) {
            console.error('Embedding generation error:', error);
            throw error;
        }
    }
}

// Global instance
window.transformersEmbedder = new TransformersEmbedder();
console.log('📦 TransformersEmbedder instance created');

// Dash clientside callback functions
window.dash_clientside = window.dash_clientside || {};
console.log('🔧 Setting up window.dash_clientside.transformers');
window.dash_clientside.transformers = {
    generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
        console.log('🚀 generateEmbeddings called with:', { nClicks, modelName, tokenizationMethod, textLength: textContent?.length });

        if (!nClicks || !textContent || textContent.trim().length === 0) {
            console.log('⚠️ Early return - missing required parameters');
            return window.dash_clientside.no_update;
        }

        try {
            // Initialize model if needed
            const initResult = await window.transformersEmbedder.initializeModel(modelName);
            if (!initResult.success) {
                return [
                    { error: `Model loading error: ${initResult.error}` },
                    false
                ];
            }

            // Tokenize text based on method
            let textChunks;
            const trimmedText = textContent.trim();

            switch (tokenizationMethod) {
                case 'sentence':
                    textChunks = trimmedText
                        .split(/[.!?]+/)
                        .map(s => s.trim())
                        .filter(s => s.length > 0);
                    break;
                case 'paragraph':
                    textChunks = trimmedText
                        .split(/\n\s*\n/)
                        .map(s => s.trim())
                        .filter(s => s.length > 0);
                    break;
                case 'manual':
                    textChunks = trimmedText
                        .split('\n')
                        .map(s => s.trim())
                        .filter(s => s.length > 0);
                    break;
                default:
                    textChunks = [trimmedText];
            }

            if (textChunks.length === 0) {
                return [
                    { error: 'No valid text chunks found after tokenization' },
                    false
                ];
            }

            // Generate embeddings
            const embeddings = await window.transformersEmbedder.generateEmbeddings(textChunks);

            if (!embeddings || embeddings.length !== textChunks.length) {
                return [
                    { error: 'Embedding generation failed' },
                    false
                ];
            }

            // Create documents structure
            const documents = textChunks.map((text, i) => ({
                id: `text_input_${Date.now()}_${i}`,
                text: text,
                embedding: embeddings[i],
                category: category || "Text Input",
                subcategory: subcategory || "Generated",
                tags: []
            }));

            // Return the successful embeddings data
            const embeddingsData = {
                documents: documents,
                embeddings: embeddings
            };

            console.log('✅ Embeddings generated successfully:', embeddingsData);

            return [
                embeddingsData,
                false
            ];

        } catch (error) {
            console.error('Client-side embedding error:', error);
            return [
                { error: error.message },
                false
            ];
        }
    }
};

console.log('✅ Transformers.js client-side setup complete');
console.log('Available:', {
    transformersEmbedder: !!window.transformersEmbedder,
    dashClientside: !!window.dash_clientside,
    transformersModule: !!window.dash_clientside?.transformers,
    generateFunction: typeof window.dash_clientside?.transformers?.generateEmbeddings,
    processAsync: typeof window.processEmbeddingsAsync
});
src/embeddingbuddy/assets/fontawesome.css (vendored, 2 lines)
@@ -1,2 +0,0 @@
/* Load Font Awesome from local assets */
@import url("/assets/fontawesome/css/all.min.css");
@@ -1,165 +0,0 @@
Fonticons, Inc. (https://fontawesome.com)

--------------------------------------------------------------------------------

Font Awesome Free License

Font Awesome Free is free, open source, and GPL friendly. You can use it for
commercial projects, open source projects, or really almost whatever you want.
Full Font Awesome Free license: https://fontawesome.com/license/free.

--------------------------------------------------------------------------------

# Icons: CC BY 4.0 License (https://creativecommons.org/licenses/by/4.0/)

The Font Awesome Free download is licensed under a Creative Commons
Attribution 4.0 International License and applies to all icons packaged
as SVG and JS file types.

--------------------------------------------------------------------------------

# Fonts: SIL OFL 1.1 License

In the Font Awesome Free download, the SIL OFL license applies to all icons
packaged as web and desktop font files.

Copyright (c) 2023 Fonticons, Inc. (https://fontawesome.com)
with Reserved Font Name: "Font Awesome".

This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at:
http://scripts.sil.org/OFL

SIL OPEN FONT LICENSE
Version 1.1 - 26 February 2007

PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font creation
efforts of academic and linguistic communities, and to provide a free and
open framework in which fonts may be shared and improved in partnership
with others.

The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.

DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.

"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).

"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).

"Modified Version" refers to any derivative made by adding to, deleting,
or substituting — in part or in whole — any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.

"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.

PERMISSION & CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:

1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.

2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.

3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.

4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.

5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.

TERMINATION
This license becomes null and void if any of the above conditions are
not met.

DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

--------------------------------------------------------------------------------

# Code: MIT License (https://opensource.org/licenses/MIT)

In the Font Awesome Free download, the MIT license applies to all non-font and
non-icon files.

Copyright 2023 Fonticons, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in the
Software without restriction, including without limitation the rights to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so, subject to the
following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------

# Attribution

Attribution is required by MIT, SIL OFL, and CC BY licenses. Downloaded Font
Awesome Free files already contain embedded comments with sufficient
attribution, so you shouldn't need to do anything additional when using these
files normally.

We've kept attribution comments terse, so we ask that you do not actively work
to remove them from files, especially code. They're a great way for folks to
learn about Font Awesome.

--------------------------------------------------------------------------------

# Brand Icons

All brand icons are trademarks of their respective owners. The use of these
trademarks does not indicate endorsement of the trademark holder by Font
Awesome, nor vice versa. **Please do not use brand logos for any purpose except
to represent the company, product, or service to which they refer.**
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,9 +0,0 @@
{
  "name": "embeddingbuddy-assets",
  "version": "1.0.0",
  "description": "JavaScript dependencies for EmbeddingBuddy text input functionality",
  "dependencies": {
    "@huggingface/transformers": "^3.0.0"
  },
  "type": "module"
}
@@ -1,106 +0,0 @@
The sun peeked through the clouds after a drizzly morning.
A gentle breeze rustled the leaves as we walked along the shoreline.
Heavy rains caused flooding in several low-lying neighborhoods.
It was so hot that even the birds sought shade under the palm trees.
By midnight, the temperature had dropped below freezing.
Thunderstorms lit up the sky with flashes of lightning.
A thick fog settled over the city streets at dawn.
The air smelled of ozone after the sudden hailstorm.
I watched the snowflakes drift silently onto the ground.
A double rainbow appeared after the rain shower.
The humidity soared to uncomfortable levels by midday.
Dust devils formed in the dry desert plains.
The barometer readings indicated an approaching front.
A sudden gust of wind knocked over the garden chairs.
Light drizzle turned into a torrential downpour within minutes.
The new smartphone features a foldable display and 5G connectivity.
In the world of AI, transformers have revolutionized natural language processing.
Quantum computing promises to solve problems beyond classical computers' reach.
Blockchain technology is being explored for secure voting systems.
Virtual reality headsets are becoming more affordable and accessible.
The rise of electric vehicles is reshaping the automotive industry.
Cloud computing allows businesses to scale resources dynamically.
Machine learning algorithms can now predict stock market trends with surprising accuracy.
Augmented reality applications are transforming retail experiences.
The Internet of Things connects everyday devices to the web for smarter living.
Cybersecurity threats are evolving, requiring constant vigilance.
3D printing is enabling rapid prototyping and custom manufacturing.
Edge computing reduces latency by processing data closer to the source.
Biometric authentication methods are enhancing security in devices.
Wearable technology is tracking health metrics in real-time.
Artificial intelligence is being used to create realistic deepfakes.
Preheat the oven to 375°F before you start mixing the batter.
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
A pinch of saffron adds a beautiful color and aroma to traditional paella.
If the soup is too salty, add a peeled potato to absorb excess sodium.
Let the bread dough rise for at least an hour in a warm, draft-free spot.
Marinate the chicken overnight in a blend of citrus and spices.
Use a cast-iron skillet to sear the steak on high heat.
Whisk the egg whites until they form stiff peaks.
Fold in the chocolate chips gently to keep the batter airy.
Brush the pastry with an egg wash for a golden finish.
Slow-roast the pork shoulder until it falls off the bone.
Garnish the salad with toasted nuts and fresh herbs.
Deglaze the pan with white wine for a rich sauce.
Simmer the curry paste until the aroma intensifies.
Let the risotto rest before serving to thicken slightly.
He dribbled past two defenders and sank a three-pointer at the buzzer.
The marathon runner kept a steady pace despite the sweltering heat.
Their home team clinched the championship with a last-minute goal.
NASCAR fans cheered as the cars roared around the oval track.
She landed a perfect triple axel at the figure skating championship.
The cyclist pedaled up the steep hill in record time.
He pitched a no-hitter during the high school baseball game.
The quarterback threw a touchdown pass under heavy pressure.
They scored a hat-trick in the hockey final.
The boxer delivered a swift uppercut in the final round.
Surfers caught massive waves at dawn on the Pacific coast.
Fans erupted when the underdog scored the winning goal.
The swimmer broke the national record in the 200m freestyle.
The gymnast executed a flawless routine on the balance beam.
The rugby team celebrated their victory with a traditional haka.
The stock market rallied after positive earnings reports.
Investors are closely watching interest rate changes by the Federal Reserve.
Cryptocurrency prices have been extremely volatile this year.
Diversification is key to managing investment risk effectively.
Inflation rates have reached a 40-year high, impacting consumer spending.
Many companies are adopting ESG criteria to attract socially conscious investors.
The bond market is reacting to geopolitical tensions and supply chain disruptions.
Venture capital funding for startups has surged in the tech sector.
Exchange-traded funds (ETFs) offer a way to invest in diversified portfolios.
The global economy is recovering from the pandemic, but challenges remain.
Central banks are exploring digital currencies to modernize payment systems.
Retail investors are increasingly participating in the stock market through apps.
Hedge funds are using complex algorithms to gain an edge in trading.
Real estate prices have skyrocketed in urban areas due to low inventory.
The startup raised $10 million in its Series A funding round.
The symphony orchestra played a hauntingly beautiful melody.
She strummed her guitar softly, filling the room with a warm sound.
The DJ mixed tracks seamlessly, keeping the crowd dancing all night.
His voice soared during the high notes of the ballad.
The band played an acoustic set in the intimate coffee shop.
Jazz musicians often improvise solos based on the chord changes.
The opera singer hit the high C with perfect pitch.
The choir harmonized beautifully, filling the church with sound.
He composed a symphony that was performed at the concert hall.
The singer-songwriter wrote heartfelt lyrics about love and loss.
The rock band headlined the festival, drawing a massive crowd.
Hip-hop artists use rhythm and rhyme to tell powerful stories.
The violinist played a virtuosic solo that left the audience in awe.
Folk music often reflects the culture and traditions of a community.
The gospel choir lifted spirits with their uplifting performance.
The fall of the Berlin Wall in 1989 marked the end of the Cold War.
Ancient Egypt's pyramids are a testament to their architectural prowess.
Europe's Renaissance period sparked a revival in art and science.
The signing of the Declaration of Independence in 1776 established the United States.
The Industrial Revolution transformed economies and societies worldwide.
Rome was the center of a vast empire that influenced law and governance.
The discovery of the New World by Christopher Columbus in 1492 changed global trade.
The French Revolution in 1789 led to significant political and social change.
World War II was a global conflict that reshaped international relations.
The fall of the Roman Empire in 476 AD marked the beginning of the Middle Ages.
The invention of the printing press revolutionized the spread of knowledge.
The Cold War was characterized by political tension between the U.S. and the Soviet Union.
The ancient Silk Road connected East and West through trade routes.
The signing of the Magna Carta in 1215 established principles of due process.
Exploration during the Age of Discovery expanded European empires across the globe.
@@ -1,188 +0,0 @@
|
||||
// Simple script to load Transformers.js from CDN and initialize embedding functionality
|
||||
// This approach uses traditional script loading instead of ES6 modules
|
||||
|
||||
console.log('🔧 Transformers.js loader starting...');
|
||||
|
||||
// Global state
|
||||
window.transformersLibraryLoaded = false;
|
||||
window.transformersLibraryLoading = false;
|
||||
|
||||
// Function to dynamically load a script
|
||||
function loadScript(src) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const script = document.createElement('script');
|
||||
script.src = src;
|
||||
script.type = 'module';
|
||||
script.onload = () => resolve();
|
||||
script.onerror = () => reject(new Error(`Failed to load script: ${src}`));
|
||||
document.head.appendChild(script);
|
||||
});
|
||||
}
|
||||
|
||||
// Function to initialize Transformers.js
|
||||
async function initializeTransformers() {
    if (window.transformersLibraryLoaded) {
        console.log('✅ Transformers.js already loaded');
        return true;
    }

    if (window.transformersLibraryLoading) {
        console.log('⏳ Transformers.js already loading, waiting...');
        // Wait for loading to complete
        while (window.transformersLibraryLoading) {
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        return window.transformersLibraryLoaded;
    }

    window.transformersLibraryLoading = true;

    try {
        console.log('📦 Loading Transformers.js from CDN...');

        // Use dynamic import since this is more reliable with ES modules
        const transformers = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0');
        window.transformersLibrary = transformers;
        window.transformersLibraryLoaded = true;

        console.log('✅ Transformers.js loaded successfully');
        return true;
    } catch (error) {
        console.error('❌ Failed to load Transformers.js:', error);
        return false;
    } finally {
        window.transformersLibraryLoading = false;
    }
}

// Simple embeddings class
class SimpleEmbedder {
    constructor() {
        this.pipeline = null;
        this.modelCache = new Map();
    }

    async generateEmbeddings(texts, modelName = 'Xenova/all-MiniLM-L6-v2') {
        console.log('🔄 Generating embeddings for', texts.length, 'texts with model', modelName);

        // Ensure Transformers.js is loaded
        if (!window.transformersLibraryLoaded) {
            const loaded = await initializeTransformers();
            if (!loaded) {
                throw new Error('Failed to load Transformers.js');
            }
        }

        // Create pipeline if not cached
        if (!this.modelCache.has(modelName)) {
            console.log('🏗️ Creating pipeline for', modelName);
            const { pipeline } = window.transformersLibrary;
            this.pipeline = await pipeline('feature-extraction', modelName);
            this.modelCache.set(modelName, this.pipeline);
        } else {
            this.pipeline = this.modelCache.get(modelName);
        }

        // Generate embeddings
        const embeddings = [];
        for (let i = 0; i < texts.length; i++) {
            console.log(`Processing text ${i + 1}/${texts.length}...`);
            const result = await this.pipeline(texts[i], { pooling: 'mean', normalize: true });
            embeddings.push(Array.from(result.data));
        }

        console.log('✅ Generated', embeddings.length, 'embeddings');
        return embeddings;
    }
}

// Create global instance
window.simpleEmbedder = new SimpleEmbedder();

// Set up Dash clientside callbacks
window.dash_clientside = window.dash_clientside || {};
window.dash_clientside.transformers = {
    generateEmbeddings: async function(nClicks, textContent, modelName, tokenizationMethod, category, subcategory) {
        console.log('🚀 Client-side generateEmbeddings called');

        if (!nClicks || !textContent || textContent.trim().length === 0) {
            console.log('⚠️ Missing required parameters');
            return window.dash_clientside.no_update;
        }

        try {
            // Ensure Transformers.js is loaded
            if (!window.transformersLibraryLoaded) {
                const loaded = await initializeTransformers();
                if (!loaded) {
                    return [
                        { error: 'Failed to load Transformers.js' },
                        false
                    ];
                }
            }

            // Tokenize text
            let textChunks;
            const trimmedText = textContent.trim();

            switch (tokenizationMethod) {
                case 'sentence':
                    textChunks = trimmedText.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 0);
                    break;
                case 'paragraph':
                    textChunks = trimmedText.split(/\n\s*\n/).map(s => s.trim()).filter(s => s.length > 0);
                    break;
                case 'manual':
                    textChunks = trimmedText.split('\n').map(s => s.trim()).filter(s => s.length > 0);
                    break;
                default:
                    textChunks = [trimmedText];
            }

            if (textChunks.length === 0) {
                return [
                    { error: 'No valid text chunks after tokenization' },
                    false
                ];
            }

            // Generate embeddings
            const embeddings = await window.simpleEmbedder.generateEmbeddings(textChunks, modelName);

            // Create documents
            const documents = textChunks.map((text, i) => ({
                id: `text_input_${Date.now()}_${i}`,
                text: text,
                embedding: embeddings[i],
                category: category || "Text Input",
                subcategory: subcategory || "Generated",
                tags: []
            }));

            // Return the successful embeddings data
            const embeddingsData = {
                documents: documents,
                embeddings: embeddings
            };

            console.log('✅ Embeddings generated successfully:', embeddingsData);

            return [
                embeddingsData,
                false
            ];

        } catch (error) {
            console.error('❌ Error generating embeddings:', error);
            return [
                { error: error.message },
                false
            ];
        }
    }
};

console.log('✅ Simple Transformers.js setup complete');
console.log('Available functions:', Object.keys(window.dash_clientside.transformers));
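On the Python side, a clientside callback like this is typically wired up with dash.clientside_callback pointing at the namespace and function name defined above. A minimal sketch follows; only "generate-embeddings-btn", "text-input-area", and "model-selection" appear elsewhere in this changeset, so the remaining component IDs are assumptions:

from dash import clientside_callback, ClientsideFunction, Output, Input, State

clientside_callback(
    ClientsideFunction(namespace="transformers", function_name="generateEmbeddings"),
    [Output("generated-embeddings", "data"),   # hypothetical dcc.Store for the results
     Output("embeddings-loading", "data")],    # hypothetical flag matching the trailing `false`
    Input("generate-embeddings-btn", "n_clicks"),
    State("text-input-area", "value"),
    State("model-selection", "value"),
    State("tokenization-method", "value"),     # assumed ID
    State("text-category", "value"),           # assumed ID
    State("text-subcategory", "value"),        # assumed ID
    prevent_initial_call=True,
)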
@@ -1,67 +0,0 @@
"""
Lightweight CLI entry point for EmbeddingBuddy.

This module provides a fast command-line interface that only imports
heavy dependencies when actually needed by subcommands.
"""

import argparse
import sys


def main():
    """Main CLI entry point with minimal imports for fast help text."""
    parser = argparse.ArgumentParser(
        prog="embeddingbuddy",
        description="EmbeddingBuddy - Interactive embedding visualization tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  embeddingbuddy serve                 # Production mode (no debug, no auto-reload)
  embeddingbuddy serve --dev           # Development mode (debug + auto-reload)
  embeddingbuddy serve --debug         # Debug logging only (no auto-reload)
  embeddingbuddy serve --port 8080     # Custom port
  embeddingbuddy serve --host 0.0.0.0  # Bind to all interfaces
        """,
    )

    subparsers = parser.add_subparsers(
        dest="command", help="Available commands", metavar="<command>"
    )

    # Serve subcommand
    serve_parser = subparsers.add_parser(
        "serve",
        help="Start the web server",
        description="Start the EmbeddingBuddy web server for interactive visualization",
    )
    serve_parser.add_argument(
        "--host", default=None, help="Host to bind to (default: 127.0.0.1)"
    )
    serve_parser.add_argument(
        "--port", type=int, default=None, help="Port to bind to (default: 8050)"
    )
    serve_parser.add_argument(
        "--dev",
        action="store_true",
        help="Development mode: enable debug logging and auto-reload",
    )
    serve_parser.add_argument(
        "--debug", action="store_true", help="Enable debug logging (no auto-reload)"
    )

    args = parser.parse_args()

    if args.command == "serve":
        # Only import heavy dependencies when actually running serve
        from embeddingbuddy.app import serve

        serve(host=args.host, port=args.port, dev=args.dev, debug=args.debug)
    else:
        # No command specified, show help
        parser.print_help()
        sys.exit(0)


if __name__ == "__main__":
    main()
@@ -3,185 +3,105 @@ import os
class AppSettings:

    # UI Configuration
    UPLOAD_STYLE = {
        "width": "100%",
        "height": "60px",
        "lineHeight": "60px",
        "borderWidth": "1px",
        "borderStyle": "dashed",
        "borderRadius": "5px",
        "textAlign": "center",
        "margin-bottom": "20px",
        'width': '100%',
        'height': '60px',
        'lineHeight': '60px',
        'borderWidth': '1px',
        'borderStyle': 'dashed',
        'borderRadius': '5px',
        'textAlign': 'center',
        'margin-bottom': '20px'
    }

    PROMPTS_UPLOAD_STYLE = {**UPLOAD_STYLE, "borderColor": "#28a745"}

    PLOT_CONFIG = {"responsive": True, "displayModeBar": True}

    PLOT_STYLE = {"height": "85vh", "width": "100%"}

    PROMPTS_UPLOAD_STYLE = {
        **UPLOAD_STYLE,
        'borderColor': '#28a745'
    }

    PLOT_CONFIG = {
        'responsive': True,
        'displayModeBar': True
    }

    PLOT_STYLE = {
        'height': '85vh',
        'width': '100%'
    }

    PLOT_LAYOUT_CONFIG = {
        "height": None,
        "autosize": True,
        "margin": dict(l=0, r=0, t=50, b=0),
        'height': None,
        'autosize': True,
        'margin': dict(l=0, r=0, t=50, b=0)
    }

    # Dimensionality Reduction Settings
    DEFAULT_N_COMPONENTS_3D = 3
    DEFAULT_N_COMPONENTS_2D = 2
    DEFAULT_RANDOM_STATE = 42

    # Available Methods
    REDUCTION_METHODS = [
        {"label": "PCA", "value": "pca"},
        {"label": "t-SNE", "value": "tsne"},
        {"label": "UMAP", "value": "umap"},
        {'label': 'PCA', 'value': 'pca'},
        {'label': 't-SNE', 'value': 'tsne'},
        {'label': 'UMAP', 'value': 'umap'}
    ]

    COLOR_OPTIONS = [
        {"label": "Category", "value": "category"},
        {"label": "Subcategory", "value": "subcategory"},
        {"label": "Tags", "value": "tags"},
        {'label': 'Category', 'value': 'category'},
        {'label': 'Subcategory', 'value': 'subcategory'},
        {'label': 'Tags', 'value': 'tags'}
    ]

    DIMENSION_OPTIONS = [{"label": "2D", "value": "2d"}, {"label": "3D", "value": "3d"}]

    DIMENSION_OPTIONS = [
        {'label': '2D', 'value': '2d'},
        {'label': '3D', 'value': '3d'}
    ]

    # Default Values
    DEFAULT_METHOD = "pca"
    DEFAULT_COLOR_BY = "category"
    DEFAULT_DIMENSIONS = "3d"
    DEFAULT_SHOW_PROMPTS = ["show"]

    DEFAULT_METHOD = 'pca'
    DEFAULT_COLOR_BY = 'category'
    DEFAULT_DIMENSIONS = '3d'
    DEFAULT_SHOW_PROMPTS = ['show']

    # Plot Marker Settings
    DOCUMENT_MARKER_SIZE_2D = 8
    DOCUMENT_MARKER_SIZE_3D = 5
    PROMPT_MARKER_SIZE_2D = 10
    PROMPT_MARKER_SIZE_3D = 6

    DOCUMENT_MARKER_SYMBOL = "circle"
    PROMPT_MARKER_SYMBOL = "diamond"

    DOCUMENT_MARKER_SYMBOL = 'circle'
    PROMPT_MARKER_SYMBOL = 'diamond'

    DOCUMENT_OPACITY = 1.0
    PROMPT_OPACITY = 0.8

    # Text Processing
    TEXT_PREVIEW_LENGTH = 100

    # App Configuration
    DEBUG = os.getenv("EMBEDDINGBUDDY_DEBUG", "False").lower() == "true"
    HOST = os.getenv("EMBEDDINGBUDDY_HOST", "127.0.0.1")
    PORT = int(os.getenv("EMBEDDINGBUDDY_PORT", "8050"))

    # Environment Configuration
    ENVIRONMENT = os.getenv(
        "EMBEDDINGBUDDY_ENV", "development"
    )  # development, production

    # WSGI Server Configuration (for production)
    GUNICORN_WORKERS = int(os.getenv("GUNICORN_WORKERS", "4"))
    GUNICORN_BIND = os.getenv("GUNICORN_BIND", f"{HOST}:{PORT}")
    GUNICORN_TIMEOUT = int(os.getenv("GUNICORN_TIMEOUT", "120"))
    GUNICORN_KEEPALIVE = int(os.getenv("GUNICORN_KEEPALIVE", "5"))

    # OpenSearch Configuration
    OPENSEARCH_ENABLED = (
        os.getenv("EMBEDDINGBUDDY_OPENSEARCH_ENABLED", "True").lower() == "true"
    )
    OPENSEARCH_DEFAULT_SIZE = 100
    OPENSEARCH_SAMPLE_SIZE = 5
    OPENSEARCH_CONNECTION_TIMEOUT = 30
    OPENSEARCH_VERIFY_CERTS = True

    # Text Input / Transformers.js Configuration
    DEFAULT_EMBEDDING_MODEL = "Xenova/all-mpnet-base-v2"
    MAX_TEXT_LENGTH = 50000  # Characters (browser memory limits)
    DEFAULT_TOKENIZATION_METHOD = "sentence"
    MAX_BATCH_SIZE = 8  # Process in smaller batches for memory management

    # Available Transformers.js compatible models
    AVAILABLE_MODELS = [
        {
            "name": "Xenova/all-mpnet-base-v2",
            "label": "All-MPNet-Base-v2 (Quality, 768d)",
            "description": "Higher quality embeddings with better semantic understanding",
            "dimensions": 768,
            "size": "109 MB",
            "context_length": 512,
            "multilingual": False,
            "default": True,
        },
        {
            "name": "Xenova/all-MiniLM-L6-v2",
            "label": "All-MiniLM-L6-v2 (Fast, 384d)",
            "description": "Lightweight model, good for quick testing and general purpose",
            "dimensions": 384,
            "size": "23 MB",
            "context_length": 512,
            "multilingual": False,
            "default": False,
        },
        {
            "name": "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
            "label": "Multilingual MiniLM (50+ languages)",
            "description": "Support for multiple languages with good performance",
            "dimensions": 384,
            "size": "127 MB",
            "context_length": 512,
            "multilingual": True,
        },
        {
            "name": "Xenova/bge-small-en-v1.5",
            "label": "BGE Small English (High quality, 384d)",
            "description": "Beijing Academy of AI model with excellent performance on retrieval tasks",
            "dimensions": 384,
            "size": "67 MB",
            "context_length": 512,
            "multilingual": False,
        },
        {
            "name": "Xenova/gte-small",
            "label": "GTE Small (General Text Embeddings, 384d)",
            "description": "Alibaba's general text embedding model, balanced performance",
            "dimensions": 384,
            "size": "67 MB",
            "context_length": 512,
            "multilingual": False,
        },
    ]

    # Browser compatibility requirements
    SUPPORTED_BROWSERS = {
        "chrome": ">=88",
        "firefox": ">=92",
        "safari": ">=15.4",
        "edge": ">=88",
    }

    DEBUG = os.getenv('EMBEDDINGBUDDY_DEBUG', 'True').lower() == 'true'
    HOST = os.getenv('EMBEDDINGBUDDY_HOST', '127.0.0.1')
    PORT = int(os.getenv('EMBEDDINGBUDDY_PORT', '8050'))

    # Bootstrap Theme
    EXTERNAL_STYLESHEETS = [
        "https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
    ]

    EXTERNAL_STYLESHEETS = ['https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css']

    @classmethod
    def get_plot_marker_config(
        cls, dimensions: str, is_prompt: bool = False
    ) -> Dict[str, Any]:
    def get_plot_marker_config(cls, dimensions: str, is_prompt: bool = False) -> Dict[str, Any]:
        if is_prompt:
            size = (
                cls.PROMPT_MARKER_SIZE_3D
                if dimensions == "3d"
                else cls.PROMPT_MARKER_SIZE_2D
            )
            size = cls.PROMPT_MARKER_SIZE_3D if dimensions == '3d' else cls.PROMPT_MARKER_SIZE_2D
            symbol = cls.PROMPT_MARKER_SYMBOL
            opacity = cls.PROMPT_OPACITY
        else:
            size = (
                cls.DOCUMENT_MARKER_SIZE_3D
                if dimensions == "3d"
                else cls.DOCUMENT_MARKER_SIZE_2D
            )
            size = cls.DOCUMENT_MARKER_SIZE_3D if dimensions == '3d' else cls.DOCUMENT_MARKER_SIZE_2D
            symbol = cls.DOCUMENT_MARKER_SYMBOL
            opacity = cls.DOCUMENT_OPACITY

        return {"size": size, "symbol": symbol, "opacity": opacity}

        return {
            'size': size,
            'symbol': symbol,
            'opacity': opacity
        }
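As a quick illustration of how these settings drive plot styling, a minimal sketch (the import path is assumed from the package layout in this diff):

from embeddingbuddy.config.settings import AppSettings

doc_marker = AppSettings.get_plot_marker_config("3d")
# {'size': 5, 'symbol': 'circle', 'opacity': 1.0}
prompt_marker = AppSettings.get_plot_marker_config("2d", is_prompt=True)
# {'size': 10, 'symbol': 'diamond', 'opacity': 0.8}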
@@ -1,72 +1,39 @@
import json
import uuid
import base64
from typing import List
from ..models.schemas import Document
from typing import List, Union
from ..models.schemas import Document, ProcessedData


class NDJSONParser:

    @staticmethod
    def parse_upload_contents(contents: str) -> List[Document]:
        content_type, content_string = contents.split(",")
        content_type, content_string = contents.split(',')
        decoded = base64.b64decode(content_string)
        text_content = decoded.decode("utf-8")
        text_content = decoded.decode('utf-8')
        return NDJSONParser.parse_text(text_content)

    @staticmethod
    def parse_text(text_content: str) -> List[Document]:
        documents = []
        for line_num, line in enumerate(text_content.strip().split("\n"), 1):
        for line in text_content.strip().split('\n'):
            if line.strip():
                try:
                    doc_dict = json.loads(line)
                    doc = NDJSONParser._dict_to_document(doc_dict)
                    documents.append(doc)
                except json.JSONDecodeError as e:
                    raise json.JSONDecodeError(
                        f"Invalid JSON on line {line_num}: {e.msg}", e.doc, e.pos
                    )
                except KeyError as e:
                    raise KeyError(f"Missing required field {e} on line {line_num}")
                except (TypeError, ValueError) as e:
                    raise ValueError(
                        f"Invalid data format on line {line_num}: {str(e)}"
                    )
                doc_dict = json.loads(line)
                doc = NDJSONParser._dict_to_document(doc_dict)
                documents.append(doc)
        return documents

    @staticmethod
    def _dict_to_document(doc_dict: dict) -> Document:
        if "id" not in doc_dict:
            doc_dict["id"] = str(uuid.uuid4())

        # Validate required fields
        if "text" not in doc_dict:
            raise KeyError("'text'")
        if "embedding" not in doc_dict:
            raise KeyError("'embedding'")

        # Validate embedding format
        embedding = doc_dict["embedding"]
        if not isinstance(embedding, list):
            raise ValueError(
                f"Embedding must be a list, got {type(embedding).__name__}"
            )

        if not embedding:
            raise ValueError("Embedding cannot be empty")

        # Check that all embedding values are numbers
        for i, val in enumerate(embedding):
            if not isinstance(val, (int, float)) or val != val:  # NaN check
                raise ValueError(
                    f"Embedding contains invalid value at index {i}: {val}"
                )

        if 'id' not in doc_dict:
            doc_dict['id'] = str(uuid.uuid4())

        return Document(
            id=doc_dict["id"],
            text=doc_dict["text"],
            embedding=embedding,
            category=doc_dict.get("category"),
            subcategory=doc_dict.get("subcategory"),
            tags=doc_dict.get("tags"),
        )
            id=doc_dict['id'],
            text=doc_dict['text'],
            embedding=doc_dict['embedding'],
            category=doc_dict.get('category'),
            subcategory=doc_dict.get('subcategory'),
            tags=doc_dict.get('tags')
        )
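To make the expected input concrete, here is a minimal sketch of one NDJSON line and how the parser consumes it (field values are illustrative; the import path is assumed from this diff's layout):

from embeddingbuddy.data.parser import NDJSONParser

ndjson = '{"text": "hello world", "embedding": [0.1, 0.2, 0.3], "category": "greetings"}'
docs = NDJSONParser.parse_text(ndjson)
print(docs[0].id)         # auto-generated UUID, since "id" was omitted
print(docs[0].embedding)  # [0.1, 0.2, 0.3]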
@@ -1,24 +1,22 @@
import numpy as np
from typing import List, Optional, Tuple
from ..models.schemas import Document, ProcessedData
from ..models.field_mapper import FieldMapper
from .parser import NDJSONParser


class DataProcessor:

    def __init__(self):
        self.parser = NDJSONParser()

    def process_upload(
        self, contents: str, filename: Optional[str] = None
    ) -> ProcessedData:

    def process_upload(self, contents: str, filename: Optional[str] = None) -> ProcessedData:
        try:
            documents = self.parser.parse_upload_contents(contents)
            embeddings = self._extract_embeddings(documents)
            return ProcessedData(documents=documents, embeddings=embeddings)
        except Exception as e:
            return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))

    def process_text(self, text_content: str) -> ProcessedData:
        try:
            documents = self.parser.parse_text(text_content)
@@ -26,155 +24,31 @@ class DataProcessor:
            return ProcessedData(documents=documents, embeddings=embeddings)
        except Exception as e:
            return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))

    def process_opensearch_data(
        self, raw_documents: List[dict], field_mapping
    ) -> ProcessedData:
        """Process raw OpenSearch documents using field mapping."""
        try:
            # Transform documents using field mapping
            transformed_docs = FieldMapper.transform_documents(
                raw_documents, field_mapping
            )

            # Parse transformed documents
            documents = []
            for doc_dict in transformed_docs:
                try:
                    # Ensure required fields are present with defaults if needed
                    if "id" not in doc_dict or not doc_dict["id"]:
                        doc_dict["id"] = f"doc_{len(documents)}"

                    doc = Document(**doc_dict)
                    documents.append(doc)
                except Exception:
                    continue  # Skip invalid documents

            if not documents:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="No valid documents after transformation",
                )

            embeddings = self._extract_embeddings(documents)
            return ProcessedData(documents=documents, embeddings=embeddings)

        except Exception as e:
            return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))

    def process_client_embeddings(self, embeddings_data: dict) -> ProcessedData:
        """Process embeddings data received from client-side JavaScript."""
        try:
            if "error" in embeddings_data:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error=embeddings_data["error"],
                )

            # Extract documents and embeddings from client data
            documents_data = embeddings_data.get("documents", [])
            embeddings_list = embeddings_data.get("embeddings", [])

            if not documents_data or not embeddings_list:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="No documents or embeddings in client data",
                )

            if len(documents_data) != len(embeddings_list):
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="Mismatch between number of documents and embeddings",
                )

            # Convert embeddings to numpy array first
            try:
                embeddings = np.array(embeddings_list)

                if embeddings.ndim != 2:
                    return ProcessedData(
                        documents=[],
                        embeddings=np.array([]),
                        error="Invalid embedding dimensions",
                    )

            except Exception as e:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error=f"Error processing embeddings: {str(e)}",
                )

            # Convert to Document objects with embeddings
            documents = []
            for i, doc_data in enumerate(documents_data):
                try:
                    # Skip if we don't have a corresponding embedding
                    if i >= len(embeddings):
                        continue

                    # Ensure required fields are present
                    if "id" not in doc_data or not doc_data["id"]:
                        doc_data["id"] = f"text_input_{i}"
                    if "text" not in doc_data or not doc_data["text"].strip():
                        continue  # Skip documents without text

                    # Add the embedding to doc_data
                    doc_data["embedding"] = embeddings[i].tolist()

                    doc = Document(**doc_data)
                    documents.append(doc)
                except Exception:
                    # Skip invalid documents but continue processing
                    continue

            if not documents:
                return ProcessedData(
                    documents=[],
                    embeddings=np.array([]),
                    error="No valid documents found in client data",
                )

            # Only keep embeddings for valid documents
            valid_embeddings = embeddings[: len(documents)]

            return ProcessedData(documents=documents, embeddings=valid_embeddings)

        except Exception as e:
            return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))

    def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
        if not documents:
            return np.array([])
        return np.array([doc.embedding for doc in documents])

    def combine_data(
        self, doc_data: ProcessedData, prompt_data: Optional[ProcessedData] = None
    ) -> Tuple[np.ndarray, List[Document], Optional[List[Document]]]:

    def combine_data(self, doc_data: ProcessedData, prompt_data: Optional[ProcessedData] = None) -> Tuple[np.ndarray, List[Document], Optional[List[Document]]]:
        if not doc_data or doc_data.error:
            raise ValueError("Invalid document data")

        all_embeddings = doc_data.embeddings
        documents = doc_data.documents
        prompts = None

        if prompt_data and not prompt_data.error and prompt_data.documents:
            all_embeddings = np.vstack([doc_data.embeddings, prompt_data.embeddings])
            prompts = prompt_data.documents

        return all_embeddings, documents, prompts

    def split_reduced_data(
        self, reduced_embeddings: np.ndarray, n_documents: int, n_prompts: int = 0
    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:

    def split_reduced_data(self, reduced_embeddings: np.ndarray, n_documents: int, n_prompts: int = 0) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        doc_reduced = reduced_embeddings[:n_documents]
        prompt_reduced = None

        if n_prompts > 0:
            prompt_reduced = reduced_embeddings[n_documents : n_documents + n_prompts]

        return doc_reduced, prompt_reduced
            prompt_reduced = reduced_embeddings[n_documents:n_documents + n_prompts]

        return doc_reduced, prompt_reduced
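A quick round-trip sketch of combine_data and split_reduced_data (doc_list, prompt_list, and the reducer output are placeholders; shapes are illustrative):

import numpy as np

processor = DataProcessor()
docs = ProcessedData(documents=doc_list, embeddings=np.random.rand(10, 384))       # placeholder documents
prompts = ProcessedData(documents=prompt_list, embeddings=np.random.rand(2, 384))  # placeholder prompts

all_emb, documents, prompt_docs = processor.combine_data(docs, prompts)  # all_emb: (12, 384)
reduced = np.random.rand(12, 3)  # stand-in for a reducer's output
doc_xyz, prompt_xyz = processor.split_reduced_data(reduced, n_documents=10, n_prompts=2)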
@@ -1,189 +0,0 @@
from typing import Dict, List, Optional, Any, Tuple
import logging
from opensearchpy import OpenSearch
from opensearchpy.exceptions import OpenSearchException


logger = logging.getLogger(__name__)


class OpenSearchClient:
    def __init__(self):
        self.client: Optional[OpenSearch] = None
        self.connection_info: Optional[Dict[str, Any]] = None

    def connect(
        self,
        url: str,
        username: Optional[str] = None,
        password: Optional[str] = None,
        api_key: Optional[str] = None,
        verify_certs: bool = True,
    ) -> Tuple[bool, str]:
        """
        Connect to OpenSearch instance.

        Returns:
            Tuple of (success: bool, message: str)
        """
        try:
            # Parse URL to extract host and port
            if url.startswith("http://") or url.startswith("https://"):
                host = url
            else:
                host = f"https://{url}"

            # Build auth configuration
            auth_config = {}
            if username and password:
                auth_config["http_auth"] = (username, password)
            elif api_key:
                auth_config["api_key"] = api_key

            # Create client
            self.client = OpenSearch([host], verify_certs=verify_certs, **auth_config)

            # Test connection
            info = self.client.info()
            self.connection_info = {
                "url": host,
                "cluster_name": info.get("cluster_name", "Unknown"),
                "version": info.get("version", {}).get("number", "Unknown"),
            }

            return (
                True,
                f"Connected to {info.get('cluster_name', 'OpenSearch cluster')}",
            )

        except OpenSearchException as e:
            logger.error(f"OpenSearch connection error: {e}")
            return False, f"Connection failed: {str(e)}"
        except Exception as e:
            logger.error(f"Unexpected error connecting to OpenSearch: {e}")
            return False, f"Unexpected error: {str(e)}"

    def get_index_mapping(self, index_name: str) -> Tuple[bool, Optional[Dict], str]:
        """
        Get the mapping for a specific index.

        Returns:
            Tuple of (success: bool, mapping: Dict or None, message: str)
        """
        if not self.client:
            return False, None, "Not connected to OpenSearch"

        try:
            mapping = self.client.indices.get_mapping(index=index_name)
            return True, mapping, "Mapping retrieved successfully"
        except OpenSearchException as e:
            logger.error(f"Error getting mapping for index {index_name}: {e}")
            return False, None, f"Failed to get mapping: {str(e)}"

    def analyze_fields(self, index_name: str) -> Tuple[bool, Optional[Dict], str]:
        """
        Analyze index fields to detect potential embedding and text fields.

        Returns:
            Tuple of (success: bool, analysis: Dict or None, message: str)
        """
        success, mapping, message = self.get_index_mapping(index_name)
        if not success:
            return False, None, message

        try:
            # Extract field information from mapping
            index_mapping = mapping[index_name]["mappings"]["properties"]

            analysis = {
                "vector_fields": [],
                "text_fields": [],
                "keyword_fields": [],
                "numeric_fields": [],
                "all_fields": [],
            }

            for field_name, field_info in index_mapping.items():
                field_type = field_info.get("type", "unknown")
                analysis["all_fields"].append(field_name)

                if field_type == "dense_vector":
                    analysis["vector_fields"].append(
                        {
                            "name": field_name,
                            "dimension": field_info.get("dimension", "unknown"),
                        }
                    )
                elif field_type == "text":
                    analysis["text_fields"].append(field_name)
                elif field_type == "keyword":
                    analysis["keyword_fields"].append(field_name)
                elif field_type in ["integer", "long", "float", "double"]:
                    analysis["numeric_fields"].append(field_name)

            return True, analysis, "Field analysis completed"

        except Exception as e:
            logger.error(f"Error analyzing fields: {e}")
            return False, None, f"Field analysis failed: {str(e)}"

    def fetch_sample_data(
        self, index_name: str, size: int = 5
    ) -> Tuple[bool, List[Dict], str]:
        """
        Fetch sample documents from the index.

        Returns:
            Tuple of (success: bool, documents: List[Dict], message: str)
        """
        if not self.client:
            return False, [], "Not connected to OpenSearch"

        try:
            response = self.client.search(
                index=index_name, body={"query": {"match_all": {}}, "size": size}
            )

            documents = [hit["_source"] for hit in response["hits"]["hits"]]
            return True, documents, f"Retrieved {len(documents)} sample documents"

        except OpenSearchException as e:
            logger.error(f"Error fetching sample data: {e}")
            return False, [], f"Failed to fetch sample data: {str(e)}"

    def fetch_data(
        self, index_name: str, size: int = 100
    ) -> Tuple[bool, List[Dict], str]:
        """
        Fetch documents from the index.

        Returns:
            Tuple of (success: bool, documents: List[Dict], message: str)
        """
        if not self.client:
            return False, [], "Not connected to OpenSearch"

        try:
            response = self.client.search(
                index=index_name, body={"query": {"match_all": {}}, "size": size}
            )

            documents = [hit["_source"] for hit in response["hits"]["hits"]]
            total_hits = response["hits"]["total"]["value"]

            message = f"Retrieved {len(documents)} documents from {total_hits} total"
            return True, documents, message

        except OpenSearchException as e:
            logger.error(f"Error fetching data: {e}")
            return False, [], f"Failed to fetch data: {str(e)}"

    def disconnect(self):
        """Disconnect from OpenSearch."""
        if self.client:
            self.client = None
            self.connection_info = None

    def is_connected(self) -> bool:
        """Check if connected to OpenSearch."""
        return self.client is not None
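A minimal usage sketch for this client (URL, index name, and credentials are placeholders):

client = OpenSearchClient()
ok, msg = client.connect("https://localhost:9200", username="admin", password="admin")
if ok:
    ok, analysis, _ = client.analyze_fields("my-index")
    print(analysis["vector_fields"])  # e.g. [{'name': 'embedding', 'dimension': 384}]
    ok, docs, msg = client.fetch_data("my-index", size=100)
    client.disconnect()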
@@ -1,254 +0,0 @@
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
import logging


logger = logging.getLogger(__name__)


@dataclass
class FieldMapping:
    """Configuration for mapping OpenSearch fields to standard format."""

    embedding_field: str
    text_field: str
    id_field: Optional[str] = None
    category_field: Optional[str] = None
    subcategory_field: Optional[str] = None
    tags_field: Optional[str] = None


class FieldMapper:
    """Handles field mapping and data transformation from OpenSearch to standard format."""

    @staticmethod
    def suggest_mappings(field_analysis: Dict) -> Dict[str, List[str]]:
        """
        Suggest field mappings based on field analysis.

        Each dropdown will show ALL available fields, but ordered by relevance
        with the most likely candidates first.

        Args:
            field_analysis: Analysis results from OpenSearchClient.analyze_fields

        Returns:
            Dictionary with suggested fields for each mapping (ordered by relevance)
        """
        all_fields = field_analysis.get("all_fields", [])
        vector_fields = [vf["name"] for vf in field_analysis.get("vector_fields", [])]
        text_fields = field_analysis.get("text_fields", [])
        keyword_fields = field_analysis.get("keyword_fields", [])

        # Helper function to create ordered suggestions
        def create_ordered_suggestions(primary_candidates, all_available_fields):
            # Start with primary candidates, then add all other fields
            ordered = []
            # Add primary candidates first
            for field in primary_candidates:
                if field in all_available_fields and field not in ordered:
                    ordered.append(field)
            # Add remaining fields
            for field in all_available_fields:
                if field not in ordered:
                    ordered.append(field)
            return ordered

        suggestions = {}

        # Embedding field suggestions (vector fields first, then name-based candidates, then all fields)
        embedding_candidates = vector_fields.copy()
        # Add fields that likely contain embeddings based on name
        embedding_name_candidates = [
            f
            for f in all_fields
            if any(
                keyword in f.lower()
                for keyword in ["embedding", "embeddings", "vector", "vectors", "embed"]
            )
        ]
        # Add name-based candidates that aren't already in vector_fields
        for candidate in embedding_name_candidates:
            if candidate not in embedding_candidates:
                embedding_candidates.append(candidate)
        suggestions["embedding"] = create_ordered_suggestions(
            embedding_candidates, all_fields
        )

        # Text field suggestions (text fields first, then all fields)
        text_candidates = text_fields.copy()
        suggestions["text"] = create_ordered_suggestions(text_candidates, all_fields)

        # ID field suggestions (ID-like fields first, then all fields)
        id_candidates = [
            f
            for f in keyword_fields
            if any(keyword in f.lower() for keyword in ["id", "_id", "doc", "document"])
        ]
        id_candidates.append("_id")  # _id is always available
        suggestions["id"] = create_ordered_suggestions(id_candidates, all_fields)

        # Category field suggestions (category-like fields first, then all fields)
        category_candidates = [
            f
            for f in keyword_fields
            if any(
                keyword in f.lower()
                for keyword in ["category", "class", "type", "label"]
            )
        ]
        suggestions["category"] = create_ordered_suggestions(
            category_candidates, all_fields
        )

        # Subcategory field suggestions (subcategory-like fields first, then all fields)
        subcategory_candidates = [
            f
            for f in keyword_fields
            if any(
                keyword in f.lower()
                for keyword in ["subcategory", "subclass", "subtype", "subtopic"]
            )
        ]
        suggestions["subcategory"] = create_ordered_suggestions(
            subcategory_candidates, all_fields
        )

        # Tags field suggestions (tag-like fields first, then all fields)
        tags_candidates = [
            f
            for f in keyword_fields
            if any(
                keyword in f.lower()
                for keyword in ["tag", "tags", "keyword", "keywords"]
            )
        ]
        suggestions["tags"] = create_ordered_suggestions(tags_candidates, all_fields)

        return suggestions

    @staticmethod
    def validate_mapping(
        mapping: FieldMapping, available_fields: List[str]
    ) -> List[str]:
        """
        Validate that the field mapping is correct.

        Returns:
            List of validation errors (empty if valid)
        """
        errors = []

        # Required fields validation
        if not mapping.embedding_field:
            errors.append("Embedding field is required")
        elif mapping.embedding_field not in available_fields:
            errors.append(
                f"Embedding field '{mapping.embedding_field}' not found in index"
            )

        if not mapping.text_field:
            errors.append("Text field is required")
        elif mapping.text_field not in available_fields:
            errors.append(f"Text field '{mapping.text_field}' not found in index")

        # Optional fields validation
        optional_fields = {
            "id_field": mapping.id_field,
            "category_field": mapping.category_field,
            "subcategory_field": mapping.subcategory_field,
            "tags_field": mapping.tags_field,
        }

        for field_name, field_value in optional_fields.items():
            if field_value and field_value not in available_fields:
                errors.append(
                    f"Field '{field_value}' for {field_name} not found in index"
                )

        return errors

    @staticmethod
    def transform_documents(
        documents: List[Dict[str, Any]], mapping: FieldMapping
    ) -> List[Dict[str, Any]]:
        """
        Transform OpenSearch documents to standard format using field mapping.

        Args:
            documents: Raw documents from OpenSearch
            mapping: Field mapping configuration

        Returns:
            List of transformed documents in standard format
        """
        transformed = []

        for doc in documents:
            try:
                # Build standard format document
                standard_doc = {}

                # Required fields
                if mapping.embedding_field in doc:
                    standard_doc["embedding"] = doc[mapping.embedding_field]
                else:
                    logger.warning(
                        f"Missing embedding field '{mapping.embedding_field}' in document"
                    )
                    continue

                if mapping.text_field in doc:
                    standard_doc["text"] = str(doc[mapping.text_field])
                else:
                    logger.warning(
                        f"Missing text field '{mapping.text_field}' in document"
                    )
                    continue

                # Optional fields
                if mapping.id_field and mapping.id_field in doc:
                    standard_doc["id"] = str(doc[mapping.id_field])

                if mapping.category_field and mapping.category_field in doc:
                    standard_doc["category"] = str(doc[mapping.category_field])

                if mapping.subcategory_field and mapping.subcategory_field in doc:
                    standard_doc["subcategory"] = str(doc[mapping.subcategory_field])

                if mapping.tags_field and mapping.tags_field in doc:
                    tags = doc[mapping.tags_field]
                    # Handle both string and list tags
                    if isinstance(tags, list):
                        standard_doc["tags"] = [str(tag) for tag in tags]
                    else:
                        standard_doc["tags"] = [str(tags)]

                transformed.append(standard_doc)

            except Exception as e:
                logger.error(f"Error transforming document: {e}")
                continue

        logger.info(f"Transformed {len(transformed)} documents out of {len(documents)}")
        return transformed

    @staticmethod
    def create_mapping_from_dict(mapping_dict: Dict[str, str]) -> FieldMapping:
        """
        Create a FieldMapping from a dictionary.

        Args:
            mapping_dict: Dictionary with field mappings

        Returns:
            FieldMapping instance
        """
        return FieldMapping(
            embedding_field=mapping_dict.get("embedding", ""),
            text_field=mapping_dict.get("text", ""),
            id_field=mapping_dict.get("id") or None,
            category_field=mapping_dict.get("category") or None,
            subcategory_field=mapping_dict.get("subcategory") or None,
            tags_field=mapping_dict.get("tags") or None,
        )
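Putting the mapper together, a small sketch (field names are invented for the example):

mapping = FieldMapper.create_mapping_from_dict(
    {"embedding": "doc_vector", "text": "body", "category": "topic"}
)
errors = FieldMapper.validate_mapping(mapping, available_fields=["doc_vector", "body", "topic"])
assert not errors

raw = [{"doc_vector": [0.1, 0.2], "body": "some text", "topic": "news"}]
standard = FieldMapper.transform_documents(raw, mapping)
# [{'embedding': [0.1, 0.2], 'text': 'some text', 'category': 'news'}]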
@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod
import numpy as np
from typing import Optional, Tuple
from sklearn.decomposition import PCA
import umap
from openTSNE import TSNE
@@ -7,89 +8,88 @@ from .schemas import ReducedData
class DimensionalityReducer(ABC):

    def __init__(self, n_components: int = 3, random_state: int = 42):
        self.n_components = n_components
        self.random_state = random_state
        self._reducer = None

    @abstractmethod
    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        pass

    @abstractmethod
    def get_method_name(self) -> str:
        pass


class PCAReducer(DimensionalityReducer):

    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        self._reducer = PCA(n_components=self.n_components)
        reduced = self._reducer.fit_transform(embeddings)
        variance_explained = self._reducer.explained_variance_ratio_

        return ReducedData(
            reduced_embeddings=reduced,
            variance_explained=variance_explained,
            method=self.get_method_name(),
            n_components=self.n_components,
            n_components=self.n_components
        )

    def get_method_name(self) -> str:
        return "PCA"


class TSNEReducer(DimensionalityReducer):

    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        self._reducer = TSNE(
            n_components=self.n_components, random_state=self.random_state
        )
        self._reducer = TSNE(n_components=self.n_components, random_state=self.random_state)
        reduced = self._reducer.fit(embeddings)

        return ReducedData(
            reduced_embeddings=reduced,
            variance_explained=None,
            method=self.get_method_name(),
            n_components=self.n_components,
            n_components=self.n_components
        )

    def get_method_name(self) -> str:
        return "t-SNE"


class UMAPReducer(DimensionalityReducer):

    def fit_transform(self, embeddings: np.ndarray) -> ReducedData:
        self._reducer = umap.UMAP(
            n_components=self.n_components, random_state=self.random_state
        )
        self._reducer = umap.UMAP(n_components=self.n_components, random_state=self.random_state)
        reduced = self._reducer.fit_transform(embeddings)

        return ReducedData(
            reduced_embeddings=reduced,
            variance_explained=None,
            method=self.get_method_name(),
            n_components=self.n_components,
            n_components=self.n_components
        )

    def get_method_name(self) -> str:
        return "UMAP"


class ReducerFactory:

    @staticmethod
    def create_reducer(
        method: str, n_components: int = 3, random_state: int = 42
    ) -> DimensionalityReducer:
    def create_reducer(method: str, n_components: int = 3, random_state: int = 42) -> DimensionalityReducer:
        method_lower = method.lower()

        if method_lower == "pca":

        if method_lower == 'pca':
            return PCAReducer(n_components=n_components, random_state=random_state)
        elif method_lower == "tsne":
        elif method_lower == 'tsne':
            return TSNEReducer(n_components=n_components, random_state=random_state)
        elif method_lower == "umap":
        elif method_lower == 'umap':
            return UMAPReducer(n_components=n_components, random_state=random_state)
        else:
            raise ValueError(f"Unknown reduction method: {method}")

    @staticmethod
    def get_available_methods() -> list:
        return ["pca", "tsne", "umap"]
        return ['pca', 'tsne', 'umap']
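Callers go through the factory; a minimal sketch with random data:

import numpy as np

reducer = ReducerFactory.create_reducer("pca", n_components=3)
result = reducer.fit_transform(np.random.rand(100, 384))
print(result.method, result.reduced_embeddings.shape)  # PCA (100, 3)
print(result.variance_explained)  # per-component ratios; None for t-SNE/UMAP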
@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import List, Optional, Any, Dict
from dataclasses import dataclass
import numpy as np
@@ -50,11 +50,9 @@ class PlotData:
    coordinates: np.ndarray
    prompts: Optional[List[Document]] = None
    prompt_coordinates: Optional[np.ndarray] = None

    def __post_init__(self):
        if not isinstance(self.coordinates, np.ndarray):
            self.coordinates = np.array(self.coordinates)
        if self.prompt_coordinates is not None and not isinstance(
            self.prompt_coordinates, np.ndarray
        ):
            self.prompt_coordinates = np.array(self.prompt_coordinates)
        if self.prompt_coordinates is not None and not isinstance(self.prompt_coordinates, np.ndarray):
            self.prompt_coordinates = np.array(self.prompt_coordinates)
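Because __post_init__ coerces arrays, PlotData accepts plain lists for its coordinate fields. A tiny sketch, assuming the dataclass's other fields (such as documents) from earlier in the file, which this hunk does not show:

pd = PlotData(documents=docs, coordinates=[[0.1, 0.2, 0.3]])  # docs and the documents field are assumptions
assert isinstance(pd.coordinates, np.ndarray)  # lists are coerced on construction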
@@ -1,756 +1,61 @@
from dash import callback, Input, Output, State, no_update, html
import numpy as np
from dash import callback, Input, Output, State
from ...data.processor import DataProcessor
from ...data.sources.opensearch import OpenSearchClient
from ...models.field_mapper import FieldMapper
from ...config.settings import AppSettings


class DataProcessingCallbacks:

    def __init__(self):
        self.processor = DataProcessor()
        self.opensearch_client_data = OpenSearchClient()  # For data/documents
        self.opensearch_client_prompts = OpenSearchClient()  # For prompts
        self._register_callbacks()

    def _register_callbacks(self):

        @callback(
            [
                Output("processed-data", "data", allow_duplicate=True),
                Output("upload-error-alert", "children", allow_duplicate=True),
                Output("upload-error-alert", "is_open", allow_duplicate=True),
            ],
            Input("upload-data", "contents"),
            State("upload-data", "filename"),
            prevent_initial_call=True,
            Output('processed-data', 'data'),
            Input('upload-data', 'contents'),
            State('upload-data', 'filename')
        )
        def process_uploaded_file(contents, filename):
            if contents is None:
                return None, "", False

                return None

            processed_data = self.processor.process_upload(contents, filename)

            if processed_data.error:
                error_message = self._format_error_message(
                    processed_data.error, filename
                )
                return (
                    {"error": processed_data.error},
                    error_message,
                    True,  # Show error alert
                )

            return (
                {
                    "documents": [
                        self._document_to_dict(doc) for doc in processed_data.documents
                    ],
                    "embeddings": processed_data.embeddings.tolist(),
                },
                "",
                False,  # Hide error alert
            )

                return {'error': processed_data.error}

            return {
                'documents': [self._document_to_dict(doc) for doc in processed_data.documents],
                'embeddings': processed_data.embeddings.tolist()
            }

        @callback(
            Output("processed-prompts", "data", allow_duplicate=True),
            Input("upload-prompts", "contents"),
            State("upload-prompts", "filename"),
            prevent_initial_call=True,
            Output('processed-prompts', 'data'),
            Input('upload-prompts', 'contents'),
            State('upload-prompts', 'filename')
        )
        def process_uploaded_prompts(contents, filename):
            if contents is None:
                return None

            processed_data = self.processor.process_upload(contents, filename)

            if processed_data.error:
                return {"error": processed_data.error}

                return {'error': processed_data.error}

            return {
                "prompts": [
                    self._document_to_dict(doc) for doc in processed_data.documents
                ],
                "embeddings": processed_data.embeddings.tolist(),
                'prompts': [self._document_to_dict(doc) for doc in processed_data.documents],
                'embeddings': processed_data.embeddings.tolist()
            }

        # OpenSearch callbacks
        @callback(
            [
                Output("tab-content", "children"),
            ],
            [Input("data-source-tabs", "active_tab")],
            prevent_initial_call=False,
        )
        def render_tab_content(active_tab):
            from ...ui.components.datasource import DataSourceComponent
            from ...config.settings import AppSettings

            datasource = DataSourceComponent()

            if active_tab == "opensearch-tab" and AppSettings.OPENSEARCH_ENABLED:
                return [datasource.create_opensearch_tab()]
            elif active_tab == "text-input-tab":
                return [datasource.create_text_input_tab()]
            else:
                return [datasource.create_file_upload_tab()]

        # Register callbacks for both data and prompts sections (only if OpenSearch is enabled)
        if AppSettings.OPENSEARCH_ENABLED:
            self._register_opensearch_callbacks("data", self.opensearch_client_data)
            self._register_opensearch_callbacks(
                "prompts", self.opensearch_client_prompts
            )

        # Register collapsible section callbacks
        self._register_collapse_callbacks()

        # Register text input callbacks
        self._register_text_input_callbacks()

    def _register_opensearch_callbacks(self, section_type, opensearch_client):
        """Register callbacks for a specific section (data or prompts)."""

        @callback(
            Output(f"{section_type}-auth-collapse", "is_open"),
            [Input(f"{section_type}-auth-toggle", "n_clicks")],
            [State(f"{section_type}-auth-collapse", "is_open")],
            prevent_initial_call=True,
        )
        def toggle_auth(n_clicks, is_open):
            if n_clicks:
                return not is_open
            return is_open

        @callback(
            Output(f"{section_type}-auth-toggle", "children"),
            [Input(f"{section_type}-auth-collapse", "is_open")],
            prevent_initial_call=False,
        )
        def update_auth_button_text(is_open):
            return "Hide Authentication" if is_open else "Show Authentication"

        @callback(
            [
                Output(f"{section_type}-connection-status", "children"),
                Output(f"{section_type}-field-mapping-section", "children"),
                Output(f"{section_type}-field-mapping-section", "style"),
                Output(f"{section_type}-load-data-section", "style"),
                Output(f"{section_type}-load-opensearch-data-btn", "disabled"),
                Output(f"{section_type}-embedding-field-dropdown", "options"),
                Output(f"{section_type}-text-field-dropdown", "options"),
                Output(f"{section_type}-id-field-dropdown", "options"),
                Output(f"{section_type}-category-field-dropdown", "options"),
                Output(f"{section_type}-subcategory-field-dropdown", "options"),
                Output(f"{section_type}-tags-field-dropdown", "options"),
            ],
            [Input(f"{section_type}-test-connection-btn", "n_clicks")],
            [
                State(f"{section_type}-opensearch-url", "value"),
                State(f"{section_type}-opensearch-index", "value"),
                State(f"{section_type}-opensearch-username", "value"),
                State(f"{section_type}-opensearch-password", "value"),
                State(f"{section_type}-opensearch-api-key", "value"),
            ],
            prevent_initial_call=True,
        )
        def test_opensearch_connection(
            n_clicks, url, index_name, username, password, api_key
        ):
            if not n_clicks or not url or not index_name:
                return (
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                    no_update,
                )

            # Test connection
            success, message = opensearch_client.connect(
                url=url,
                username=username,
                password=password,
                api_key=api_key,
                verify_certs=AppSettings.OPENSEARCH_VERIFY_CERTS,
            )

            if not success:
                return (
                    self._create_status_alert(f"❌ {message}", "danger"),
                    [],
                    {"display": "none"},
                    {"display": "none"},
                    True,
                    [],  # empty options for hidden dropdowns
                    [],
                    [],
                    [],
                    [],
                    [],
                )

            # Analyze fields
            success, field_analysis, analysis_message = (
                opensearch_client.analyze_fields(index_name)
            )

            if not success:
                return (
                    self._create_status_alert(f"❌ {analysis_message}", "danger"),
                    [],
                    {"display": "none"},
                    {"display": "none"},
                    True,
                    [],  # empty options for hidden dropdowns
                    [],
                    [],
                    [],
                    [],
                    [],
                )

            # Generate field suggestions
            field_suggestions = FieldMapper.suggest_mappings(field_analysis)

            from ...ui.components.datasource import DataSourceComponent

            datasource = DataSourceComponent()
            field_mapping_ui = datasource.create_field_mapping_interface(
                field_suggestions, section_type
            )

            return (
                self._create_status_alert(f"✅ {message}", "success"),
                field_mapping_ui,
                {"display": "block"},
                {"display": "block"},
                False,
                [
                    {"label": field, "value": field}
                    for field in field_suggestions.get("embedding", [])
                ],
                [
                    {"label": field, "value": field}
                    for field in field_suggestions.get("text", [])
                ],
                [
                    {"label": field, "value": field}
                    for field in field_suggestions.get("id", [])
                ],
                [
                    {"label": field, "value": field}
                    for field in field_suggestions.get("category", [])
                ],
                [
                    {"label": field, "value": field}
                    for field in field_suggestions.get("subcategory", [])
                ],
                [
                    {"label": field, "value": field}
                    for field in field_suggestions.get("tags", [])
                ],
            )

        # Determine output target based on section type
        output_target = (
            "processed-data" if section_type == "data" else "processed-prompts"
        )

        @callback(
            [
                Output(output_target, "data", allow_duplicate=True),
                Output("opensearch-success-alert", "children", allow_duplicate=True),
                Output("opensearch-success-alert", "is_open", allow_duplicate=True),
                Output("opensearch-error-alert", "children", allow_duplicate=True),
                Output("opensearch-error-alert", "is_open", allow_duplicate=True),
            ],
            [Input(f"{section_type}-load-opensearch-data-btn", "n_clicks")],
            [
                State(f"{section_type}-opensearch-index", "value"),
                State(f"{section_type}-opensearch-query-size", "value"),
                State(f"{section_type}-embedding-field-dropdown-ui", "value"),
                State(f"{section_type}-text-field-dropdown-ui", "value"),
                State(f"{section_type}-id-field-dropdown-ui", "value"),
                State(f"{section_type}-category-field-dropdown-ui", "value"),
                State(f"{section_type}-subcategory-field-dropdown-ui", "value"),
                State(f"{section_type}-tags-field-dropdown-ui", "value"),
            ],
            prevent_initial_call=True,
        )
        def load_opensearch_data(
            n_clicks,
            index_name,
            query_size,
            embedding_field,
            text_field,
            id_field,
            category_field,
            subcategory_field,
            tags_field,
        ):
            if not n_clicks or not index_name or not embedding_field or not text_field:
                return no_update, no_update, no_update, no_update, no_update

            try:
                # Validate and set query size
                if not query_size or query_size < 1:
                    query_size = AppSettings.OPENSEARCH_DEFAULT_SIZE
                elif query_size > 1000:
                    query_size = 1000  # Cap at reasonable maximum

                # Create field mapping
                field_mapping = FieldMapper.create_mapping_from_dict(
                    {
                        "embedding": embedding_field,
                        "text": text_field,
                        "id": id_field,
                        "category": category_field,
                        "subcategory": subcategory_field,
                        "tags": tags_field,
                    }
                )

                # Fetch data from OpenSearch
                success, raw_documents, message = opensearch_client.fetch_data(
                    index_name, size=query_size
                )

                if not success:
                    return (
                        no_update,
                        "",
                        False,
                        f"❌ Failed to fetch {section_type}: {message}",
                        True,
                    )

                # Process the data
                processed_data = self.processor.process_opensearch_data(
                    raw_documents, field_mapping
                )

                if processed_data.error:
                    return (
                        {"error": processed_data.error},
                        "",
                        False,
                        f"❌ {section_type.title()} processing error: {processed_data.error}",
                        True,
                    )

                success_message = f"✅ Successfully loaded {len(processed_data.documents)} {section_type} from OpenSearch"

                # Format for appropriate target (data vs prompts)
                if section_type == "data":
                    return (
                        {
                            "documents": [
                                self._document_to_dict(doc)
                                for doc in processed_data.documents
                            ],
                            "embeddings": processed_data.embeddings.tolist(),
                        },
                        success_message,
                        True,
                        "",
                        False,
                    )
                else:  # prompts
                    return (
                        {
                            "prompts": [
                                self._document_to_dict(doc)
                                for doc in processed_data.documents
                            ],
                            "embeddings": processed_data.embeddings.tolist(),
                        },
                        success_message,
                        True,
                        "",
                        False,
                    )

            except Exception as e:
                return (no_update, "", False, f"❌ Unexpected error: {str(e)}", True)

        # Sync callbacks to update hidden dropdowns from UI dropdowns
        @callback(
            Output(f"{section_type}-embedding-field-dropdown", "value"),
            Input(f"{section_type}-embedding-field-dropdown-ui", "value"),
            prevent_initial_call=True,
        )
        def sync_embedding_dropdown(value):
            return value

        @callback(
            Output(f"{section_type}-text-field-dropdown", "value"),
            Input(f"{section_type}-text-field-dropdown-ui", "value"),
            prevent_initial_call=True,
        )
        def sync_text_dropdown(value):
            return value

        @callback(
            Output(f"{section_type}-id-field-dropdown", "value"),
            Input(f"{section_type}-id-field-dropdown-ui", "value"),
            prevent_initial_call=True,
        )
        def sync_id_dropdown(value):
            return value

        @callback(
            Output(f"{section_type}-category-field-dropdown", "value"),
            Input(f"{section_type}-category-field-dropdown-ui", "value"),
            prevent_initial_call=True,
        )
        def sync_category_dropdown(value):
            return value

        @callback(
            Output(f"{section_type}-subcategory-field-dropdown", "value"),
            Input(f"{section_type}-subcategory-field-dropdown-ui", "value"),
            prevent_initial_call=True,
        )
        def sync_subcategory_dropdown(value):
            return value

        @callback(
            Output(f"{section_type}-tags-field-dropdown", "value"),
            Input(f"{section_type}-tags-field-dropdown-ui", "value"),
            prevent_initial_call=True,
        )
        def sync_tags_dropdown(value):
            return value

    def _register_collapse_callbacks(self):
        """Register callbacks for collapsible sections."""

        # Data section collapse callback
        @callback(
            [
                Output("data-collapse", "is_open"),
                Output("data-collapse-icon", "className"),
            ],
            [Input("data-collapse-toggle", "n_clicks")],
            [State("data-collapse", "is_open")],
            prevent_initial_call=True,
        )
        def toggle_data_collapse(n_clicks, is_open):
            if n_clicks:
                new_state = not is_open
                icon_class = (
                    "fas fa-chevron-down me-2"
                    if new_state
                    else "fas fa-chevron-right me-2"
                )
                return new_state, icon_class
            return is_open, "fas fa-chevron-down me-2"

        # Prompts section collapse callback
        @callback(
            [
                Output("prompts-collapse", "is_open"),
                Output("prompts-collapse-icon", "className"),
            ],
            [Input("prompts-collapse-toggle", "n_clicks")],
            [State("prompts-collapse", "is_open")],
            prevent_initial_call=True,
        )
        def toggle_prompts_collapse(n_clicks, is_open):
            if n_clicks:
                new_state = not is_open
                icon_class = (
                    "fas fa-chevron-down me-2"
                    if new_state
                    else "fas fa-chevron-right me-2"
                )
                return new_state, icon_class
            return is_open, "fas fa-chevron-down me-2"

    def _register_text_input_callbacks(self):
        """Register callbacks for text input functionality."""

        # Text length counter callback
        @callback(
            Output("text-length-counter", "children"),
            Input("text-input-area", "value"),
            prevent_initial_call=False,
        )
        def update_text_length_counter(text_value):
            if not text_value:
                return "0"
            return f"{len(text_value):,}"

        # Generate button enable/disable callback
        @callback(
            [
                Output("generate-embeddings-btn", "disabled"),
                Output("generation-help", "children"),
                Output("generation-help", "color"),
            ],
            [
                Input("text-input-area", "value"),
                Input("model-selection", "value"),
            ],
            prevent_initial_call=False,
        )
        def toggle_generate_button(text_value, model_name):
            import dash_bootstrap_components as dbc

            if not text_value or not text_value.strip():
                return (
                    True,
                    dbc.Alert(
                        [
                            html.I(className="fas fa-info-circle me-2"),
                            "Enter some text above to enable embedding generation.",
                        ],
                        color="light",
                    ),
                    "light",
                )

            if not model_name:
                return (
                    True,
                    dbc.Alert(
                        [
                            html.I(className="fas fa-exclamation-triangle me-2"),
                            "Select an embedding model to continue.",
                        ],
                        color="warning",
                    ),
                    "warning",
                )

            text_length = len(text_value.strip())
            if text_length > AppSettings.MAX_TEXT_LENGTH:
                return (
                    True,
                    dbc.Alert(
                        [
                            html.I(className="fas fa-exclamation-triangle me-2"),
                            f"Text too long ({text_length:,} characters). Maximum allowed: {AppSettings.MAX_TEXT_LENGTH:,} characters.",
                        ],
                        color="danger",
                    ),
                    "danger",
                )

            return (
                False,
                dbc.Alert(
                    [
                        html.I(className="fas fa-check-circle me-2"),
                        f"Ready to generate embeddings for {text_length:,} characters using {model_name}.",
                    ],
                    color="success",
                ),
                "success",
            )

        # Clear text callback
        @callback(
            Output("text-input-area", "value"),
            [Input("clear-text-btn", "n_clicks"), Input("load-sample-btn", "n_clicks")],
            prevent_initial_call=True,
        )
        def handle_text_input_actions(clear_clicks, load_clicks):
|
||||
from dash import ctx
|
||||
|
||||
if not ctx.triggered:
|
||||
return no_update
|
||||
|
||||
button_id = ctx.triggered[0]["prop_id"].split(".")[0]
|
||||
|
||||
if button_id == "clear-text-btn" and clear_clicks:
|
||||
return ""
|
||||
elif button_id == "load-sample-btn" and load_clicks:
|
||||
return self._load_sample_text()
|
||||
|
||||
return no_update
|
||||
|
||||
# Model info callback
|
||||
@callback(
|
||||
Output("model-info", "children"),
|
||||
Input("model-selection", "value"),
|
||||
prevent_initial_call=False,
|
||||
)
|
||||
def update_model_info(model_name):
|
||||
if not model_name:
|
||||
return html.Span("Please select a model", className="text-muted")
|
||||
|
||||
from ...config.settings import AppSettings
|
||||
|
||||
settings = AppSettings()
|
||||
|
||||
for model in settings.AVAILABLE_MODELS:
|
||||
if model["name"] == model_name:
|
||||
return html.Div(
|
||||
[
|
||||
html.Strong(
|
||||
f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}"
|
||||
),
|
||||
html.Br(),
|
||||
html.Span(model["description"]),
|
||||
html.Br(),
|
||||
html.Small(
|
||||
f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}",
|
||||
className="text-muted",
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
return html.Span("Model information not available", className="text-muted")
|
||||
|
||||
# Process client-side embeddings result callback
|
||||
@callback(
|
||||
[
|
||||
Output("processed-data", "data", allow_duplicate=True),
|
||||
Output("text-input-status", "children"),
|
||||
Output("text-input-status", "color"),
|
||||
Output("text-input-status", "style"),
|
||||
Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
|
||||
],
|
||||
[Input("embeddings-generated-trigger", "data")],
|
||||
prevent_initial_call=True,
|
||||
)
|
||||
def process_embeddings_result(embeddings_data):
|
||||
"""Process embeddings generated client-side."""
|
||||
if not embeddings_data:
|
||||
return no_update, no_update, no_update, no_update, no_update
|
||||
|
||||
# Check if this is a request trigger (contains textContent) vs actual embeddings data
|
||||
if isinstance(embeddings_data, dict) and "textContent" in embeddings_data:
|
||||
# This is a processing request trigger, not the actual results
|
||||
# The JavaScript will handle the async processing and update the UI directly
|
||||
return no_update, no_update, no_update, no_update, no_update
|
||||
|
||||
processed_data = self.processor.process_client_embeddings(embeddings_data)
|
||||
|
||||
if processed_data.error:
|
||||
return (
|
||||
{"error": processed_data.error},
|
||||
f"❌ Error: {processed_data.error}",
|
||||
"danger",
|
||||
{"display": "block"},
|
||||
False,
|
||||
)
|
||||
|
||||
return (
|
||||
{
|
||||
"documents": [
|
||||
self._document_to_dict(doc) for doc in processed_data.documents
|
||||
],
|
||||
"embeddings": processed_data.embeddings.tolist(),
|
||||
},
|
||||
f"✅ Generated embeddings for {len(processed_data.documents)} text chunks",
|
||||
"success",
|
||||
{"display": "block"},
|
||||
False,
|
||||
)
|
||||
|
||||
def _load_sample_text(self):
|
||||
"""Load sample text from assets/sample-txt.md file."""
|
||||
import os
|
||||
|
||||
try:
|
||||
# Get the project root directory (four levels up from this file)
|
||||
current_file = os.path.abspath(__file__)
|
||||
project_root = os.path.dirname(
|
||||
os.path.dirname(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(current_file)))
|
||||
)
|
||||
)
|
||||
sample_file_path = os.path.join(project_root, "assets", "sample-txt.md")
|
||||
|
||||
if os.path.exists(sample_file_path):
|
||||
with open(sample_file_path, "r", encoding="utf-8") as file:
|
||||
return file.read()
|
||||
else:
|
||||
# Fallback sample text if file doesn't exist
|
||||
return """The sun peeked through the clouds after a drizzly morning.
|
||||
A gentle breeze rustled the leaves as we walked along the shoreline.
|
||||
Heavy rains caused flooding in several low-lying neighborhoods.
|
||||
It was so hot that even the birds sought shade under the palm trees.
|
||||
By midnight, the temperature had dropped below freezing.
|
||||
|
||||
The new smartphone features a foldable display and 5G connectivity.
|
||||
In the world of AI, transformers have revolutionized natural language processing.
|
||||
Quantum computing promises to solve problems beyond classical computers' reach.
|
||||
Blockchain technology is being explored for secure voting systems.
|
||||
Virtual reality headsets are becoming more affordable and accessible.
|
||||
|
||||
Preheat the oven to 375°F before you start mixing the batter.
|
||||
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
|
||||
A pinch of saffron adds a beautiful color and aroma to traditional paella.
|
||||
If the soup is too salty, add a peeled potato to absorb excess sodium.
|
||||
Let the bread dough rise for at least an hour in a warm, draft-free spot."""
|
||||
|
||||
except Exception:
|
||||
# Return a simple fallback if there's any error
|
||||
return "This is sample text for testing embedding generation. You can replace this with your own text."
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _document_to_dict(doc):
|
||||
return {
|
||||
"id": doc.id,
|
||||
"text": doc.text,
|
||||
"embedding": doc.embedding,
|
||||
"category": doc.category,
|
||||
"subcategory": doc.subcategory,
|
||||
"tags": doc.tags,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _format_error_message(error: str, filename: str | None = None) -> str:
|
||||
"""Format error message with helpful guidance for users."""
|
||||
file_part = f" in file '{filename}'" if filename else ""
|
||||
|
||||
# Check for common error patterns and provide helpful messages
|
||||
if "embedding" in error.lower() and (
|
||||
"key" in error.lower() or "required field" in error.lower()
|
||||
):
|
||||
return (
|
||||
f"❌ Missing 'embedding' field{file_part}. "
|
||||
"Each line must contain an 'embedding' field with a list of numbers."
|
||||
)
|
||||
elif "text" in error.lower() and (
|
||||
"key" in error.lower() or "required field" in error.lower()
|
||||
):
|
||||
return (
|
||||
f"❌ Missing 'text' field{file_part}. "
|
||||
"Each line must contain a 'text' field with the document content."
|
||||
)
|
||||
elif "json" in error.lower() and "decode" in error.lower():
|
||||
return (
|
||||
f"❌ Invalid JSON format{file_part}. "
|
||||
"Please check that each line is valid JSON with proper syntax (quotes, braces, etc.)."
|
||||
)
|
||||
elif "unicode" in error.lower() or "decode" in error.lower():
|
||||
return (
|
||||
f"❌ File encoding issue{file_part}. "
|
||||
"Please ensure the file is saved in UTF-8 format and contains no binary data."
|
||||
)
|
||||
elif "array" in error.lower() or "list" in error.lower():
|
||||
return (
|
||||
f"❌ Invalid embedding format{file_part}. "
|
||||
"Embeddings must be arrays/lists of numbers, not strings or other types."
|
||||
)
|
||||
else:
|
||||
return (
|
||||
f"❌ Error processing file{file_part}: {error}. "
|
||||
"Please check that your file is valid NDJSON with required 'text' and 'embedding' fields."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _create_status_alert(message: str, color: str):
|
||||
"""Create a status alert component."""
|
||||
import dash_bootstrap_components as dbc
|
||||
|
||||
return dbc.Alert(message, color=color, className="mb-2")
|
||||
'id': doc.id,
|
||||
'text': doc.text,
|
||||
'embedding': doc.embedding,
|
||||
'category': doc.category,
|
||||
'subcategory': doc.subcategory,
|
||||
'tags': doc.tags
|
||||
}
|
||||
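Note: the six per-field sync callbacks above are structurally identical. One possible consolidation (not what this commit does) is Dash's pattern-matching `MATCH` ids; the sketch below assumes the dropdowns were given dict-style ids, which the current code does not use:

```python
from dash import callback, Input, Output, MATCH

# Hypothetical dict-style ids, e.g.
# dcc.Dropdown(id={"kind": "field-dropdown-ui", "section": "data", "field": "tags"})
@callback(
    Output({"kind": "field-dropdown", "section": MATCH, "field": MATCH}, "value"),
    Input({"kind": "field-dropdown-ui", "section": MATCH, "field": MATCH}, "value"),
    prevent_initial_call=True,
)
def sync_field_dropdown(value):
    # One registration would replace all six per-field sync callbacks.
    return value
```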
@@ -1,32 +1,66 @@
import dash
from dash import callback, Input, Output
from dash import callback, Input, Output, State, html
import dash_bootstrap_components as dbc


class InteractionCallbacks:

    def __init__(self):
        self._register_callbacks()

    def _register_callbacks(self):

        @callback(
            Output("about-modal", "is_open"),
            [Input("about-button", "n_clicks"), Input("about-modal-close", "n_clicks")],
            prevent_initial_call=True,
            Output('point-details', 'children'),
            Input('embedding-plot', 'clickData'),
            [State('processed-data', 'data'),
             State('processed-prompts', 'data')]
        )
        def toggle_about_modal(about_clicks, close_clicks):
            if about_clicks or close_clicks:
                return True if about_clicks else False
            return False

        def display_click_data(clickData, data, prompts_data):
            if not clickData or not data:
                return "Click on a point to see details"

            point_data = clickData['points'][0]
            trace_name = point_data.get('fullData', {}).get('name', 'Documents')

            if 'pointIndex' in point_data:
                point_index = point_data['pointIndex']
            elif 'pointNumber' in point_data:
                point_index = point_data['pointNumber']
            else:
                return "Could not identify clicked point"

            if trace_name.startswith('Prompts') and prompts_data and 'prompts' in prompts_data:
                item = prompts_data['prompts'][point_index]
                item_type = 'Prompt'
            else:
                item = data['documents'][point_index]
                item_type = 'Document'

            return self._create_detail_card(item, item_type)

        @callback(
            [
                Output("processed-data", "data", allow_duplicate=True),
                Output("processed-prompts", "data", allow_duplicate=True),
            ],
            Input("reset-button", "n_clicks"),
            prevent_initial_call=True,
            [Output('processed-data', 'data', allow_duplicate=True),
             Output('processed-prompts', 'data', allow_duplicate=True),
             Output('point-details', 'children', allow_duplicate=True)],
            Input('reset-button', 'n_clicks'),
            prevent_initial_call=True
        )
        def reset_data(n_clicks):
            if n_clicks is None or n_clicks == 0:
                return dash.no_update, dash.no_update

            return None, None
                return dash.no_update, dash.no_update, dash.no_update

            return None, None, "Click on a point to see details"

    @staticmethod
    def _create_detail_card(item, item_type):
        return dbc.Card([
            dbc.CardBody([
                html.H5(f"{item_type}: {item['id']}", className="card-title"),
                html.P(f"Text: {item['text']}", className="card-text"),
                html.P(f"Category: {item.get('category', 'Unknown')}", className="card-text"),
                html.P(f"Subcategory: {item.get('subcategory', 'Unknown')}", className="card-text"),
                html.P(f"Tags: {', '.join(item.get('tags', [])) if item.get('tags') else 'None'}", className="card-text"),
                html.P(f"Type: {item_type}", className="card-text text-muted")
            ])
        ])
@@ -7,102 +7,81 @@ from ...visualization.plots import PlotFactory


class VisualizationCallbacks:

    def __init__(self):
        self.plot_factory = PlotFactory()
        self._register_callbacks()

    def _register_callbacks(self):

        @callback(
            Output("embedding-plot", "figure"),
            [
                Input("processed-data", "data"),
                Input("processed-prompts", "data"),
                Input("method-dropdown", "value"),
                Input("color-dropdown", "value"),
                Input("dimension-toggle", "value"),
                Input("show-prompts-toggle", "value"),
            ],
            Output('embedding-plot', 'figure'),
            [Input('processed-data', 'data'),
             Input('processed-prompts', 'data'),
             Input('method-dropdown', 'value'),
             Input('color-dropdown', 'value'),
             Input('dimension-toggle', 'value'),
             Input('show-prompts-toggle', 'value')]
        )
        def update_plot(data, prompts_data, method, color_by, dimensions, show_prompts):
            if not data or "error" in data:
            if not data or 'error' in data:
                return go.Figure().add_annotation(
                    text="Upload a valid NDJSON file to see visualization",
                    xref="paper",
                    yref="paper",
                    x=0.5,
                    y=0.5,
                    xanchor="center",
                    yanchor="middle",
                    showarrow=False,
                    font=dict(size=16),
                    xref="paper", yref="paper",
                    x=0.5, y=0.5, xanchor='center', yanchor='middle',
                    showarrow=False, font=dict(size=16)
                )

            try:
                doc_embeddings = np.array(data["embeddings"])
                doc_embeddings = np.array(data['embeddings'])
                all_embeddings = doc_embeddings
                has_prompts = (
                    prompts_data
                    and "error" not in prompts_data
                    and prompts_data.get("prompts")
                )

                has_prompts = prompts_data and 'error' not in prompts_data and prompts_data.get('prompts')

                if has_prompts:
                    prompt_embeddings = np.array(prompts_data["embeddings"])
                    prompt_embeddings = np.array(prompts_data['embeddings'])
                    all_embeddings = np.vstack([doc_embeddings, prompt_embeddings])

                n_components = 3 if dimensions == "3d" else 2

                reducer = ReducerFactory.create_reducer(
                    method, n_components=n_components
                )

                n_components = 3 if dimensions == '3d' else 2

                reducer = ReducerFactory.create_reducer(method, n_components=n_components)
                reduced_data = reducer.fit_transform(all_embeddings)

                doc_reduced = reduced_data.reduced_embeddings[: len(doc_embeddings)]

                doc_reduced = reduced_data.reduced_embeddings[:len(doc_embeddings)]
                prompt_reduced = None
                if has_prompts:
                    prompt_reduced = reduced_data.reduced_embeddings[
                        len(doc_embeddings) :
                    ]

                documents = [self._dict_to_document(doc) for doc in data["documents"]]
                    prompt_reduced = reduced_data.reduced_embeddings[len(doc_embeddings):]

                documents = [self._dict_to_document(doc) for doc in data['documents']]
                prompts = None
                if has_prompts:
                    prompts = [
                        self._dict_to_document(prompt)
                        for prompt in prompts_data["prompts"]
                    ]

                    prompts = [self._dict_to_document(prompt) for prompt in prompts_data['prompts']]

                plot_data = PlotData(
                    documents=documents,
                    coordinates=doc_reduced,
                    prompts=prompts,
                    prompt_coordinates=prompt_reduced,
                    prompt_coordinates=prompt_reduced
                )

                return self.plot_factory.create_plot(
                    plot_data, dimensions, color_by, reduced_data.method, show_prompts
                )

            except Exception as e:
                return go.Figure().add_annotation(
                    text=f"Error creating visualization: {str(e)}",
                    xref="paper",
                    yref="paper",
                    x=0.5,
                    y=0.5,
                    xanchor="center",
                    yanchor="middle",
                    showarrow=False,
                    font=dict(size=16),
                    xref="paper", yref="paper",
                    x=0.5, y=0.5, xanchor='center', yanchor='middle',
                    showarrow=False, font=dict(size=16)
                )

    @staticmethod
    def _dict_to_document(doc_dict):
        return Document(
            id=doc_dict["id"],
            text=doc_dict["text"],
            embedding=doc_dict["embedding"],
            category=doc_dict.get("category"),
            subcategory=doc_dict.get("subcategory"),
            tags=doc_dict.get("tags", []),
        )
            id=doc_dict['id'],
            text=doc_dict['text'],
            embedding=doc_dict['embedding'],
            category=doc_dict.get('category'),
            subcategory=doc_dict.get('subcategory'),
            tags=doc_dict.get('tags', [])
        )
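For context, the `update_plot` callback above reduces documents and prompts in a single shared projection and then splits the rows back apart, so both kinds of points live in the same reduced space. A minimal sketch of that pattern, assuming numpy and scikit-learn are available (PCA stands in for whatever reducer the factory returns):

```python
import numpy as np
from sklearn.decomposition import PCA

# Toy shapes: 10 documents and 2 prompts with 4-dim embeddings.
doc_embeddings = np.random.rand(10, 4)
prompt_embeddings = np.random.rand(2, 4)

# Reduce everything in one shared projection...
all_embeddings = np.vstack([doc_embeddings, prompt_embeddings])
reduced = PCA(n_components=3).fit_transform(all_embeddings)

# ...then split the result back apart by row count.
doc_reduced = reduced[: len(doc_embeddings)]
prompt_reduced = reduced[len(doc_embeddings):]
assert doc_reduced.shape == (10, 3) and prompt_reduced.shape == (2, 3)
```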
@@ -1,90 +0,0 @@
from dash import html, dcc
import dash_bootstrap_components as dbc


class AboutComponent:
    def _get_about_content(self):
        return """
# 🔍 Interactive Embedding Vector Visualization

EmbeddingBuddy is a web application for interactive exploration and
visualization of embedding vectors through dimensionality reduction techniques
(PCA, t-SNE, UMAP).

You have two ways to get started:

1. Generate embeddings directly in the browser, if it supports WebGPU.
2. Upload your NDJSON file containing embedding vectors and metadata.

## Generating Embeddings in Browser

1. Expand the "Generate Embeddings" section.
2. Input your text data (one entry per line).
    1. Optionally, you can use the built-in sample data by clicking the "Load Sample Data" button.
3. Click "Generate Embeddings" to create vectors using a pre-trained model.

## NDJSON File Format

```json
{"id": "doc_001", "embedding": [0.1, -0.3, 0.7, ...], "text": "Sample text content", "category": "news", "subcategory": "politics", "tags": ["election", "politics"]}
{"id": "doc_002", "embedding": [0.2, -0.1, 0.9, ...], "text": "Another example", "category": "review", "subcategory": "product", "tags": ["tech", "gadget"]}
```

## ✨ Features

- Drag-and-drop NDJSON file upload
- Multiple dimensionality reduction algorithms
- 2D/3D interactive plots with Plotly
- Color coding by categories, subcategories, or tags
- In-browser embedding generation
- OpenSearch integration for data loading

## 🔧 Supported Algorithms

- **PCA** (Principal Component Analysis)
- **t-SNE** (t-Distributed Stochastic Neighbor Embedding)
- **UMAP** (Uniform Manifold Approximation and Projection)

---

📂 [View on GitHub](https://github.com/godber/EmbeddingBuddy)

*Built with: Python, Dash, Plotly, scikit-learn, OpenTSNE, UMAP*
""".strip()

    def create_about_modal(self):
        return dbc.Modal(
            [
                dbc.ModalHeader(
                    dbc.ModalTitle("Welcome to EmbeddingBuddy"),
                    close_button=True,
                ),
                dbc.ModalBody(
                    [dcc.Markdown(self._get_about_content(), className="mb-0")]
                ),
                dbc.ModalFooter(
                    [
                        dbc.Button(
                            "Close",
                            id="about-modal-close",
                            color="secondary",
                            n_clicks=0,
                        )
                    ]
                ),
            ],
            id="about-modal",
            is_open=True,
            size="lg",
        )

    def create_about_button(self):
        return dbc.Button(
            [html.I(className="fas fa-info-circle me-2"), "About"],
            id="about-button",
            color="outline-info",
            size="sm",
            n_clicks=0,
            className="ms-2",
        )
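The NDJSON format documented in the modal above is simply one JSON object per line. A minimal reader sketch, assuming Python with numpy installed (illustrative only, not the app's actual parser):

```python
import json

import numpy as np

def load_ndjson(path):
    """Parse one JSON object per line, as in the format documented above."""
    docs = []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if line.strip():
                docs.append(json.loads(line))
    # Required fields per the docs: "text" and "embedding".
    embeddings = np.array([d["embedding"] for d in docs])
    return docs, embeddings
```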
@@ -1,523 +0,0 @@
from dash import dcc, html
import dash_bootstrap_components as dbc
from .upload import UploadComponent
from embeddingbuddy.config.settings import AppSettings


class DataSourceComponent:
    def __init__(self):
        self.upload_component = UploadComponent()

    def create_tabbed_interface(self):
        """Create tabbed interface for different data sources."""
        tabs = [dbc.Tab(label="File Upload", tab_id="file-tab")]

        # Only add OpenSearch tab if enabled
        if AppSettings.OPENSEARCH_ENABLED:
            tabs.append(dbc.Tab(label="OpenSearch", tab_id="opensearch-tab"))

        return dbc.Card(
            [
                dbc.CardHeader(
                    [
                        dbc.Tabs(
                            tabs,
                            id="data-source-tabs",
                            active_tab="file-tab",
                        )
                    ]
                ),
                dbc.CardBody([html.Div(id="tab-content")]),
            ]
        )

    def create_file_upload_tab(self):
        """Create file upload tab content."""
        return html.Div(
            [
                self.upload_component.create_error_alert(),
                self.upload_component.create_data_upload(),
                self.upload_component.create_prompts_upload(),
                self.upload_component.create_reset_button(),
            ]
        )

    def create_opensearch_tab(self):
        """Create OpenSearch tab content with separate Data and Prompts sections."""
        return html.Div(
            [
                # Data Section
                dbc.Card(
                    [
                        dbc.CardHeader(
                            [
                                dbc.Button(
                                    [
                                        html.I(
                                            className="fas fa-chevron-down me-2",
                                            id="data-collapse-icon",
                                        ),
                                        "📄 Documents/Data",
                                    ],
                                    id="data-collapse-toggle",
                                    color="link",
                                    className="text-start p-0 w-100 text-decoration-none",
                                    style={
                                        "border": "none",
                                        "font-size": "1.25rem",
                                        "font-weight": "500",
                                    },
                                ),
                            ]
                        ),
                        dbc.Collapse(
                            [dbc.CardBody([self._create_opensearch_section("data")])],
                            id="data-collapse",
                            is_open=True,
                        ),
                    ],
                    className="mb-4",
                ),
                # Prompts Section
                dbc.Card(
                    [
                        dbc.CardHeader(
                            [
                                dbc.Button(
                                    [
                                        html.I(
                                            className="fas fa-chevron-down me-2",
                                            id="prompts-collapse-icon",
                                        ),
                                        "💬 Prompts",
                                    ],
                                    id="prompts-collapse-toggle",
                                    color="link",
                                    className="text-start p-0 w-100 text-decoration-none",
                                    style={
                                        "border": "none",
                                        "font-size": "1.25rem",
                                        "font-weight": "500",
                                    },
                                ),
                            ]
                        ),
                        dbc.Collapse(
                            [
                                dbc.CardBody(
                                    [self._create_opensearch_section("prompts")]
                                )
                            ],
                            id="prompts-collapse",
                            is_open=True,
                        ),
                    ],
                    className="mb-4",
                ),
                # Hidden dropdowns to prevent callback errors (for both sections)
                html.Div(
                    [
                        # Data dropdowns (hidden sync targets)
                        dcc.Dropdown(
                            id="data-embedding-field-dropdown",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="data-text-field-dropdown", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="data-id-field-dropdown", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="data-category-field-dropdown", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="data-subcategory-field-dropdown",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="data-tags-field-dropdown", style={"display": "none"}
                        ),
                        # Data UI dropdowns (hidden placeholders)
                        dcc.Dropdown(
                            id="data-embedding-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="data-text-field-dropdown-ui", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="data-id-field-dropdown-ui", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="data-category-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="data-subcategory-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="data-tags-field-dropdown-ui", style={"display": "none"}
                        ),
                        # Prompts dropdowns (hidden sync targets)
                        dcc.Dropdown(
                            id="prompts-embedding-field-dropdown",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="prompts-text-field-dropdown", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="prompts-id-field-dropdown", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="prompts-category-field-dropdown",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="prompts-subcategory-field-dropdown",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="prompts-tags-field-dropdown", style={"display": "none"}
                        ),
                        # Prompts UI dropdowns (hidden placeholders)
                        dcc.Dropdown(
                            id="prompts-embedding-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="prompts-text-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="prompts-id-field-dropdown-ui", style={"display": "none"}
                        ),
                        dcc.Dropdown(
                            id="prompts-category-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="prompts-subcategory-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                        dcc.Dropdown(
                            id="prompts-tags-field-dropdown-ui",
                            style={"display": "none"},
                        ),
                    ],
                    style={"display": "none"},
                ),
            ]
        )

    def _create_opensearch_section(self, section_type):
        """Create a complete OpenSearch section for either 'data' or 'prompts'."""
        section_id = section_type  # 'data' or 'prompts'

        return html.Div(
            [
                # Connection section
                html.H6("Connection", className="mb-2"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Label("OpenSearch URL:"),
                                dbc.Input(
                                    id=f"{section_id}-opensearch-url",
                                    type="text",
                                    placeholder="https://opensearch.example.com:9200",
                                    className="mb-2",
                                ),
                            ],
                            width=12,
                        ),
                    ]
                ),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Label("Index Name:"),
                                dbc.Input(
                                    id=f"{section_id}-opensearch-index",
                                    type="text",
                                    placeholder="my-embeddings-index",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                        dbc.Col(
                            [
                                dbc.Label("Query Size:"),
                                dbc.Input(
                                    id=f"{section_id}-opensearch-query-size",
                                    type="number",
                                    value=100,
                                    min=1,
                                    max=1000,
                                    placeholder="100",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                    ]
                ),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Button(
                                    "Test Connection",
                                    id=f"{section_id}-test-connection-btn",
                                    color="primary",
                                    className="mb-3",
                                ),
                            ],
                            width=12,
                        ),
                    ]
                ),
                # Authentication section (collapsible)
                dbc.Collapse(
                    [
                        html.Hr(),
                        html.H6("Authentication (Optional)", className="mb-2"),
                        dbc.Row(
                            [
                                dbc.Col(
                                    [
                                        dbc.Label("Username:"),
                                        dbc.Input(
                                            id=f"{section_id}-opensearch-username",
                                            type="text",
                                            className="mb-2",
                                        ),
                                    ],
                                    width=6,
                                ),
                                dbc.Col(
                                    [
                                        dbc.Label("Password:"),
                                        dbc.Input(
                                            id=f"{section_id}-opensearch-password",
                                            type="password",
                                            className="mb-2",
                                        ),
                                    ],
                                    width=6,
                                ),
                            ]
                        ),
                        dbc.Label("OR"),
                        dbc.Input(
                            id=f"{section_id}-opensearch-api-key",
                            type="text",
                            placeholder="API Key",
                            className="mb-2",
                        ),
                    ],
                    id=f"{section_id}-auth-collapse",
                    is_open=False,
                ),
                dbc.Button(
                    "Show Authentication",
                    id=f"{section_id}-auth-toggle",
                    color="link",
                    size="sm",
                    className="p-0 mb-3",
                ),
                # Connection status
                html.Div(id=f"{section_id}-connection-status", className="mb-3"),
                # Field mapping section (hidden initially)
                html.Div(
                    id=f"{section_id}-field-mapping-section", style={"display": "none"}
                ),
                # Load data button (hidden initially)
                html.Div(
                    [
                        dbc.Button(
                            f"Load {section_type.title()}",
                            id=f"{section_id}-load-opensearch-data-btn",
                            color="success",
                            className="mb-2",
                            disabled=True,
                        ),
                    ],
                    id=f"{section_id}-load-data-section",
                    style={"display": "none"},
                ),
                # OpenSearch status/results
                html.Div(id=f"{section_id}-opensearch-status", className="mb-3"),
            ]
        )

    def create_field_mapping_interface(self, field_suggestions, section_type="data"):
        """Create field mapping interface based on detected fields."""
        return html.Div(
            [
                html.Hr(),
                html.H6("Field Mapping", className="mb-2"),
                html.P(
                    "Map your OpenSearch fields to the required format:",
                    className="text-muted small",
                ),
                # Required fields
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Label(
                                    "Embedding Field (required):", className="fw-bold"
                                ),
                                dcc.Dropdown(
                                    id=f"{section_type}-embedding-field-dropdown-ui",
                                    options=[
                                        {"label": field, "value": field}
                                        for field in field_suggestions.get(
                                            "embedding", []
                                        )
                                    ],
                                    value=field_suggestions.get("embedding", [None])[
                                        0
                                    ],  # Default to first suggestion
                                    placeholder="Select embedding field...",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                        dbc.Col(
                            [
                                dbc.Label(
                                    "Text Field (required):", className="fw-bold"
                                ),
                                dcc.Dropdown(
                                    id=f"{section_type}-text-field-dropdown-ui",
                                    options=[
                                        {"label": field, "value": field}
                                        for field in field_suggestions.get("text", [])
                                    ],
                                    value=field_suggestions.get("text", [None])[
                                        0
                                    ],  # Default to first suggestion
                                    placeholder="Select text field...",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                    ]
                ),
                # Optional fields
                html.H6("Optional Fields", className="mb-2 mt-3"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Label("ID Field:"),
                                dcc.Dropdown(
                                    id=f"{section_type}-id-field-dropdown-ui",
                                    options=[
                                        {"label": field, "value": field}
                                        for field in field_suggestions.get("id", [])
                                    ],
                                    value=field_suggestions.get("id", [None])[
                                        0
                                    ],  # Default to first suggestion
                                    placeholder="Select ID field...",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                        dbc.Col(
                            [
                                dbc.Label("Category Field:"),
                                dcc.Dropdown(
                                    id=f"{section_type}-category-field-dropdown-ui",
                                    options=[
                                        {"label": field, "value": field}
                                        for field in field_suggestions.get(
                                            "category", []
                                        )
                                    ],
                                    value=field_suggestions.get("category", [None])[
                                        0
                                    ],  # Default to first suggestion
                                    placeholder="Select category field...",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                    ]
                ),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Label("Subcategory Field:"),
                                dcc.Dropdown(
                                    id=f"{section_type}-subcategory-field-dropdown-ui",
                                    options=[
                                        {"label": field, "value": field}
                                        for field in field_suggestions.get(
                                            "subcategory", []
                                        )
                                    ],
                                    value=field_suggestions.get("subcategory", [None])[
                                        0
                                    ],  # Default to first suggestion
                                    placeholder="Select subcategory field...",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                        dbc.Col(
                            [
                                dbc.Label("Tags Field:"),
                                dcc.Dropdown(
                                    id=f"{section_type}-tags-field-dropdown-ui",
                                    options=[
                                        {"label": field, "value": field}
                                        for field in field_suggestions.get("tags", [])
                                    ],
                                    value=field_suggestions.get("tags", [None])[
                                        0
                                    ],  # Default to first suggestion
                                    placeholder="Select tags field...",
                                    className="mb-2",
                                ),
                            ],
                            width=6,
                        ),
                    ]
                ),
            ]
        )

    def create_error_alert(self):
        """Create error alert component for OpenSearch issues."""
        return dbc.Alert(
            id="opensearch-error-alert",
            dismissable=True,
            is_open=False,
            color="danger",
            className="mb-3",
        )

    def create_success_alert(self):
        """Create success alert component for OpenSearch operations."""
        return dbc.Alert(
            id="opensearch-success-alert",
            dismissable=True,
            is_open=False,
            color="success",
            className="mb-3",
        )
@@ -1,148 +1,82 @@
from dash import dcc, html
import dash_bootstrap_components as dbc
from .upload import UploadComponent
from .datasource import DataSourceComponent
from .textinput import TextInputComponent
from embeddingbuddy.config.settings import AppSettings


class SidebarComponent:

    def __init__(self):
        self.upload_component = UploadComponent()
        self.datasource_component = DataSourceComponent()
        self.textinput_component = TextInputComponent()

    def create_layout(self):
        return dbc.Col(
            [
                dbc.Accordion(
                    [
                        self._create_data_sources_item(),
                        self._create_generate_embeddings_item(),
                        self._create_visualization_controls_item(),
                    ],
                    always_open=True,
                )
            ],
            width=3,
            style={"padding-right": "20px"},
        )

        return dbc.Col([
            html.H5("Upload Data", className="mb-3"),
            self.upload_component.create_data_upload(),
            self.upload_component.create_prompts_upload(),
            self.upload_component.create_reset_button(),

            html.H5("Visualization Controls", className="mb-3"),
            self._create_method_dropdown(),
            self._create_color_dropdown(),
            self._create_dimension_toggle(),
            self._create_prompts_toggle(),

            html.H5("Point Details", className="mb-3"),
            html.Div(id='point-details', children="Click on a point to see details")

        ], width=3, style={'padding-right': '20px'})

    def _create_method_dropdown(self):
        return [
            dbc.Label("Method:"),
            dcc.Dropdown(
                id="method-dropdown",
                id='method-dropdown',
                options=[
                    {"label": "PCA", "value": "pca"},
                    {"label": "t-SNE", "value": "tsne"},
                    {"label": "UMAP", "value": "umap"},
                    {'label': 'PCA', 'value': 'pca'},
                    {'label': 't-SNE', 'value': 'tsne'},
                    {'label': 'UMAP', 'value': 'umap'}
                ],
                value="pca",
                style={"margin-bottom": "15px"},
            ),
                value='pca',
                style={'margin-bottom': '15px'}
            )
        ]

    def _create_color_dropdown(self):
        return [
            dbc.Label("Color by:"),
            dcc.Dropdown(
                id="color-dropdown",
                id='color-dropdown',
                options=[
                    {"label": "Category", "value": "category"},
                    {"label": "Subcategory", "value": "subcategory"},
                    {"label": "Tags", "value": "tags"},
                    {'label': 'Category', 'value': 'category'},
                    {'label': 'Subcategory', 'value': 'subcategory'},
                    {'label': 'Tags', 'value': 'tags'}
                ],
                value="category",
                style={"margin-bottom": "15px"},
            ),
                value='category',
                style={'margin-bottom': '15px'}
            )
        ]

    def _create_dimension_toggle(self):
        return [
            dbc.Label("Dimensions:"),
            dcc.RadioItems(
                id="dimension-toggle",
                id='dimension-toggle',
                options=[
                    {"label": "2D", "value": "2d"},
                    {"label": "3D", "value": "3d"},
                    {'label': '2D', 'value': '2d'},
                    {'label': '3D', 'value': '3d'}
                ],
                value="3d",
                style={"margin-bottom": "20px"},
            ),
                value='3d',
                style={'margin-bottom': '20px'}
            )
        ]

    def _create_prompts_toggle(self):
        return [
            dbc.Label("Show Prompts:"),
            dcc.Checklist(
                id="show-prompts-toggle",
                options=[{"label": "Show prompts on plot", "value": "show"}],
                value=["show"],
                style={"margin-bottom": "20px"},
            ),
        ]

    def _create_generate_embeddings_item(self):
        return dbc.AccordionItem(
            [
                self.textinput_component.create_text_input_interface(),
            ],
            title=html.Span(
                [
                    "Generate Embeddings ",
                    html.I(
                        className="fas fa-info-circle text-muted",
                        style={"cursor": "pointer"},
                        id="generate-embeddings-info-icon",
                        title="Create new embeddings from text input using various in-browser models",
                    ),
                ]
            ),
            item_id="generate-embeddings-accordion",
        )

    def _create_data_sources_item(self):
        tooltip_text = "Load existing embeddings: upload files"
        if AppSettings.OPENSEARCH_ENABLED:
            tooltip_text += " or read from OpenSearch"

        return dbc.AccordionItem(
            [
                self.datasource_component.create_error_alert(),
                self.datasource_component.create_success_alert(),
                self.datasource_component.create_tabbed_interface(),
            ],
            title=html.Span(
                [
                    "Load Embeddings ",
                    html.I(
                        className="fas fa-info-circle text-muted",
                        style={"cursor": "pointer"},
                        id="load-embeddings-info-icon",
                        title=tooltip_text,
                    ),
                ]
            ),
            item_id="data-sources-accordion",
        )

    def _create_visualization_controls_item(self):
        return dbc.AccordionItem(
            self._create_method_dropdown()
            + self._create_color_dropdown()
            + self._create_dimension_toggle()
            + self._create_prompts_toggle(),
            title=html.Span(
                [
                    "Visualization Controls ",
                    html.I(
                        className="fas fa-info-circle text-muted",
                        style={"cursor": "pointer"},
                        id="visualization-controls-info-icon",
                        title="Configure plot settings: select dimensionality reduction method, colors, and display options",
                    ),
                ]
            ),
            item_id="visualization-controls-accordion",
        )
                id='show-prompts-toggle',
                options=[{'label': 'Show prompts on plot', 'value': 'show'}],
                value=['show'],
                style={'margin-bottom': '20px'}
            )
        ]
@@ -1,344 +0,0 @@
"""Text input component for generating embeddings from user text."""

import dash_bootstrap_components as dbc
from dash import dcc, html

from embeddingbuddy.config.settings import AppSettings


class TextInputComponent:
    """Component for text input and embedding generation."""

    def __init__(self):
        self.settings = AppSettings()

    def create_text_input_interface(self):
        """Create the complete text input interface with model selection and processing options."""
        return html.Div(
            [
                # Text input section
                self._create_text_input_area(),
                # Text action buttons
                self._create_text_action_buttons(),
                html.Hr(),
                # Model selection section
                self._create_model_selection(),
                html.Hr(),
                # Processing options
                self._create_processing_options(),
                html.Hr(),
                # Generation controls
                self._create_generation_controls(),
                html.Hr(),
                # Status and results
                self._create_status_section(),
                # Hidden components for data flow
                self._create_hidden_components(),
            ],
            className="p-3",
        )

    def _create_model_selection(self):
        """Create model selection dropdown with descriptions."""
        model_options = []
        for model in self.settings.AVAILABLE_MODELS:
            label = f"{model['label']} - {model['size']}"
            if model.get("default", False):
                label += " (Recommended)"

            model_options.append({"label": label, "value": model["name"]})

        return html.Div(
            [
                html.H5("Embedding Model", className="mb-3"),
                html.Div(
                    [
                        dcc.Dropdown(
                            id="model-selection",
                            options=model_options,
                            value=self.settings.DEFAULT_EMBEDDING_MODEL,
                            placeholder="Select an embedding model...",
                            className="mb-2",
                        ),
                        dbc.Alert(
                            [
                                html.Div(
                                    id="model-info",
                                    children=self._get_model_description(
                                        self.settings.DEFAULT_EMBEDDING_MODEL
                                    ),
                                )
                            ],
                            color="info",
                            className="small",
                        ),
                    ]
                ),
            ]
        )

    def _create_text_input_area(self):
        """Create text input textarea with character limits."""
        return html.Div(
            [
                html.H5("Text Input", className="mb-3"),
                dcc.Textarea(
                    id="text-input-area",
                    placeholder="Paste your text here... Each sentence, paragraph, or line will become a separate data point depending on your tokenization method below.",
                    value="",
                    style={
                        "width": "100%",
                        "height": "300px",
                        "resize": "vertical",
                        "font-family": "monospace",
                        "font-size": "14px",
                    },
                    maxLength=self.settings.MAX_TEXT_LENGTH,
                    className="form-control",
                ),
                html.Small(
                    f"Maximum {self.settings.MAX_TEXT_LENGTH:,} characters. Current: ",
                    className="text-muted",
                ),
                html.Small(
                    id="text-length-counter",
                    children="0",
                    className="text-muted fw-bold",
                ),
                html.Small(" characters", className="text-muted"),
            ]
        )

    def _create_text_action_buttons(self):
        """Create action buttons for text input (Load Sample, Clear)."""
        return html.Div(
            [
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-file-text me-2"),
                                        "Load Sample Text",
                                    ],
                                    id="load-sample-btn",
                                    color="info",
                                    size="sm",
                                    className="w-100",
                                )
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-trash me-2"),
                                        "Clear Text",
                                    ],
                                    id="clear-text-btn",
                                    color="outline-secondary",
                                    size="sm",
                                    className="w-100",
                                )
                            ],
                            md=6,
                        ),
                    ],
                    className="mt-2 mb-3",
                )
            ]
        )

    def _create_processing_options(self):
        """Create tokenization and metadata options."""
        return html.Div(
            [
                html.H5("Processing Options", className="mb-3"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                html.Label(
                                    "Text Splitting Method:", className="form-label"
                                ),
                                dcc.Dropdown(
                                    id="tokenization-method",
                                    options=[
                                        {
                                            "label": "Sentences (split on . ! ?)",
                                            "value": "sentence",
                                        },
                                        {
                                            "label": "Paragraphs (split on double newline)",
                                            "value": "paragraph",
                                        },
                                        {
                                            "label": "Lines (split on single newline)",
                                            "value": "manual",
                                        },
                                        {
                                            "label": "Entire text as one document",
                                            "value": "whole",
                                        },
                                    ],
                                    value=self.settings.DEFAULT_TOKENIZATION_METHOD,
                                    className="mb-3",
                                ),
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                html.Label("Batch Size:", className="form-label"),
                                dcc.Dropdown(
                                    id="batch-size",
                                    options=[
                                        {
                                            "label": "Small batches (4) - Lower memory",
                                            "value": 4,
                                        },
                                        {
                                            "label": "Medium batches (8) - Balanced",
                                            "value": 8,
                                        },
                                        {
                                            "label": "Large batches (16) - Faster",
                                            "value": 16,
                                        },
                                    ],
                                    value=self.settings.MAX_BATCH_SIZE,
                                    className="mb-3",
                                ),
                            ],
                            md=6,
                        ),
                    ]
                ),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                html.Label(
                                    "Category (Optional):", className="form-label"
                                ),
                                dcc.Input(
                                    id="text-category",
                                    type="text",
                                    placeholder="e.g., Notes, Articles, Ideas...",
                                    value="Text Input",
                                    className="form-control mb-3",
                                ),
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                html.Label(
                                    "Subcategory (Optional):", className="form-label"
                                ),
                                dcc.Input(
                                    id="text-subcategory",
                                    type="text",
                                    placeholder="e.g., Meeting Notes, Research...",
                                    value="Generated",
                                    className="form-control mb-3",
                                ),
                            ],
                            md=6,
                        ),
                    ]
                ),
            ]
        )

    def _create_generation_controls(self):
        """Create embedding generation button and controls."""
        return html.Div(
            [
                html.H5("Generate Embeddings", className="mb-3"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-magic me-2"),
                                        "Generate Embeddings",
                                    ],
                                    id="generate-embeddings-btn",
                                    color="primary",
                                    size="lg",
                                    disabled=True,
                                    className="w-100",
                                )
                            ],
                            md=12,
                        ),
                    ]
                ),
                html.Div(
                    [
                        dbc.Alert(
                            [
                                html.I(className="fas fa-info-circle me-2"),
                                "Enter some text above and select a model to enable embedding generation.",
                            ],
                            color="light",
                            className="mt-3",
                            id="generation-help",
                        )
                    ]
                ),
            ]
        )

    def _create_status_section(self):
        """Create status alerts and results preview."""
        return html.Div(
            [
                # Server-side status
                dbc.Alert(
                    id="text-input-status",
                    children="",
                    color="light",
                    className="mb-3",
                    style={"display": "none"},
                ),
                # Results preview
                html.Div(id="embedding-results-preview"),
            ]
        )

    def _create_hidden_components(self):
        """Create hidden components for data flow."""
        return html.Div(
            [
                # Store for embeddings data from client-side
                dcc.Store(id="embeddings-generated-trigger"),
                # Store for tokenization preview
                dcc.Store(id="tokenization-preview-data"),
            ]
        )

    def _get_model_description(self, model_name):
        """Get description for a specific model."""
        for model in self.settings.AVAILABLE_MODELS:
            if model["name"] == model_name:
                return html.Div(
                    [
                        html.Strong(
                            f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}"
                        ),
                        html.Br(),
                        html.Span(model["description"]),
                        html.Br(),
                        html.Small(
                            f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}",
                            className="text-muted",
                        ),
                    ]
                )

        return html.Span("Model information not available", className="text-muted")
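The four "Text Splitting Method" options above map to simple string splits. A rough sketch of what each method could do — the regex and helper name are assumptions for illustration, not necessarily the app's implementation:

```python
import re

def split_text(text: str, method: str) -> list[str]:
    # Mirrors the dropdown labels above.
    if method == "sentence":
        chunks = re.split(r"(?<=[.!?])\s+", text)
    elif method == "paragraph":
        chunks = text.split("\n\n")
    elif method == "manual":  # one chunk per line
        chunks = text.splitlines()
    else:  # "whole": the entire text as one document
        chunks = [text]
    return [c.strip() for c in chunks if c.strip()]
```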
@@ -3,98 +3,58 @@ import dash_bootstrap_components as dbc


class UploadComponent:

    @staticmethod
    def create_data_upload():
        return html.Div(
            [
                dcc.Upload(
                    id="upload-data",
                    children=html.Div(
                        [
                            "Upload Data ",
                            html.I(
                                className="fas fa-info-circle",
                                style={"color": "#6c757d", "fontSize": "14px"},
                                id="data-upload-info",
                            ),
                        ]
                    ),
                    style={
                        "width": "100%",
                        "height": "60px",
                        "lineHeight": "60px",
                        "borderWidth": "1px",
                        "borderStyle": "dashed",
                        "borderRadius": "5px",
                        "textAlign": "center",
                        "margin-bottom": "20px",
                    },
                    multiple=False,
                ),
                dbc.Tooltip(
                    "Click here or drag and drop NDJSON files containing document embeddings",
                    target="data-upload-info",
                    placement="top",
                ),
            ]
        return dcc.Upload(
            id='upload-data',
            children=html.Div([
                'Drag and Drop or ',
                html.A('Select Files')
            ]),
            style={
                'width': '100%',
                'height': '60px',
                'lineHeight': '60px',
                'borderWidth': '1px',
                'borderStyle': 'dashed',
                'borderRadius': '5px',
                'textAlign': 'center',
                'margin-bottom': '20px'
            },
            multiple=False
        )

    @staticmethod
    def create_prompts_upload():
        return html.Div(
            [
                dcc.Upload(
                    id="upload-prompts",
                    children=html.Div(
                        [
                            "Upload Prompts ",
                            html.I(
                                className="fas fa-info-circle",
                                style={"color": "#6c757d", "fontSize": "14px"},
                                id="prompts-upload-info",
                            ),
                        ]
                    ),
                    style={
                        "width": "100%",
                        "height": "60px",
                        "lineHeight": "60px",
                        "borderWidth": "1px",
                        "borderStyle": "dashed",
                        "borderRadius": "5px",
                        "textAlign": "center",
                        "margin-bottom": "20px",
                        "borderColor": "#28a745",
                    },
                    multiple=False,
                ),
                dbc.Tooltip(
                    "Click here or drag and drop NDJSON files containing prompt embeddings",
                    target="prompts-upload-info",
                    placement="top",
                ),
            ]
        return dcc.Upload(
            id='upload-prompts',
            children=html.Div([
                'Drag and Drop Prompts or ',
                html.A('Select Files')
            ]),
            style={
                'width': '100%',
                'height': '60px',
                'lineHeight': '60px',
                'borderWidth': '1px',
                'borderStyle': 'dashed',
                'borderRadius': '5px',
                'textAlign': 'center',
                'margin-bottom': '20px',
                'borderColor': '#28a745'
            },
            multiple=False
        )

    @staticmethod
    def create_reset_button():
        return dbc.Button(
            "Reset All Data",
            id="reset-button",
            color="danger",
            id='reset-button',
            color='danger',
            outline=True,
            size="sm",
            className="mb-3",
            style={"width": "100%"},
        )

    @staticmethod
    def create_error_alert():
        """Create error alert component for data upload issues."""
        return dbc.Alert(
            id="upload-error-alert",
            dismissable=True,
            is_open=False,
            color="danger",
            className="mb-3",
        )
            size='sm',
            className='mb-3',
            style={'width': '100%'}
        )
@@ -1,71 +1,44 @@
from dash import dcc, html
import dash_bootstrap_components as dbc
from .components.sidebar import SidebarComponent
from .components.about import AboutComponent


class AppLayout:

    def __init__(self):
        self.sidebar = SidebarComponent()
        self.about = AboutComponent()

    def create_layout(self):
        return dbc.Container(
            [self._create_header(), self._create_main_content()]
            + self._create_stores()
            + [self.about.create_about_modal()],
            fluid=True,
        )

        return dbc.Container([
            self._create_header(),
            self._create_main_content(),
            self._create_stores()
        ], fluid=True)

    def _create_header(self):
        return dbc.Row(
            [
                dbc.Col(
                    [
                        html.Div(
                            [
                                html.H1(
                                    "EmbeddingBuddy",
                                    className="text-center mb-4 d-inline",
                                ),
                                html.Div(
                                    [self.about.create_about_button()],
                                    className="float-end",
                                ),
                            ],
                            className="d-flex justify-content-between align-items-center",
                        ),
                        # Load Transformers.js from CDN
                        html.Script(
                            """
                            import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';
                            window.transformersPipeline = pipeline;
                            console.log('✅ Transformers.js pipeline loaded globally');
                            """,
                            type="module",
                        ),
                    ],
                    width=12,
                )
            ]
        )

        return dbc.Row([
            dbc.Col([
                html.H1("EmbeddingBuddy", className="text-center mb-4"),
            ], width=12)
        ])

    def _create_main_content(self):
        return dbc.Row(
            [self.sidebar.create_layout(), self._create_visualization_area()]
        )

        return dbc.Row([
            self.sidebar.create_layout(),
            self._create_visualization_area()
        ])

    def _create_visualization_area(self):
        return dbc.Col(
            [
                dcc.Graph(
                    id="embedding-plot",
                    style={"height": "85vh", "width": "100%"},
                    config={"responsive": True, "displayModeBar": True},
                )
            ],
            width=9,
        )

        return dbc.Col([
            dcc.Graph(
                id='embedding-plot',
                style={'height': '85vh', 'width': '100%'},
                config={'responsive': True, 'displayModeBar': True}
            )
        ], width=9)

    def _create_stores(self):
        return [dcc.Store(id="processed-data"), dcc.Store(id="processed-prompts")]
        return [
            dcc.Store(id='processed-data'),
            dcc.Store(id='processed-prompts')
        ]
@@ -1,36 +1,33 @@
from typing import List
from typing import List, Dict, Any
import plotly.colors as pc
from ..models.schemas import Document


class ColorMapper:

    @staticmethod
    def create_color_mapping(documents: List[Document], color_by: str) -> List[str]:
        if color_by == "category":
        if color_by == 'category':
            return [doc.category for doc in documents]
        elif color_by == "subcategory":
        elif color_by == 'subcategory':
            return [doc.subcategory for doc in documents]
        elif color_by == "tags":
            return [", ".join(doc.tags) if doc.tags else "No tags" for doc in documents]
        elif color_by == 'tags':
            return [', '.join(doc.tags) if doc.tags else 'No tags' for doc in documents]
        else:
            return ["All"] * len(documents)

            return ['All'] * len(documents)

    @staticmethod
    def to_grayscale_hex(color_str: str) -> str:
        try:
            if color_str.startswith("#"):
                rgb = tuple(int(color_str[i : i + 2], 16) for i in (1, 3, 5))
            if color_str.startswith('#'):
                rgb = tuple(int(color_str[i:i+2], 16) for i in (1, 3, 5))
            else:
                rgb = pc.hex_to_rgb(
                    pc.convert_colors_to_same_type([color_str], colortype="hex")[0][0]
                )

                rgb = pc.hex_to_rgb(pc.convert_colors_to_same_type([color_str], colortype='hex')[0][0])

            gray_value = int(0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2])
            gray_rgb = (
                gray_value * 0.7 + rgb[0] * 0.3,
                gray_value * 0.7 + rgb[1] * 0.3,
                gray_value * 0.7 + rgb[2] * 0.3,
            )
            return f"rgb({int(gray_rgb[0])},{int(gray_rgb[1])},{int(gray_rgb[2])})"
        except:  # noqa: E722
            return "rgb(128,128,128)"
            gray_rgb = (gray_value * 0.7 + rgb[0] * 0.3,
                        gray_value * 0.7 + rgb[1] * 0.3,
                        gray_value * 0.7 + rgb[2] * 0.3)
            return f'rgb({int(gray_rgb[0])},{int(gray_rgb[1])},{int(gray_rgb[2])})'
        except:
            return 'rgb(128,128,128)'
@@ -7,172 +7,139 @@ from .colors import ColorMapper


class PlotFactory:

    def __init__(self):
        self.color_mapper = ColorMapper()

    def create_plot(
        self,
        plot_data: PlotData,
        dimensions: str = "3d",
        color_by: str = "category",
        method: str = "PCA",
        show_prompts: Optional[List[str]] = None,
    ) -> go.Figure:
        if plot_data.prompts and show_prompts and "show" in show_prompts:

    def create_plot(self, plot_data: PlotData, dimensions: str = '3d',
                    color_by: str = 'category', method: str = 'PCA',
                    show_prompts: Optional[List[str]] = None) -> go.Figure:

        if plot_data.prompts and show_prompts and 'show' in show_prompts:
            return self._create_dual_plot(plot_data, dimensions, color_by, method)
        else:
            return self._create_single_plot(plot_data, dimensions, color_by, method)

    def _create_single_plot(
        self, plot_data: PlotData, dimensions: str, color_by: str, method: str
    ) -> go.Figure:
        df = self._prepare_dataframe(
            plot_data.documents, plot_data.coordinates, dimensions
        )
        color_values = self.color_mapper.create_color_mapping(
            plot_data.documents, color_by
        )

        hover_fields = ["id", "text_preview", "category", "subcategory", "tags_str"]

        if dimensions == "3d":

    def _create_single_plot(self, plot_data: PlotData, dimensions: str,
                            color_by: str, method: str) -> go.Figure:
        df = self._prepare_dataframe(plot_data.documents, plot_data.coordinates, dimensions)
        color_values = self.color_mapper.create_color_mapping(plot_data.documents, color_by)

        hover_fields = ['id', 'text_preview', 'category', 'subcategory', 'tags_str']

        if dimensions == '3d':
            fig = px.scatter_3d(
                df,
                x="x",
                y="y",
                z="z",
                df, x='dim_1', y='dim_2', z='dim_3',
                color=color_values,
                hover_data=hover_fields,
                title=f"3D Embedding Visualization - {method} (colored by {color_by})",
                title=f'3D Embedding Visualization - {method} (colored by {color_by})'
            )
            fig.update_traces(marker=dict(size=5))
        else:
            fig = px.scatter(
                df,
                x="x",
                y="y",
                df, x='dim_1', y='dim_2',
                color=color_values,
                hover_data=hover_fields,
                title=f"2D Embedding Visualization - {method} (colored by {color_by})",
                title=f'2D Embedding Visualization - {method} (colored by {color_by})'
            )
            fig.update_traces(marker=dict(size=8))

        fig.update_layout(height=None, autosize=True, margin=dict(l=0, r=0, t=50, b=0))

        fig.update_layout(
            height=None,
            autosize=True,
            margin=dict(l=0, r=0, t=50, b=0)
        )
        return fig

    def _create_dual_plot(
        self, plot_data: PlotData, dimensions: str, color_by: str, method: str
    ) -> go.Figure:

    def _create_dual_plot(self, plot_data: PlotData, dimensions: str,
                          color_by: str, method: str) -> go.Figure:
        fig = go.Figure()

        doc_df = self._prepare_dataframe(
            plot_data.documents, plot_data.coordinates, dimensions
        )
        doc_color_values = self.color_mapper.create_color_mapping(
            plot_data.documents, color_by
        )

        hover_fields = ["id", "text_preview", "category", "subcategory", "tags_str"]

        if dimensions == "3d":

        doc_df = self._prepare_dataframe(plot_data.documents, plot_data.coordinates, dimensions)
        doc_color_values = self.color_mapper.create_color_mapping(plot_data.documents, color_by)

        hover_fields = ['id', 'text_preview', 'category', 'subcategory', 'tags_str']

        if dimensions == '3d':
            doc_fig = px.scatter_3d(
                doc_df,
                x="x",
                y="y",
                z="z",
                doc_df, x='dim_1', y='dim_2', z='dim_3',
                color=doc_color_values,
                hover_data=hover_fields,
                hover_data=hover_fields
            )
        else:
            doc_fig = px.scatter(
                doc_df,
                x="x",
                y="y",
                doc_df, x='dim_1', y='dim_2',
                color=doc_color_values,
                hover_data=hover_fields,
                hover_data=hover_fields
            )

        for trace in doc_fig.data:
            trace.name = f"Documents - {trace.name}"
            if dimensions == "3d":
            trace.name = f'Documents - {trace.name}'
            if dimensions == '3d':
                trace.marker.size = 5
                trace.marker.symbol = "circle"
                trace.marker.symbol = 'circle'
            else:
                trace.marker.size = 8
                trace.marker.symbol = "circle"
                trace.marker.symbol = 'circle'
            trace.marker.opacity = 1.0
            fig.add_trace(trace)

        if plot_data.prompts and plot_data.prompt_coordinates is not None:
            prompt_df = self._prepare_dataframe(
                plot_data.prompts, plot_data.prompt_coordinates, dimensions
            )
            prompt_color_values = self.color_mapper.create_color_mapping(
                plot_data.prompts, color_by
            )

            if dimensions == "3d":
            prompt_df = self._prepare_dataframe(plot_data.prompts, plot_data.prompt_coordinates, dimensions)
            prompt_color_values = self.color_mapper.create_color_mapping(plot_data.prompts, color_by)

            if dimensions == '3d':
                prompt_fig = px.scatter_3d(
                    prompt_df,
                    x="x",
                    y="y",
                    z="z",
                    prompt_df, x='dim_1', y='dim_2', z='dim_3',
                    color=prompt_color_values,
                    hover_data=hover_fields,
                    hover_data=hover_fields
                )
            else:
                prompt_fig = px.scatter(
                    prompt_df,
                    x="x",
                    y="y",
                    prompt_df, x='dim_1', y='dim_2',
                    color=prompt_color_values,
                    hover_data=hover_fields,
                    hover_data=hover_fields
                )

            for trace in prompt_fig.data:
                if hasattr(trace.marker, "color") and isinstance(
                    trace.marker.color, str
                ):
                    trace.marker.color = self.color_mapper.to_grayscale_hex(
                        trace.marker.color
                    )

                trace.name = f"Prompts - {trace.name}"
                if dimensions == "3d":
                if hasattr(trace.marker, 'color') and isinstance(trace.marker.color, str):
                    trace.marker.color = self.color_mapper.to_grayscale_hex(trace.marker.color)

                trace.name = f'Prompts - {trace.name}'
                if dimensions == '3d':
                    trace.marker.size = 6
                    trace.marker.symbol = "diamond"
                    trace.marker.symbol = 'diamond'
                else:
                    trace.marker.size = 10
                    trace.marker.symbol = "diamond"
                    trace.marker.symbol = 'diamond'
                trace.marker.opacity = 0.8
                fig.add_trace(trace)

        title = f"{dimensions.upper()} Embedding Visualization - {method} (colored by {color_by})"

        title = f'{dimensions.upper()} Embedding Visualization - {method} (colored by {color_by})'
        fig.update_layout(
            title=title, height=None, autosize=True, margin=dict(l=0, r=0, t=50, b=0)
            title=title,
            height=None,
            autosize=True,
            margin=dict(l=0, r=0, t=50, b=0)
        )

        return fig

    def _prepare_dataframe(
        self, documents: List[Document], coordinates, dimensions: str
    ) -> pd.DataFrame:

    def _prepare_dataframe(self, documents: List[Document], coordinates, dimensions: str) -> pd.DataFrame:
        df_data = []
        for i, doc in enumerate(documents):
            row = {
                "id": doc.id,
                "text": doc.text,
                "text_preview": doc.text[:100] + "..."
                if len(doc.text) > 100
                else doc.text,
                "category": doc.category,
                "subcategory": doc.subcategory,
                "tags_str": ", ".join(doc.tags) if doc.tags else "None",
                "x": coordinates[i, 0],
                "y": coordinates[i, 1],
                'id': doc.id,
                'text': doc.text,
                'text_preview': doc.text[:100] + "..." if len(doc.text) > 100 else doc.text,
                'category': doc.category,
                'subcategory': doc.subcategory,
                'tags_str': ', '.join(doc.tags) if doc.tags else 'None',
                'dim_1': coordinates[i, 0],
                'dim_2': coordinates[i, 1],
            }
            if dimensions == "3d":
                row["z"] = coordinates[i, 2]
            if dimensions == '3d':
                row['dim_3'] = coordinates[i, 2]
            df_data.append(row)

        return pd.DataFrame(df_data)

        return pd.DataFrame(df_data)
@@ -1,12 +0,0 @@
"""
WSGI entry point for production deployment.
Use this with a production WSGI server like Gunicorn.
"""

from embeddingbuddy.app import create_app

# Create the application instance
application = create_app()

# For compatibility with different WSGI servers
app = application
@@ -1,197 +0,0 @@
"""Tests for handling bad/invalid data files."""

import pytest
import json
import base64
from src.embeddingbuddy.data.parser import NDJSONParser
from src.embeddingbuddy.data.processor import DataProcessor


class TestBadDataHandling:
    """Test suite for various types of invalid input data."""

    def setup_method(self):
        """Set up test fixtures."""
        self.parser = NDJSONParser()
        self.processor = DataProcessor()

    def _create_upload_contents(self, text_content: str) -> str:
        """Helper to create upload contents format."""
        encoded = base64.b64encode(text_content.encode("utf-8")).decode("utf-8")
        return f"data:application/json;base64,{encoded}"

    def test_missing_embedding_field(self):
        """Test files missing required embedding field."""
        bad_content = '{"id": "doc_001", "text": "Sample text", "category": "test"}'

        with pytest.raises(KeyError, match="embedding"):
            self.parser.parse_text(bad_content)

        # Test processor error handling
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
        assert result.error is not None
        assert "embedding" in result.error

    def test_missing_text_field(self):
        """Test files missing required text field."""
        bad_content = (
            '{"id": "doc_001", "embedding": [0.1, 0.2, 0.3], "category": "test"}'
        )

        with pytest.raises(KeyError, match="text"):
            self.parser.parse_text(bad_content)

        # Test processor error handling
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
        assert result.error is not None
        assert "text" in result.error

    def test_malformed_json_lines(self):
        """Test files with malformed JSON syntax."""
        # Missing closing brace
        bad_content = '{"id": "doc_001", "embedding": [0.1, 0.2], "text": "test"'

        with pytest.raises(json.JSONDecodeError):
            self.parser.parse_text(bad_content)

        # Test processor error handling
        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)
        assert result.error is not None

    def test_invalid_embedding_types(self):
        """Test files with invalid embedding data types."""
        test_cases = [
            # String instead of array
            '{"id": "doc_001", "embedding": "not_an_array", "text": "test"}',
            # Mixed types in array
            '{"id": "doc_002", "embedding": [0.1, "text", 0.3], "text": "test"}',
            # Empty array
            '{"id": "doc_003", "embedding": [], "text": "test"}',
            # Null embedding
            '{"id": "doc_004", "embedding": null, "text": "test"}',
        ]

        for bad_content in test_cases:
            upload_contents = self._create_upload_contents(bad_content)
            result = self.processor.process_upload(upload_contents)
            assert result.error is not None, f"Should fail for: {bad_content}"

    def test_inconsistent_embedding_dimensions(self):
        """Test files with embeddings of different dimensions."""
        bad_content = """{"id": "doc_001", "embedding": [0.1, 0.2, 0.3, 0.4], "text": "4D embedding"}
{"id": "doc_002", "embedding": [0.1, 0.2, 0.3], "text": "3D embedding"}"""

        upload_contents = self._create_upload_contents(bad_content)
        result = self.processor.process_upload(upload_contents)

        # This might succeed parsing but fail in processing
        # The error depends on where dimension validation occurs
        if result.error is None:
            # If parsing succeeds, check that embeddings have inconsistent shapes
            assert len(result.documents) == 2
            assert len(result.documents[0].embedding) != len(
                result.documents[1].embedding
            )

    def test_empty_lines_in_ndjson(self):
        """Test files with empty lines mixed in."""
        content_with_empty_lines = """{"id": "doc_001", "embedding": [0.1, 0.2], "text": "First line"}

{"id": "doc_002", "embedding": [0.3, 0.4], "text": "After empty line"}"""

        # This should work - empty lines should be skipped
        documents = self.parser.parse_text(content_with_empty_lines)
        assert len(documents) == 2
        assert documents[0].id == "doc_001"
        assert documents[1].id == "doc_002"

    def test_not_ndjson_format(self):
        """Test regular JSON array instead of NDJSON."""
        json_array = """[
            {"id": "doc_001", "embedding": [0.1, 0.2], "text": "First"},
            {"id": "doc_002", "embedding": [0.3, 0.4], "text": "Second"}
        ]"""

        with pytest.raises(json.JSONDecodeError):
            self.parser.parse_text(json_array)

    def test_binary_content_in_file(self):
        """Test files with binary content mixed in."""
        # Simulate binary content that can't be decoded
        binary_content = (
            b'\x00\x01\x02{"id": "doc_001", "embedding": [0.1], "text": "test"}'
        )

        # This should result in an error when processing
        encoded = base64.b64encode(binary_content).decode("utf-8")
        upload_contents = f"data:application/json;base64,{encoded}"
        result = self.processor.process_upload(upload_contents)

        # Should either fail with UnicodeDecodeError or JSON parsing error
        assert result.error is not None

    def test_extremely_large_embeddings(self):
        """Test embeddings with very large dimensions."""
        large_embedding = [0.1] * 10000  # 10k dimensions
        content = json.dumps(
            {
                "id": "doc_001",
                "embedding": large_embedding,
                "text": "Large embedding test",
            }
        )

        # This should work but might be slow
        upload_contents = self._create_upload_contents(content)
        result = self.processor.process_upload(upload_contents)

        if result.error is None:
            assert len(result.documents) == 1
            assert len(result.documents[0].embedding) == 10000

    def test_special_characters_in_text(self):
        """Test handling of special characters and unicode."""
        special_content = json.dumps(
            {
                "id": "doc_001",
                "embedding": [0.1, 0.2],
                "text": 'Special chars: 🚀 ñoñó 中文 \n\t"',
            },
            ensure_ascii=False,
        )

        upload_contents = self._create_upload_contents(special_content)
        result = self.processor.process_upload(upload_contents)

        assert result.error is None
        assert len(result.documents) == 1
        assert "🚀" in result.documents[0].text

    def test_processor_error_structure(self):
        """Test that processor returns proper error structure."""
        bad_content = '{"invalid": "json"'  # Missing closing brace
        upload_contents = self._create_upload_contents(bad_content)

        result = self.processor.process_upload(upload_contents)

        # Check error structure
        assert result.error is not None
        assert isinstance(result.error, str)
        assert len(result.documents) == 0
        assert result.embeddings.size == 0

    def test_multiple_errors_in_file(self):
        """Test file with multiple different types of errors."""
        multi_error_content = """{"id": "doc_001", "text": "Missing embedding"}
{"id": "doc_002", "embedding": "wrong_type", "text": "Wrong embedding type"}
{"id": "doc_003", "embedding": [0.1, 0.2], "text": "Valid line"}
{"id": "doc_004", "embedding": [0.3, 0.4]"""  # Missing text and closing brace

        upload_contents = self._create_upload_contents(multi_error_content)
        result = self.processor.process_upload(upload_contents)

        # Should fail on first error encountered
        assert result.error is not None
@@ -1,158 +0,0 @@
"""Tests for client-side embedding processing functionality."""

import numpy as np

from src.embeddingbuddy.data.processor import DataProcessor
from src.embeddingbuddy.models.schemas import ProcessedData


class TestClientEmbeddingsProcessing:
    """Test client-side embeddings processing functionality."""

    def setup_method(self):
        """Set up test instances."""
        self.processor = DataProcessor()

    def test_process_client_embeddings_success(self):
        """Test successful processing of client-side embeddings data."""
        client_data = {
            "documents": [
                {
                    "id": "text_input_0",
                    "text": "First test document",
                    "category": "Text Input",
                    "subcategory": "Generated",
                    "tags": [],
                },
                {
                    "id": "text_input_1",
                    "text": "Second test document",
                    "category": "Text Input",
                    "subcategory": "Generated",
                    "tags": [],
                },
            ],
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
        }

        result = self.processor.process_client_embeddings(client_data)

        assert isinstance(result, ProcessedData)
        assert result.error is None
        assert len(result.documents) == 2
        assert result.embeddings.shape == (2, 4)

        # Check document content
        assert result.documents[0].text == "First test document"
        assert result.documents[1].text == "Second test document"

        # Check embeddings match
        np.testing.assert_array_equal(result.embeddings[0], [0.1, 0.2, 0.3, 0.4])
        np.testing.assert_array_equal(result.embeddings[1], [0.5, 0.6, 0.7, 0.8])

    def test_process_client_embeddings_with_error(self):
        """Test processing client data with error."""
        client_data = {"error": "Transformers.js not loaded"}

        result = self.processor.process_client_embeddings(client_data)

        assert isinstance(result, ProcessedData)
        assert result.error == "Transformers.js not loaded"
        assert len(result.documents) == 0
        assert result.embeddings.size == 0

    def test_process_client_embeddings_missing_data(self):
        """Test processing with missing documents or embeddings."""
        client_data = {"documents": []}

        result = self.processor.process_client_embeddings(client_data)

        assert isinstance(result, ProcessedData)
        assert "No documents or embeddings in client data" in result.error
        assert len(result.documents) == 0

    def test_process_client_embeddings_mismatch_count(self):
        """Test processing with mismatched document and embedding counts."""
        client_data = {
            "documents": [
                {
                    "id": "test",
                    "text": "Test document",
                    "category": "Test",
                    "subcategory": "Test",
                    "tags": [],
                }
            ],
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
        }

        result = self.processor.process_client_embeddings(client_data)

        assert isinstance(result, ProcessedData)
        assert "Mismatch between number of documents and embeddings" in result.error
        assert len(result.documents) == 0

    def test_process_client_embeddings_invalid_document(self):
        """Test processing with invalid document data."""
        client_data = {
            "documents": [
                {"text": ""},  # Empty text should be skipped
                {
                    "id": "test2",
                    "text": "Valid document",
                    "category": "Test",
                    "subcategory": "Test",
                    "tags": [],
                },
            ],
            "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
        }

        result = self.processor.process_client_embeddings(client_data)

        assert isinstance(result, ProcessedData)
        assert result.error is None
        assert len(result.documents) == 1  # Only valid document should be processed
        assert result.documents[0].text == "Valid document"

    def test_process_client_embeddings_auto_id_generation(self):
        """Test automatic ID generation for documents without IDs."""
        client_data = {
            "documents": [
                {
                    "text": "Document without ID",
                    "category": "Test",
                    "subcategory": "Test",
                    "tags": [],
                }
            ],
            "embeddings": [[0.1, 0.2, 0.3, 0.4]],
        }

        result = self.processor.process_client_embeddings(client_data)

        assert isinstance(result, ProcessedData)
        assert result.error is None
        assert len(result.documents) == 1
        assert result.documents[0].id.startswith("text_input_")

    def test_process_client_embeddings_invalid_embedding_format(self):
        """Test processing with invalid embedding format."""
        client_data = {
            "documents": [
                {
                    "id": "test",
                    "text": "Test document",
                    "category": "Test",
                    "subcategory": "Test",
                    "tags": [],
                }
            ],
            "embeddings": 0.5,  # Scalar instead of array
        }

        result = self.processor.process_client_embeddings(client_data)

        assert isinstance(result, ProcessedData)
        assert result.error is not None  # Should have some error
        assert len(result.documents) == 0
@@ -6,64 +6,62 @@ from src.embeddingbuddy.models.schemas import Document


class TestNDJSONParser:

    def test_parse_text_basic(self):
        text_content = (
            '{"id": "test1", "text": "Hello world", "embedding": [0.1, 0.2, 0.3]}'
        )
        text_content = '{"id": "test1", "text": "Hello world", "embedding": [0.1, 0.2, 0.3]}'
        documents = NDJSONParser.parse_text(text_content)

        assert len(documents) == 1
        assert documents[0].id == "test1"
        assert documents[0].text == "Hello world"
        assert documents[0].embedding == [0.1, 0.2, 0.3]

    def test_parse_text_with_metadata(self):
        text_content = '{"id": "test1", "text": "Hello", "embedding": [0.1, 0.2], "category": "greeting", "tags": ["test"]}'
        documents = NDJSONParser.parse_text(text_content)

        assert documents[0].category == "greeting"
        assert documents[0].tags == ["test"]

    def test_parse_text_missing_id(self):
        text_content = '{"text": "Hello", "embedding": [0.1, 0.2]}'
        documents = NDJSONParser.parse_text(text_content)

        assert len(documents) == 1
        assert documents[0].id is not None  # Should be auto-generated


class TestDataProcessor:

    def test_extract_embeddings(self):
        documents = [
            Document(id="1", text="test1", embedding=[0.1, 0.2]),
            Document(id="2", text="test2", embedding=[0.3, 0.4]),
            Document(id="2", text="test2", embedding=[0.3, 0.4])
        ]

        processor = DataProcessor()
        embeddings = processor._extract_embeddings(documents)

        assert embeddings.shape == (2, 2)
        assert np.allclose(embeddings[0], [0.1, 0.2])
        assert np.allclose(embeddings[1], [0.3, 0.4])

    def test_combine_data(self):
        from src.embeddingbuddy.models.schemas import ProcessedData

        doc_data = ProcessedData(
            documents=[Document(id="1", text="doc", embedding=[0.1, 0.2])],
            embeddings=np.array([[0.1, 0.2]]),
            embeddings=np.array([[0.1, 0.2]])
        )

        prompt_data = ProcessedData(
            documents=[Document(id="p1", text="prompt", embedding=[0.3, 0.4])],
            embeddings=np.array([[0.3, 0.4]]),
            embeddings=np.array([[0.3, 0.4]])
        )

        processor = DataProcessor()
        all_embeddings, documents, prompts = processor.combine_data(
            doc_data, prompt_data
        )

        all_embeddings, documents, prompts = processor.combine_data(doc_data, prompt_data)

        assert all_embeddings.shape == (2, 2)
        assert len(documents) == 1
        assert len(prompts) == 1
@@ -72,4 +70,4 @@ class TestDataProcessor:


if __name__ == "__main__":
    pytest.main([__file__])
    pytest.main([__file__])
@@ -1,155 +0,0 @@
from unittest.mock import patch
from src.embeddingbuddy.data.processor import DataProcessor
from src.embeddingbuddy.models.field_mapper import FieldMapping


class TestDataProcessorOpenSearch:
    def test_process_opensearch_data_success(self):
        processor = DataProcessor()

        # Mock raw OpenSearch documents
        raw_documents = [
            {
                "vector": [0.1, 0.2, 0.3],
                "content": "Test document 1",
                "doc_id": "doc1",
                "type": "news",
            },
            {
                "vector": [0.4, 0.5, 0.6],
                "content": "Test document 2",
                "doc_id": "doc2",
                "type": "blog",
            },
        ]

        # Create field mapping
        field_mapping = FieldMapping(
            embedding_field="vector",
            text_field="content",
            id_field="doc_id",
            category_field="type",
        )

        # Process the data
        processed_data = processor.process_opensearch_data(raw_documents, field_mapping)

        # Assertions
        assert processed_data.error is None
        assert len(processed_data.documents) == 2
        assert processed_data.embeddings.shape == (2, 3)

        # Check first document
        doc1 = processed_data.documents[0]
        assert doc1.text == "Test document 1"
        assert doc1.embedding == [0.1, 0.2, 0.3]
        assert doc1.id == "doc1"
        assert doc1.category == "news"

        # Check second document
        doc2 = processed_data.documents[1]
        assert doc2.text == "Test document 2"
        assert doc2.embedding == [0.4, 0.5, 0.6]
        assert doc2.id == "doc2"
        assert doc2.category == "blog"

    def test_process_opensearch_data_with_tags(self):
        processor = DataProcessor()

        # Mock raw OpenSearch documents with tags
        raw_documents = [
            {
                "vector": [0.1, 0.2, 0.3],
                "content": "Test document with tags",
                "keywords": ["tag1", "tag2"],
            }
        ]

        # Create field mapping
        field_mapping = FieldMapping(
            embedding_field="vector", text_field="content", tags_field="keywords"
        )

        processed_data = processor.process_opensearch_data(raw_documents, field_mapping)

        assert processed_data.error is None
        assert len(processed_data.documents) == 1
        doc = processed_data.documents[0]
        assert doc.tags == ["tag1", "tag2"]

    def test_process_opensearch_data_invalid_documents(self):
        processor = DataProcessor()

        # Mock raw documents with missing required fields
        raw_documents = [
            {
                "vector": [0.1, 0.2, 0.3],
                # Missing text field
            }
        ]

        field_mapping = FieldMapping(embedding_field="vector", text_field="content")

        processed_data = processor.process_opensearch_data(raw_documents, field_mapping)

        # Should return error since no valid documents
        assert processed_data.error is not None
        assert "No valid documents" in processed_data.error
        assert len(processed_data.documents) == 0

    def test_process_opensearch_data_partial_success(self):
        processor = DataProcessor()

        # Mix of valid and invalid documents
        raw_documents = [
            {
                "vector": [0.1, 0.2, 0.3],
                "content": "Valid document",
            },
            {
                "vector": [0.4, 0.5, 0.6],
                # Missing content field - should be skipped
            },
            {
                "vector": [0.7, 0.8, 0.9],
                "content": "Another valid document",
            },
        ]

        field_mapping = FieldMapping(embedding_field="vector", text_field="content")

        processed_data = processor.process_opensearch_data(raw_documents, field_mapping)

        # Should process valid documents only
        assert processed_data.error is None
        assert len(processed_data.documents) == 2
        assert processed_data.documents[0].text == "Valid document"
        assert processed_data.documents[1].text == "Another valid document"

    @patch("src.embeddingbuddy.models.field_mapper.FieldMapper.transform_documents")
    def test_process_opensearch_data_transformation_error(self, mock_transform):
        processor = DataProcessor()

        # Mock transformation error
        mock_transform.side_effect = Exception("Transformation failed")

        raw_documents = [{"vector": [0.1], "content": "test"}]
        field_mapping = FieldMapping(embedding_field="vector", text_field="content")

        processed_data = processor.process_opensearch_data(raw_documents, field_mapping)

        assert processed_data.error is not None
        assert "Transformation failed" in processed_data.error
        assert len(processed_data.documents) == 0

    def test_process_opensearch_data_empty_input(self):
        processor = DataProcessor()

        raw_documents = []
        field_mapping = FieldMapping(embedding_field="vector", text_field="content")

        processed_data = processor.process_opensearch_data(raw_documents, field_mapping)

        assert processed_data.error is not None
        assert "No valid documents" in processed_data.error
        assert len(processed_data.documents) == 0
@@ -1,310 +0,0 @@
from unittest.mock import Mock, patch
from src.embeddingbuddy.data.sources.opensearch import OpenSearchClient
from src.embeddingbuddy.models.field_mapper import FieldMapper, FieldMapping


class TestOpenSearchClient:
    def test_init(self):
        client = OpenSearchClient()
        assert client.client is None
        assert client.connection_info is None

    @patch("src.embeddingbuddy.data.sources.opensearch.OpenSearch")
    def test_connect_success(self, mock_opensearch):
        # Mock the OpenSearch client
        mock_client_instance = Mock()
        mock_client_instance.info.return_value = {
            "cluster_name": "test-cluster",
            "version": {"number": "2.0.0"},
        }
        mock_opensearch.return_value = mock_client_instance

        client = OpenSearchClient()
        success, message = client.connect("https://localhost:9200")

        assert success is True
        assert "test-cluster" in message
        assert client.client is not None
        assert client.connection_info["cluster_name"] == "test-cluster"

    @patch("src.embeddingbuddy.data.sources.opensearch.OpenSearch")
    def test_connect_failure(self, mock_opensearch):
        # Mock connection failure
        mock_opensearch.side_effect = Exception("Connection failed")

        client = OpenSearchClient()
        success, message = client.connect("https://localhost:9200")

        assert success is False
        assert "Connection failed" in message
        assert client.client is None

    def test_analyze_fields(self):
        client = OpenSearchClient()
        client.client = Mock()

        # Mock mapping response
        mock_mapping = {
            "test-index": {
                "mappings": {
                    "properties": {
                        "embedding": {"type": "dense_vector", "dimension": 768},
                        "text": {"type": "text"},
                        "category": {"type": "keyword"},
                        "id": {"type": "keyword"},
                        "count": {"type": "integer"},
                    }
                }
            }
        }
        client.client.indices.get_mapping.return_value = mock_mapping

        success, analysis, message = client.analyze_fields("test-index")

        assert success is True
        assert len(analysis["vector_fields"]) == 1
        assert analysis["vector_fields"][0]["name"] == "embedding"
        assert analysis["vector_fields"][0]["dimension"] == 768
        assert "text" in analysis["text_fields"]
        assert "category" in analysis["keyword_fields"]
        assert "count" in analysis["numeric_fields"]

    def test_fetch_sample_data(self):
        client = OpenSearchClient()
        client.client = Mock()

        # Mock search response
        mock_response = {
            "hits": {
                "hits": [
                    {"_source": {"text": "doc1", "embedding": [0.1, 0.2]}},
                    {"_source": {"text": "doc2", "embedding": [0.3, 0.4]}},
                ]
            }
        }
        client.client.search.return_value = mock_response

        success, documents, message = client.fetch_sample_data("test-index", size=2)

        assert success is True
        assert len(documents) == 2
        assert documents[0]["text"] == "doc1"
        assert documents[1]["text"] == "doc2"


class TestFieldMapper:
    def test_suggest_mappings(self):
        field_analysis = {
            "vector_fields": [{"name": "embedding", "dimension": 768}],
            "text_fields": ["content", "description"],
            "keyword_fields": ["doc_id", "category", "type", "tags"],
            "numeric_fields": ["count"],
            "all_fields": [
                "embedding",
                "content",
                "description",
                "doc_id",
                "category",
                "type",
                "tags",
                "count",
            ],
        }

        suggestions = FieldMapper.suggest_mappings(field_analysis)

        # Check that all dropdowns contain all fields
        all_fields = [
            "embedding",
            "content",
            "description",
            "doc_id",
            "category",
            "type",
            "tags",
            "count",
        ]
        for field_type in [
            "embedding",
            "text",
            "id",
            "category",
            "subcategory",
            "tags",
        ]:
            for field in all_fields:
                assert field in suggestions[field_type], (
                    f"Field '{field}' missing from {field_type} suggestions"
                )

        # Check that best candidates are first
        assert (
            suggestions["embedding"][0] == "embedding"
        )  # vector field should be first
        assert suggestions["text"][0] in [
            "content",
            "description",
        ]  # text fields should be first
        assert suggestions["id"][0] == "doc_id"  # ID-like field should be first
        assert suggestions["category"][0] in [
            "category",
            "type",
        ]  # category-like field should be first
        assert suggestions["tags"][0] == "tags"  # tags field should be first

    def test_suggest_mappings_name_based_embedding(self):
        """Test that fields named 'embedding' are prioritized even without vector type."""
        field_analysis = {
            "vector_fields": [],  # No explicit vector fields detected
            "text_fields": ["content", "description"],
            "keyword_fields": ["doc_id", "category", "type", "tags"],
            "numeric_fields": ["count"],
            "all_fields": [
                "content",
                "description",
                "doc_id",
                "category",
                "embedding",
                "type",
                "tags",
                "count",
            ],
        }

        suggestions = FieldMapper.suggest_mappings(field_analysis)

        # Check that 'embedding' field is prioritized despite not being detected as vector type
        assert suggestions["embedding"][0] == "embedding", (
            "Field named 'embedding' should be first priority"
        )

        # Check that all fields are still available
        all_fields = [
            "content",
            "description",
            "doc_id",
            "category",
            "embedding",
            "type",
            "tags",
            "count",
        ]
        for field_type in [
            "embedding",
            "text",
            "id",
            "category",
            "subcategory",
            "tags",
        ]:
            for field in all_fields:
                assert field in suggestions[field_type], (
                    f"Field '{field}' missing from {field_type} suggestions"
                )

    def test_validate_mapping_success(self):
        mapping = FieldMapping(
            embedding_field="embedding", text_field="text", id_field="doc_id"
        )
        available_fields = ["embedding", "text", "doc_id", "category"]

        errors = FieldMapper.validate_mapping(mapping, available_fields)

        assert len(errors) == 0

    def test_validate_mapping_missing_required(self):
        mapping = FieldMapping(embedding_field="missing_field", text_field="text")
        available_fields = ["text", "category"]

        errors = FieldMapper.validate_mapping(mapping, available_fields)

        assert len(errors) == 1
        assert "missing_field" in errors[0]
        assert "not found" in errors[0]

    def test_validate_mapping_missing_optional(self):
        mapping = FieldMapping(
            embedding_field="embedding",
            text_field="text",
            category_field="missing_category",
        )
        available_fields = ["embedding", "text"]

        errors = FieldMapper.validate_mapping(mapping, available_fields)

        assert len(errors) == 1
        assert "missing_category" in errors[0]

    def test_transform_documents(self):
        mapping = FieldMapping(
            embedding_field="vector",
            text_field="content",
            id_field="doc_id",
            category_field="type",
        )

        raw_documents = [
            {
                "vector": [0.1, 0.2, 0.3],
                "content": "Test document 1",
                "doc_id": "doc1",
                "type": "news",
            },
            {
                "vector": [0.4, 0.5, 0.6],
                "content": "Test document 2",
                "doc_id": "doc2",
                "type": "blog",
            },
        ]

        transformed = FieldMapper.transform_documents(raw_documents, mapping)

        assert len(transformed) == 2
        assert transformed[0]["embedding"] == [0.1, 0.2, 0.3]
        assert transformed[0]["text"] == "Test document 1"
        assert transformed[0]["id"] == "doc1"
        assert transformed[0]["category"] == "news"

    def test_transform_documents_missing_required(self):
        mapping = FieldMapping(embedding_field="vector", text_field="content")

        raw_documents = [
            {
                "vector": [0.1, 0.2, 0.3],
                # Missing content field
            }
        ]

        transformed = FieldMapper.transform_documents(raw_documents, mapping)

        assert len(transformed) == 0  # Document should be skipped

    def test_create_mapping_from_dict(self):
        mapping_dict = {
            "embedding": "vector_field",
            "text": "text_field",
            "id": "doc_id",
            "category": "cat_field",
            "subcategory": "subcat_field",
            "tags": "tags_field",
        }

        mapping = FieldMapper.create_mapping_from_dict(mapping_dict)

        assert mapping.embedding_field == "vector_field"
        assert mapping.text_field == "text_field"
        assert mapping.id_field == "doc_id"
        assert mapping.category_field == "cat_field"
        assert mapping.subcategory_field == "subcat_field"
        assert mapping.tags_field == "tags_field"

    def test_create_mapping_from_dict_minimal(self):
        mapping_dict = {"embedding": "vector_field", "text": "text_field"}

        mapping = FieldMapper.create_mapping_from_dict(mapping_dict)

        assert mapping.embedding_field == "vector_field"
        assert mapping.text_field == "text_field"
        assert mapping.id_field is None
        assert mapping.category_field is None
@@ -1,90 +1,89 @@
import pytest
import numpy as np
from src.embeddingbuddy.models.reducers import (
    ReducerFactory,
    PCAReducer,
    TSNEReducer,
    UMAPReducer,
)
from src.embeddingbuddy.models.reducers import ReducerFactory, PCAReducer, TSNEReducer, UMAPReducer


class TestReducerFactory:

    def test_create_pca_reducer(self):
        reducer = ReducerFactory.create_reducer("pca", n_components=2)
        reducer = ReducerFactory.create_reducer('pca', n_components=2)
        assert isinstance(reducer, PCAReducer)
        assert reducer.n_components == 2

    def test_create_tsne_reducer(self):
        reducer = ReducerFactory.create_reducer("tsne", n_components=3)
        reducer = ReducerFactory.create_reducer('tsne', n_components=3)
        assert isinstance(reducer, TSNEReducer)
        assert reducer.n_components == 3

    def test_create_umap_reducer(self):
        reducer = ReducerFactory.create_reducer("umap", n_components=2)
        reducer = ReducerFactory.create_reducer('umap', n_components=2)
        assert isinstance(reducer, UMAPReducer)
        assert reducer.n_components == 2

    def test_invalid_method(self):
        with pytest.raises(ValueError, match="Unknown reduction method"):
            ReducerFactory.create_reducer("invalid_method")

            ReducerFactory.create_reducer('invalid_method')

    def test_available_methods(self):
        methods = ReducerFactory.get_available_methods()
        assert "pca" in methods
        assert "tsne" in methods
        assert "umap" in methods
        assert 'pca' in methods
        assert 'tsne' in methods
        assert 'umap' in methods


class TestPCAReducer:

    def test_fit_transform(self):
        embeddings = np.random.rand(100, 512)
        reducer = PCAReducer(n_components=2)

        result = reducer.fit_transform(embeddings)

        assert result.reduced_embeddings.shape == (100, 2)
        assert result.variance_explained is not None
        assert result.method == "PCA"
        assert result.n_components == 2

    def test_method_name(self):
        reducer = PCAReducer()
        assert reducer.get_method_name() == "PCA"


class TestTSNEReducer:

    def test_fit_transform_small_dataset(self):
        embeddings = np.random.rand(30, 10)  # Small dataset for faster testing
        reducer = TSNEReducer(n_components=2)

        result = reducer.fit_transform(embeddings)

        assert result.reduced_embeddings.shape == (30, 2)
        assert result.variance_explained is None  # t-SNE doesn't provide this
        assert result.method == "t-SNE"
        assert result.n_components == 2

    def test_method_name(self):
        reducer = TSNEReducer()
        assert reducer.get_method_name() == "t-SNE"


class TestUMAPReducer:

    def test_fit_transform(self):
        embeddings = np.random.rand(50, 10)
        reducer = UMAPReducer(n_components=2)

        result = reducer.fit_transform(embeddings)

        assert result.reduced_embeddings.shape == (50, 2)
        assert result.variance_explained is None  # UMAP doesn't provide this
        assert result.method == "UMAP"
        assert result.n_components == 2

    def test_method_name(self):
        reducer = UMAPReducer()
        assert reducer.get_method_name() == "UMAP"


if __name__ == "__main__":
    pytest.main([__file__])
    pytest.main([__file__])