Add in-browser embedding generation
Some checks failed
Security Scan / security (pull_request) Successful in 44s
Security Scan / dependency-check (pull_request) Successful in 49s
Test Suite / lint (pull_request) Failing after 40s
Test Suite / test (3.11) (pull_request) Successful in 1m39s
Test Suite / build (pull_request) Has been skipped
Some checks failed
Security Scan / security (pull_request) Successful in 44s
Security Scan / dependency-check (pull_request) Successful in 49s
Test Suite / lint (pull_request) Failing after 40s
Test Suite / test (3.11) (pull_request) Successful in 1m39s
Test Suite / build (pull_request) Has been skipped
This commit is contained in:
@@ -8,7 +8,16 @@ from .ui.callbacks.interactions import InteractionCallbacks
|
||||
|
||||
|
||||
def create_app():
|
||||
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
||||
import os
|
||||
# Get the project root directory (two levels up from this file)
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
|
||||
assets_path = os.path.join(project_root, 'assets')
|
||||
|
||||
app = dash.Dash(
|
||||
__name__,
|
||||
external_stylesheets=[dbc.themes.BOOTSTRAP],
|
||||
assets_folder=assets_path
|
||||
)
|
||||
|
||||
# Allow callbacks to components that are dynamically created in tabs
|
||||
app.config.suppress_callback_exceptions = True
|
||||
@@ -20,9 +29,78 @@ def create_app():
|
||||
VisualizationCallbacks()
|
||||
InteractionCallbacks()
|
||||
|
||||
# Register client-side callback for embedding generation
|
||||
_register_client_side_callbacks(app)
|
||||
|
||||
return app
|
||||
|
||||
|
||||
def _register_client_side_callbacks(app):
    """Register client-side callbacks for browser-based processing.

    Registers a single ``app.clientside_callback`` triggered by clicks on
    ``generate-embeddings-btn``. The JavaScript body:

    * returns ``no_update`` when there is no click or the text is blank;
    * otherwise delegates to
      ``window.dash_clientside.transformers.generateEmbeddings`` when that
      function exists;
    * otherwise logs a diagnostic and returns a four-element array (error
      payload, status message, alert color, button-disabled flag) matching
      the four Outputs declared below.

    Args:
        app: The Dash application on which the callback is registered.
    """
    from dash import Input, Output, State

    # Client-side callback for embedding generation
    # NOTE(review): the JS function receives ``batchSize`` (from the
    # "batch-size" State) but does not forward it to ``generateEmbeddings`` --
    # confirm against the JS implementation whether that is intentional.
    app.clientside_callback(
        """
        function(nClicks, textContent, modelName, tokenizationMethod, batchSize, category, subcategory) {
            if (!nClicks || !textContent || !textContent.trim()) {
                return window.dash_clientside.no_update;
            }

            console.log('🔍 Checking for Transformers.js...');
            console.log('window.dash_clientside:', typeof window.dash_clientside);
            console.log('window.dash_clientside.transformers:', typeof window.dash_clientside?.transformers);
            console.log('generateEmbeddings function:', typeof window.dash_clientside?.transformers?.generateEmbeddings);

            if (typeof window.dash_clientside !== 'undefined' &&
                typeof window.dash_clientside.transformers !== 'undefined' &&
                typeof window.dash_clientside.transformers.generateEmbeddings === 'function') {

                console.log('✅ Calling Transformers.js generateEmbeddings...');
                return window.dash_clientside.transformers.generateEmbeddings(
                    nClicks, textContent, modelName, tokenizationMethod, category, subcategory
                );
            }

            // More detailed error information
            let errorMsg = '❌ Transformers.js not available. ';
            if (typeof window.dash_clientside === 'undefined') {
                errorMsg += 'dash_clientside not found.';
            } else if (typeof window.dash_clientside.transformers === 'undefined') {
                errorMsg += 'transformers module not found.';
            } else if (typeof window.dash_clientside.transformers.generateEmbeddings !== 'function') {
                errorMsg += 'generateEmbeddings function not found.';
            }

            console.error(errorMsg);

            return [
                { error: 'Transformers.js not loaded. Please refresh the page and try again.' },
                errorMsg + ' Please refresh the page.',
                'danger',
                false
            ];
        }
        """,
        # Outputs, in the same order as the array returned by the JS fallback.
        [
            Output("embeddings-generated-trigger", "data"),
            Output("text-input-status-immediate", "children"),
            Output("text-input-status-immediate", "color"),
            # allow_duplicate: this property is also written by other callbacks.
            Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
        ],
        # Sole trigger: the generate button.
        [Input("generate-embeddings-btn", "n_clicks")],
        # States map positionally onto the JS parameters after nClicks.
        [
            State("text-input-area", "value"),
            State("model-selection", "value"),
            State("tokenization-method", "value"),
            State("batch-size", "value"),
            State("text-category", "value"),
            State("text-subcategory", "value"),
        ],
        prevent_initial_call=True,
    )
|
||||
|
||||
|
||||
def run_app(app=None, debug=None, host=None, port=None):
|
||||
if app is None:
|
||||
app = create_app()
|
||||
|
@@ -79,6 +79,71 @@ class AppSettings:
|
||||
OPENSEARCH_CONNECTION_TIMEOUT = 30
|
||||
OPENSEARCH_VERIFY_CERTS = True
|
||||
|
||||
# Text Input / Transformers.js Configuration
|
||||
DEFAULT_EMBEDDING_MODEL = "Xenova/all-mpnet-base-v2"
|
||||
MAX_TEXT_LENGTH = 50000 # Characters (browser memory limits)
|
||||
DEFAULT_TOKENIZATION_METHOD = "sentence"
|
||||
MAX_BATCH_SIZE = 8 # Process in smaller batches for memory management
|
||||
|
||||
# Available Transformers.js compatible models
|
||||
AVAILABLE_MODELS = [
|
||||
{
|
||||
"name": "Xenova/all-mpnet-base-v2",
|
||||
"label": "All-MPNet-Base-v2 (Quality, 768d)",
|
||||
"description": "Higher quality embeddings with better semantic understanding",
|
||||
"dimensions": 768,
|
||||
"size": "109 MB",
|
||||
"context_length": 512,
|
||||
"multilingual": False,
|
||||
"default": True,
|
||||
},
|
||||
{
|
||||
"name": "Xenova/all-MiniLM-L6-v2",
|
||||
"label": "All-MiniLM-L6-v2 (Fast, 384d)",
|
||||
"description": "Lightweight model, good for quick testing and general purpose",
|
||||
"dimensions": 384,
|
||||
"size": "23 MB",
|
||||
"context_length": 512,
|
||||
"multilingual": False,
|
||||
"default": False,
|
||||
},
|
||||
{
|
||||
"name": "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
|
||||
"label": "Multilingual MiniLM (50+ languages)",
|
||||
"description": "Support for multiple languages with good performance",
|
||||
"dimensions": 384,
|
||||
"size": "127 MB",
|
||||
"context_length": 512,
|
||||
"multilingual": True,
|
||||
},
|
||||
{
|
||||
"name": "Xenova/bge-small-en-v1.5",
|
||||
"label": "BGE Small English (High quality, 384d)",
|
||||
"description": "Beijing Academy of AI model with excellent performance on retrieval tasks",
|
||||
"dimensions": 384,
|
||||
"size": "67 MB",
|
||||
"context_length": 512,
|
||||
"multilingual": False,
|
||||
},
|
||||
{
|
||||
"name": "Xenova/gte-small",
|
||||
"label": "GTE Small (General Text Embeddings, 384d)",
|
||||
"description": "Alibaba's general text embedding model, balanced performance",
|
||||
"dimensions": 384,
|
||||
"size": "67 MB",
|
||||
"context_length": 512,
|
||||
"multilingual": False,
|
||||
},
|
||||
]
|
||||
|
||||
# Browser compatibility requirements
|
||||
SUPPORTED_BROWSERS = {
|
||||
"chrome": ">=88",
|
||||
"firefox": ">=92",
|
||||
"safari": ">=15.4",
|
||||
"edge": ">=88",
|
||||
}
|
||||
|
||||
# Bootstrap Theme
|
||||
EXTERNAL_STYLESHEETS = [
|
||||
"https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
|
||||
|
@@ -63,6 +63,90 @@ class DataProcessor:
|
||||
except Exception as e:
|
||||
return ProcessedData(documents=[], embeddings=np.array([]), error=str(e))
|
||||
|
||||
def process_client_embeddings(self, embeddings_data: dict) -> ProcessedData:
    """Process embeddings data received from client-side JavaScript.

    Args:
        embeddings_data: Payload produced in the browser. Either
            ``{"error": <message>}`` or a mapping with parallel
            ``"documents"`` (list of dicts) and ``"embeddings"``
            (list of equal-length numeric vectors) entries.

    Returns:
        A ``ProcessedData`` whose ``documents`` and ``embeddings`` are
        row-aligned. On any failure the result carries empty
        ``documents``/``embeddings`` and a populated ``error`` message;
        this method does not raise.
    """

    def _failure(message):
        """Build the uniform error result used by every failure path."""
        return ProcessedData(documents=[], embeddings=np.array([]), error=message)

    try:
        if "error" in embeddings_data:
            return _failure(embeddings_data["error"])

        # Extract documents and embeddings from client data
        documents_data = embeddings_data.get("documents", [])
        embeddings_list = embeddings_data.get("embeddings", [])

        if not documents_data or not embeddings_list:
            return _failure("No documents or embeddings in client data")

        if len(documents_data) != len(embeddings_list):
            return _failure("Mismatch between number of documents and embeddings")

        # Convert embeddings to numpy array first
        try:
            embeddings = np.array(embeddings_list)
        except Exception as e:
            return _failure(f"Error processing embeddings: {str(e)}")

        if embeddings.ndim != 2:
            return _failure("Invalid embedding dimensions")

        # Convert to Document objects, remembering which source rows survive
        # so the returned embedding matrix stays aligned with the documents.
        documents = []
        kept_rows = []
        for i, raw_doc in enumerate(documents_data):
            try:
                # Work on a copy so the caller's payload is not mutated.
                doc_data = dict(raw_doc)

                # Ensure required fields are present
                if "id" not in doc_data or not doc_data["id"]:
                    doc_data["id"] = f"text_input_{i}"
                if "text" not in doc_data or not doc_data["text"].strip():
                    continue  # Skip documents without text

                doc_data["embedding"] = embeddings[i].tolist()
                documents.append(Document(**doc_data))
                kept_rows.append(i)
            except Exception:
                # Deliberate best-effort: skip invalid documents but keep
                # processing the rest.
                continue

        if not documents:
            return _failure("No valid documents found in client data")

        # Select exactly the rows of the documents that were kept. Slicing
        # the first len(documents) rows would pair documents with the wrong
        # vectors whenever an earlier document was skipped.
        valid_embeddings = embeddings[kept_rows]

        return ProcessedData(documents=documents, embeddings=valid_embeddings)

    except Exception as e:
        return _failure(str(e))
|
||||
|
||||
def _extract_embeddings(self, documents: List[Document]) -> np.ndarray:
|
||||
if not documents:
|
||||
return np.array([])
|
||||
|
@@ -1,4 +1,4 @@
|
||||
from dash import callback, Input, Output, State, no_update
|
||||
from dash import callback, Input, Output, State, no_update, html
|
||||
from ...data.processor import DataProcessor
|
||||
from ...data.sources.opensearch import OpenSearchClient
|
||||
from ...models.field_mapper import FieldMapper
|
||||
@@ -87,6 +87,8 @@ class DataProcessingCallbacks:
|
||||
|
||||
if active_tab == "opensearch-tab":
|
||||
return [datasource.create_opensearch_tab()]
|
||||
elif active_tab == "text-input-tab":
|
||||
return [datasource.create_text_input_tab()]
|
||||
else:
|
||||
return [datasource.create_file_upload_tab()]
|
||||
|
||||
@@ -97,6 +99,9 @@ class DataProcessingCallbacks:
|
||||
# Register collapsible section callbacks
|
||||
self._register_collapse_callbacks()
|
||||
|
||||
# Register text input callbacks
|
||||
self._register_text_input_callbacks()
|
||||
|
||||
def _register_opensearch_callbacks(self, section_type, opensearch_client):
|
||||
"""Register callbacks for a specific section (data or prompts)."""
|
||||
|
||||
@@ -463,6 +468,220 @@ class DataProcessingCallbacks:
|
||||
return new_state, icon_class
|
||||
return is_open, "fas fa-chevron-down me-2"
|
||||
|
||||
def _register_text_input_callbacks(self):
    """Register callbacks for text input functionality.

    Registers five server-side callbacks:

    * a character counter for ``text-input-area``;
    * enable/disable logic and help message for ``generate-embeddings-btn``;
    * the "Clear Text" / "Load Sample Text" button handler;
    * the model information panel;
    * processing of embeddings delivered through
      ``embeddings-generated-trigger``.
    """
    # Imported once at method scope so every nested callback can see these
    # names. AppSettings is read by both toggle_generate_button and
    # update_model_info.
    import dash_bootstrap_components as dbc

    from ...config.settings import AppSettings

    def _help_alert(icon, message, color):
        """Return an Alert made of a Font Awesome icon followed by *message*."""
        return dbc.Alert(
            [html.I(className=f"fas {icon} me-2"), message],
            color=color,
        )

    # Text length counter callback
    @callback(
        Output("text-length-counter", "children"),
        Input("text-input-area", "value"),
        prevent_initial_call=False,
    )
    def update_text_length_counter(text_value):
        if not text_value:
            return "0"
        return f"{len(text_value):,}"

    # Generate button enable/disable callback
    @callback(
        [
            Output("generate-embeddings-btn", "disabled"),
            Output("generation-help", "children"),
            Output("generation-help", "color"),
        ],
        [
            Input("text-input-area", "value"),
            Input("model-selection", "value"),
        ],
        prevent_initial_call=False,
    )
    def toggle_generate_button(text_value, model_name):
        # Each branch returns (button disabled?, help content, alert color).
        if not text_value or not text_value.strip():
            return (
                True,
                _help_alert(
                    "fa-info-circle",
                    "Enter some text above to enable embedding generation.",
                    "light",
                ),
                "light",
            )

        if not model_name:
            return (
                True,
                _help_alert(
                    "fa-exclamation-triangle",
                    "Select an embedding model to continue.",
                    "warning",
                ),
                "warning",
            )

        text_length = len(text_value.strip())
        if text_length > AppSettings.MAX_TEXT_LENGTH:
            return (
                True,
                _help_alert(
                    "fa-exclamation-triangle",
                    f"Text too long ({text_length:,} characters). Maximum allowed: {AppSettings.MAX_TEXT_LENGTH:,} characters.",
                    "danger",
                ),
                "danger",
            )

        return (
            False,
            _help_alert(
                "fa-check-circle",
                f"Ready to generate embeddings for {text_length:,} characters using {model_name}.",
                "success",
            ),
            "success",
        )

    # Clear text callback
    @callback(
        Output("text-input-area", "value"),
        [Input("clear-text-btn", "n_clicks"), Input("load-sample-btn", "n_clicks")],
        prevent_initial_call=True,
    )
    def handle_text_input_actions(clear_clicks, load_clicks):
        from dash import ctx

        if not ctx.triggered:
            return no_update

        # prop_id looks like "<component-id>.<property>"; keep the id part.
        button_id = ctx.triggered[0]['prop_id'].split('.')[0]

        if button_id == "clear-text-btn" and clear_clicks:
            return ""
        elif button_id == "load-sample-btn" and load_clicks:
            return self._load_sample_text()

        return no_update

    # Model info callback
    @callback(
        Output("model-info", "children"),
        Input("model-selection", "value"),
        prevent_initial_call=False,
    )
    def update_model_info(model_name):
        if not model_name:
            return html.Span("Please select a model", className="text-muted")

        settings = AppSettings()

        for model in settings.AVAILABLE_MODELS:
            if model["name"] == model_name:
                return html.Div(
                    [
                        html.Strong(
                            f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}"
                        ),
                        html.Br(),
                        html.Span(model["description"]),
                        html.Br(),
                        html.Small(
                            f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}",
                            className="text-muted",
                        ),
                    ]
                )

        return html.Span("Model information not available", className="text-muted")

    # Process client-side embeddings result callback
    @callback(
        [
            Output("processed-data", "data", allow_duplicate=True),
            Output("text-input-status", "children"),
            Output("text-input-status", "color"),
            Output("text-input-status", "style"),
            Output("generate-embeddings-btn", "disabled", allow_duplicate=True),
        ],
        [Input("embeddings-generated-trigger", "data")],
        prevent_initial_call=True,
    )
    def process_embeddings_result(embeddings_data):
        """Process embeddings generated client-side."""
        if not embeddings_data:
            return no_update, no_update, no_update, no_update, no_update

        processed_data = self.processor.process_client_embeddings(embeddings_data)

        if processed_data.error:
            return (
                {"error": processed_data.error},
                f"❌ Error: {processed_data.error}",
                "danger",
                {"display": "block"},
                False,
            )

        return (
            {
                "documents": [
                    self._document_to_dict(doc) for doc in processed_data.documents
                ],
                "embeddings": processed_data.embeddings.tolist(),
            },
            f"✅ Generated embeddings for {len(processed_data.documents)} text chunks",
            "success",
            {"display": "block"},
            False,
        )
|
||||
|
||||
def _load_sample_text(self):
    """Load sample text from assets/sample-txt.md file.

    Returns:
        The file's contents when it can be read; a built-in multi-topic
        sample when the file does not exist; a one-line placeholder when
        the file exists but cannot be read or decoded as UTF-8.
    """
    import os

    # Five dirname() calls: the first strips this file's name, the remaining
    # four climb to the directory expected to contain "assets".
    project_root = os.path.abspath(__file__)
    for _ in range(5):
        project_root = os.path.dirname(project_root)
    sample_file_path = os.path.join(project_root, 'assets', 'sample-txt.md')

    try:
        # Open directly instead of checking existence first: this avoids a
        # check-then-use race and a redundant filesystem call.
        with open(sample_file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        # Fallback sample text if file doesn't exist
        return """The sun peeked through the clouds after a drizzly morning.
A gentle breeze rustled the leaves as we walked along the shoreline.
Heavy rains caused flooding in several low-lying neighborhoods.
It was so hot that even the birds sought shade under the palm trees.
By midnight, the temperature had dropped below freezing.

The new smartphone features a foldable display and 5G connectivity.
In the world of AI, transformers have revolutionized natural language processing.
Quantum computing promises to solve problems beyond classical computers' reach.
Blockchain technology is being explored for secure voting systems.
Virtual reality headsets are becoming more affordable and accessible.

Preheat the oven to 375°F before you start mixing the batter.
She finely chopped the garlic and sautéed it in two tablespoons of olive oil.
A pinch of saffron adds a beautiful color and aroma to traditional paella.
If the soup is too salty, add a peeled potato to absorb excess sodium.
Let the bread dough rise for at least an hour in a warm, draft-free spot."""
    except (OSError, UnicodeDecodeError):
        # Return a simple fallback if the file is unreadable or not UTF-8
        return "This is sample text for testing embedding generation. You can replace this with your own text."
|
||||
|
||||
@staticmethod
|
||||
def _document_to_dict(doc):
|
||||
return {
|
||||
|
@@ -1,11 +1,13 @@
|
||||
from dash import dcc, html
|
||||
import dash_bootstrap_components as dbc
|
||||
from .upload import UploadComponent
|
||||
from .textinput import TextInputComponent
|
||||
|
||||
|
||||
class DataSourceComponent:
|
||||
def __init__(self):
|
||||
self.upload_component = UploadComponent()
|
||||
self.text_input_component = TextInputComponent()
|
||||
|
||||
def create_tabbed_interface(self):
|
||||
"""Create tabbed interface for different data sources."""
|
||||
@@ -17,6 +19,7 @@ class DataSourceComponent:
|
||||
[
|
||||
dbc.Tab(label="File Upload", tab_id="file-tab"),
|
||||
dbc.Tab(label="OpenSearch", tab_id="opensearch-tab"),
|
||||
dbc.Tab(label="Text Input", tab_id="text-input-tab"),
|
||||
],
|
||||
id="data-source-tabs",
|
||||
active_tab="file-tab",
|
||||
@@ -208,6 +211,10 @@ class DataSourceComponent:
|
||||
]
|
||||
)
|
||||
|
||||
def create_text_input_tab(self):
    """Build the text-input tab: a Div wrapping the text input component's interface."""
    text_input_interface = self.text_input_component.create_text_input_interface()
    return html.Div([text_input_interface])
|
||||
|
||||
def _create_opensearch_section(self, section_type):
|
||||
"""Create a complete OpenSearch section for either 'data' or 'prompts'."""
|
||||
section_id = section_type # 'data' or 'prompts'
|
||||
|
402
src/embeddingbuddy/ui/components/textinput.py
Normal file
402
src/embeddingbuddy/ui/components/textinput.py
Normal file
@@ -0,0 +1,402 @@
|
||||
"""Text input component for generating embeddings from user text."""
|
||||
|
||||
import dash_bootstrap_components as dbc
|
||||
from dash import dcc, html
|
||||
|
||||
from embeddingbuddy.config.settings import AppSettings
|
||||
|
||||
|
||||
class TextInputComponent:
    """Component for text input and embedding generation.

    Builds the Dash layout for the text-input workflow. The layout methods
    only construct components; component ids such as "model-selection",
    "text-input-area" and "generate-embeddings-btn" are what callbacks bind to.
    """

    def __init__(self):
        # Settings supply the model list and default/limit values used below.
        self.settings = AppSettings()

    def create_text_input_interface(self):
        """Create the complete text input interface with model selection and processing options."""
        return html.Div(
            [
                # Model selection section
                self._create_model_selection(),
                html.Hr(),
                # Text input section
                self._create_text_input_area(),
                # Text action buttons
                self._create_text_action_buttons(),
                html.Hr(),
                # Processing options
                self._create_processing_options(),
                html.Hr(),
                # Generation controls
                self._create_generation_controls(),
                html.Hr(),
                # Progress indicators
                self._create_progress_indicators(),
                html.Hr(),
                # Status and results
                self._create_status_section(),
                # Hidden components for data flow
                self._create_hidden_components(),
            ],
            className="p-3",
        )

    def _create_model_selection(self):
        """Create model selection dropdown with descriptions."""
        model_options = []
        for model in self.settings.AVAILABLE_MODELS:
            label = f"{model['label']} - {model['size']}"
            # Only some model entries carry a "default" key, hence .get().
            if model.get("default", False):
                label += " (Recommended)"

            model_options.append({"label": label, "value": model["name"]})

        return html.Div(
            [
                html.H5("Embedding Model", className="mb-3"),
                html.Div(
                    [
                        dcc.Dropdown(
                            id="model-selection",
                            options=model_options,
                            value=self.settings.DEFAULT_EMBEDDING_MODEL,
                            placeholder="Select an embedding model...",
                            className="mb-2",
                        ),
                        dbc.Alert(
                            [
                                html.Div(
                                    id="model-info",
                                    children=self._get_model_description(
                                        self.settings.DEFAULT_EMBEDDING_MODEL
                                    ),
                                )
                            ],
                            color="info",
                            className="small",
                        ),
                    ]
                ),
            ]
        )

    def _create_text_input_area(self):
        """Create text input textarea with character limits."""
        return html.Div(
            [
                html.H5("Text Input", className="mb-3"),
                dcc.Textarea(
                    id="text-input-area",
                    placeholder="Paste your text here... Each sentence, paragraph, or line will become a separate data point depending on your tokenization method below.",
                    value="",
                    # Dash style dicts use camelCased CSS property names.
                    style={
                        "width": "100%",
                        "height": "300px",
                        "resize": "vertical",
                        "fontFamily": "monospace",
                        "fontSize": "14px",
                    },
                    maxLength=self.settings.MAX_TEXT_LENGTH,
                    className="form-control",
                ),
                html.Small(
                    f"Maximum {self.settings.MAX_TEXT_LENGTH:,} characters. Current: ",
                    className="text-muted",
                ),
                html.Small(
                    id="text-length-counter",
                    children="0",
                    className="text-muted fw-bold",
                ),
                html.Small(" characters", className="text-muted"),
            ]
        )

    def _create_text_action_buttons(self):
        """Create action buttons for text input (Load Sample, Clear)."""
        return html.Div(
            [
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-file-text me-2"),
                                        "Load Sample Text",
                                    ],
                                    id="load-sample-btn",
                                    color="info",
                                    size="sm",
                                    className="w-100",
                                )
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-trash me-2"),
                                        "Clear Text",
                                    ],
                                    id="clear-text-btn",
                                    # Outline variant is requested through the
                                    # `outline` flag rather than a composite
                                    # color name.
                                    color="secondary",
                                    outline=True,
                                    size="sm",
                                    className="w-100",
                                )
                            ],
                            md=6,
                        ),
                    ],
                    className="mt-2 mb-3",
                )
            ]
        )

    def _create_processing_options(self):
        """Create tokenization and metadata options."""
        return html.Div(
            [
                html.H5("Processing Options", className="mb-3"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                html.Label(
                                    "Text Splitting Method:", className="form-label"
                                ),
                                dcc.Dropdown(
                                    id="tokenization-method",
                                    options=[
                                        {
                                            "label": "Sentences (split on . ! ?)",
                                            "value": "sentence",
                                        },
                                        {
                                            "label": "Paragraphs (split on double newline)",
                                            "value": "paragraph",
                                        },
                                        {
                                            "label": "Lines (split on single newline)",
                                            "value": "manual",
                                        },
                                        {
                                            "label": "Entire text as one document",
                                            "value": "whole",
                                        },
                                    ],
                                    value=self.settings.DEFAULT_TOKENIZATION_METHOD,
                                    className="mb-3",
                                ),
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                html.Label("Batch Size:", className="form-label"),
                                dcc.Dropdown(
                                    id="batch-size",
                                    options=[
                                        {
                                            "label": "Small batches (4) - Lower memory",
                                            "value": 4,
                                        },
                                        {
                                            "label": "Medium batches (8) - Balanced",
                                            "value": 8,
                                        },
                                        {
                                            "label": "Large batches (16) - Faster",
                                            "value": 16,
                                        },
                                    ],
                                    value=self.settings.MAX_BATCH_SIZE,
                                    className="mb-3",
                                ),
                            ],
                            md=6,
                        ),
                    ]
                ),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                html.Label(
                                    "Category (Optional):", className="form-label"
                                ),
                                dcc.Input(
                                    id="text-category",
                                    type="text",
                                    placeholder="e.g., Notes, Articles, Ideas...",
                                    value="Text Input",
                                    className="form-control mb-3",
                                ),
                            ],
                            md=6,
                        ),
                        dbc.Col(
                            [
                                html.Label(
                                    "Subcategory (Optional):", className="form-label"
                                ),
                                dcc.Input(
                                    id="text-subcategory",
                                    type="text",
                                    placeholder="e.g., Meeting Notes, Research...",
                                    value="Generated",
                                    className="form-control mb-3",
                                ),
                            ],
                            md=6,
                        ),
                    ]
                ),
            ]
        )

    def _create_generation_controls(self):
        """Create embedding generation button and controls."""
        return html.Div(
            [
                html.H5("Generate Embeddings", className="mb-3"),
                dbc.Row(
                    [
                        dbc.Col(
                            [
                                dbc.Button(
                                    [
                                        html.I(className="fas fa-magic me-2"),
                                        "Generate Embeddings",
                                    ],
                                    id="generate-embeddings-btn",
                                    color="primary",
                                    size="lg",
                                    # Starts disabled; there is no text yet.
                                    disabled=True,
                                    className="w-100",
                                )
                            ],
                            md=12,
                        ),
                    ]
                ),
                html.Div(
                    [
                        dbc.Alert(
                            [
                                html.I(className="fas fa-info-circle me-2"),
                                "Enter some text above and select a model to enable embedding generation.",
                            ],
                            color="light",
                            className="mt-3",
                            id="generation-help",
                        )
                    ]
                ),
            ]
        )

    def _create_progress_indicators(self):
        """Create progress bars for model loading and embedding generation."""
        return html.Div(
            [
                # Model loading progress (hidden initially)
                html.Div(
                    [
                        html.H6("Model Loading Progress", className="mb-2"),
                        dbc.Progress(
                            id="model-loading-progress",
                            value=0,
                            striped=True,
                            animated=True,
                            className="mb-2",
                        ),
                        html.Small(
                            id="model-loading-status",
                            children="No model loading in progress",
                            className="text-muted",
                        ),
                    ],
                    id="model-loading-section",
                    style={"display": "none"},
                ),
                html.Br(),
                # Embedding generation progress (hidden initially)
                html.Div(
                    [
                        html.H6("Embedding Generation Progress", className="mb-2"),
                        dbc.Progress(
                            id="embedding-progress",
                            value=0,
                            striped=True,
                            animated=True,
                            className="mb-2",
                        ),
                        html.Small(
                            id="embedding-status",
                            children="No embedding generation in progress",
                            className="text-muted",
                        ),
                    ],
                    id="embedding-progress-section",
                    style={"display": "none"},
                ),
            ]
        )

    def _create_status_section(self):
        """Create status alerts and results preview."""
        return html.Div(
            [
                # Immediate status (from client-side)
                dbc.Alert(
                    id="text-input-status-immediate",
                    children="Ready to generate embeddings",
                    color="light",
                    className="mb-3",
                ),
                # Server-side status (hidden initially)
                dbc.Alert(
                    id="text-input-status",
                    children="",
                    color="light",
                    className="mb-3",
                    style={"display": "none"},
                ),
                # Results preview
                html.Div(id="embedding-results-preview"),
            ]
        )

    def _create_hidden_components(self):
        """Create hidden components for data flow."""
        return html.Div(
            [
                # Store for embeddings data from client-side
                dcc.Store(id="embeddings-generated-trigger"),
                # Store for tokenization preview
                dcc.Store(id="tokenization-preview-data"),
            ]
        )

    def _get_model_description(self, model_name):
        """Get description for a specific model.

        Returns a Div summarising the matching entry of
        ``settings.AVAILABLE_MODELS``, or a muted placeholder Span when no
        entry has the given name.
        """
        for model in self.settings.AVAILABLE_MODELS:
            if model["name"] == model_name:
                return html.Div(
                    [
                        html.Strong(
                            f"Dimensions: {model['dimensions']} | Context Length: {model['context_length']}"
                        ),
                        html.Br(),
                        html.Span(model["description"]),
                        html.Br(),
                        html.Small(
                            f"Multilingual: {'Yes' if model.get('multilingual', False) else 'No'} | Size: {model['size']}",
                            className="text-muted",
                        ),
                    ]
                )

        return html.Span("Model information not available", className="text-muted")
|
@@ -20,6 +20,15 @@ class AppLayout:
|
||||
dbc.Col(
|
||||
[
|
||||
html.H1("EmbeddingBuddy", className="text-center mb-4"),
|
||||
# Load Transformers.js from CDN
|
||||
html.Script(
|
||||
"""
|
||||
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';
|
||||
window.transformersPipeline = pipeline;
|
||||
console.log('✅ Transformers.js pipeline loaded globally');
|
||||
""",
|
||||
type="module"
|
||||
),
|
||||
],
|
||||
width=12,
|
||||
)
|
||||
|
Reference in New Issue
Block a user