Add in-browser embedding generation #4

Merged
godber merged 2 commits from add-browser-embeddings into main 2025-09-07 07:26:57 -07:00
14 changed files with 1598 additions and 4 deletions
Showing only changes of commit bced5e07ce - Show all commits

View File

@@ -3,7 +3,8 @@
"allow": [ "allow": [
"Bash(mkdir:*)", "Bash(mkdir:*)",
"Bash(uv run:*)", "Bash(uv run:*)",
"Bash(uv add:*)" "Bash(uv add:*)",
"Bash(uv sync:*)"
], ],
"deny": [], "deny": [],
"ask": [], "ask": [],

View File

@@ -9,14 +9,13 @@ from .ui.callbacks.interactions import InteractionCallbacks
def create_app(): def create_app():
import os import os
# Get the project root directory (two levels up from this file) # Get the project root directory (two levels up from this file)
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
assets_path = os.path.join(project_root, 'assets') assets_path = os.path.join(project_root, "assets")
app = dash.Dash( app = dash.Dash(
__name__, __name__, external_stylesheets=[dbc.themes.BOOTSTRAP], assets_folder=assets_path
external_stylesheets=[dbc.themes.BOOTSTRAP],
assets_folder=assets_path
) )
# Allow callbacks to components that are dynamically created in tabs # Allow callbacks to components that are dynamically created in tabs

View File

@@ -562,7 +562,7 @@ class DataProcessingCallbacks:
if not ctx.triggered: if not ctx.triggered:
return no_update return no_update
button_id = ctx.triggered[0]['prop_id'].split('.')[0] button_id = ctx.triggered[0]["prop_id"].split(".")[0]
if button_id == "clear-text-btn" and clear_clicks: if button_id == "clear-text-btn" and clear_clicks:
return "" return ""
@@ -652,11 +652,15 @@ class DataProcessingCallbacks:
try: try:
# Get the project root directory (four levels up from this file) # Get the project root directory (four levels up from this file)
current_file = os.path.abspath(__file__) current_file = os.path.abspath(__file__)
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))) project_root = os.path.dirname(
sample_file_path = os.path.join(project_root, 'assets', 'sample-txt.md') os.path.dirname(
os.path.dirname(os.path.dirname(os.path.dirname(current_file)))
)
)
sample_file_path = os.path.join(project_root, "assets", "sample-txt.md")
if os.path.exists(sample_file_path): if os.path.exists(sample_file_path):
with open(sample_file_path, 'r', encoding='utf-8') as file: with open(sample_file_path, "r", encoding="utf-8") as file:
return file.read() return file.read()
else: else:
# Fallback sample text if file doesn't exist # Fallback sample text if file doesn't exist
@@ -678,7 +682,7 @@ A pinch of saffron adds a beautiful color and aroma to traditional paella.
If the soup is too salty, add a peeled potato to absorb excess sodium. If the soup is too salty, add a peeled potato to absorb excess sodium.
Let the bread dough rise for at least an hour in a warm, draft-free spot.""" Let the bread dough rise for at least an hour in a warm, draft-free spot."""
except Exception as e: except Exception:
# Return a simple fallback if there's any error # Return a simple fallback if there's any error
return "This is sample text for testing embedding generation. You can replace this with your own text." return "This is sample text for testing embedding generation. You can replace this with your own text."

View File

@@ -27,7 +27,7 @@ class AppLayout:
window.transformersPipeline = pipeline; window.transformersPipeline = pipeline;
console.log('✅ Transformers.js pipeline loaded globally'); console.log('✅ Transformers.js pipeline loaded globally');
""", """,
type="module" type="module",
), ),
], ],
width=12, width=12,

View File

@@ -1,6 +1,5 @@
"""Tests for client-side embedding processing functionality.""" """Tests for client-side embedding processing functionality."""
import pytest
import numpy as np import numpy as np
from src.embeddingbuddy.data.processor import DataProcessor from src.embeddingbuddy.data.processor import DataProcessor
@@ -23,20 +22,17 @@ class TestClientEmbeddingsProcessing:
"text": "First test document", "text": "First test document",
"category": "Text Input", "category": "Text Input",
"subcategory": "Generated", "subcategory": "Generated",
"tags": [] "tags": [],
}, },
{ {
"id": "text_input_1", "id": "text_input_1",
"text": "Second test document", "text": "Second test document",
"category": "Text Input", "category": "Text Input",
"subcategory": "Generated", "subcategory": "Generated",
"tags": [] "tags": [],
} },
], ],
"embeddings": [ "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
[0.1, 0.2, 0.3, 0.4],
[0.5, 0.6, 0.7, 0.8]
]
} }
result = self.processor.process_client_embeddings(client_data) result = self.processor.process_client_embeddings(client_data)
@@ -79,12 +75,15 @@ class TestClientEmbeddingsProcessing:
"""Test processing with mismatched document and embedding counts.""" """Test processing with mismatched document and embedding counts."""
client_data = { client_data = {
"documents": [ "documents": [
{"id": "test", "text": "Test document", "category": "Test", "subcategory": "Test", "tags": []} {
"id": "test",
"text": "Test document",
"category": "Test",
"subcategory": "Test",
"tags": [],
}
], ],
"embeddings": [ "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
[0.1, 0.2, 0.3, 0.4],
[0.5, 0.6, 0.7, 0.8]
]
} }
result = self.processor.process_client_embeddings(client_data) result = self.processor.process_client_embeddings(client_data)
@@ -98,12 +97,15 @@ class TestClientEmbeddingsProcessing:
client_data = { client_data = {
"documents": [ "documents": [
{"text": ""}, # Empty text should be skipped {"text": ""}, # Empty text should be skipped
{"id": "test2", "text": "Valid document", "category": "Test", "subcategory": "Test", "tags": []} {
"id": "test2",
"text": "Valid document",
"category": "Test",
"subcategory": "Test",
"tags": [],
},
], ],
"embeddings": [ "embeddings": [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]],
[0.1, 0.2, 0.3, 0.4],
[0.5, 0.6, 0.7, 0.8]
]
} }
result = self.processor.process_client_embeddings(client_data) result = self.processor.process_client_embeddings(client_data)
@@ -117,11 +119,14 @@ class TestClientEmbeddingsProcessing:
"""Test automatic ID generation for documents without IDs.""" """Test automatic ID generation for documents without IDs."""
client_data = { client_data = {
"documents": [ "documents": [
{"text": "Document without ID", "category": "Test", "subcategory": "Test", "tags": []} {
"text": "Document without ID",
"category": "Test",
"subcategory": "Test",
"tags": [],
}
], ],
"embeddings": [ "embeddings": [[0.1, 0.2, 0.3, 0.4]],
[0.1, 0.2, 0.3, 0.4]
]
} }
result = self.processor.process_client_embeddings(client_data) result = self.processor.process_client_embeddings(client_data)
@@ -135,9 +140,15 @@ class TestClientEmbeddingsProcessing:
"""Test processing with invalid embedding format.""" """Test processing with invalid embedding format."""
client_data = { client_data = {
"documents": [ "documents": [
{"id": "test", "text": "Test document", "category": "Test", "subcategory": "Test", "tags": []} {
"id": "test",
"text": "Test document",
"category": "Test",
"subcategory": "Test",
"tags": [],
}
], ],
"embeddings": 0.5 # Scalar instead of array "embeddings": 0.5, # Scalar instead of array
} }
result = self.processor.process_client_embeddings(client_data) result = self.processor.process_client_embeddings(client_data)