From 9582a0da5e9b2896425d1f0229d15ab21e220a25 Mon Sep 17 00:00:00 2001 From: Joao Henrique Machado Silva Date: Fri, 8 May 2026 17:03:38 +0200 Subject: [PATCH] feat(engine): HNSW probe widened to cosine + dot via per-index metric (SQLR-28) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 7d.2's `try_hnsw_probe` was L2-only, so any KNN query using `vec_distance_cosine` or `vec_distance_dot` silently fell through to brute-force even with an HNSW index attached. Surfaced by the SQLR-23 v2 W10 bench: the HNSW variant clocked ~181 ms vs ~129 ms for brute-force because the cosine hot loop never touched the graph. Lands as sub-phase 7d.4 — per-index distance metric, no file format bump (the metric round-trips via the synthesized CREATE INDEX SQL in `sqlrite_master`): - New SQL surface: `CREATE INDEX … USING hnsw (col) WITH (metric = '')`. Omitting the WITH clause defaults to L2, so pre-SQLR-28 catalogs round-trip byte-identical. Typo'd metric names error at CREATE INDEX time rather than silently defaulting to L2 — that silent fallback is exactly what we're fixing. - New `SqlriteDialect` (wraps sqlparser's `SQLiteDialect`, only override is `supports_create_index_with_clause = true`). - `HnswIndexEntry` grows a `metric: DistanceMetric` field; the load, rebuild, and dirty-rebuild paths all consume the per-entry metric instead of hard-coded L2. - `try_hnsw_probe` widens to all three `vec_distance_*` functions and only fires when the index entry's metric matches the query function. Mismatch → brute-force fallback (correct, just slow). - W10 bench bumped to v3; the HNSW variant creates the index `WITH (metric = 'cosine')`. v1/v2 numbers are not comparable. - Tests: cosine + dot self-query through the optimizer, metric-mismatch fallback, unknown-metric rejection, WITH-on-btree rejection, save+reopen preserves cosine metric. Unblocks SQLR-25 (republish v2/v3 bench numbers). Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- benchmarks/README.md | 12 +- benchmarks/src/workloads/vector.rs | 32 ++++-- docs/phase-7-plan.md | 5 +- docs/supported-sql.md | 15 ++- src/connection.rs | 156 ++++++++++++++++++++++++- src/sql/db/database.rs | 6 +- src/sql/db/table.rs | 24 ++-- src/sql/dialect.rs | 100 ++++++++++++++++ src/sql/executor.rs | 161 ++++++++++++++++++++++---- src/sql/hnsw.rs | 34 ++++++ src/sql/mod.rs | 10 +- src/sql/pager/mod.rs | 177 +++++++++++++++++++++++------ src/sql/params.rs | 4 +- src/sql/parser/create.rs | 2 +- 15 files changed, 643 insertions(+), 97 deletions(-) create mode 100644 src/sql/dialect.rs diff --git a/README.md b/README.md index f96300a..ff3ad52 100644 --- a/README.md +++ b/README.md @@ -305,7 +305,7 @@ Lockstep versioning — one dispatch bumps every product to the same `vX.Y.Z`. T - [x] **7a — `VECTOR(N)` column type** *(v0.1.10)*: dense f32 vectors with bracket-array literal syntax (`[0.1, 0.2, ...]`); file format bumped to v4 - [x] **7b — Distance functions** *(v0.1.11)*: `vec_distance_l2/cosine/dot` + `ORDER BY LIMIT k` so KNN queries work end-to-end - [x] **7c — Bounded-heap top-k optimization** *(v0.1.12)* -- [x] **7d — HNSW ANN index** *(v0.1.13–15)*: `CREATE INDEX … USING hnsw (col)`; recall@10 ≥ 0.95 at default `M=16, ef_construction=200, ef_search=50`; persisted as a `KIND_HNSW` cell tree +- [x] **7d — HNSW ANN index** *(v0.1.13–15, +SQLR-28)*: `CREATE INDEX … USING hnsw (col) [WITH (metric = '')]`; recall@10 ≥ 0.95 at default `M=16, ef_construction=200, ef_search=50`; persisted as a `KIND_HNSW` cell tree, with the metric round-tripping through the synthesized `sqlrite_master` SQL - [x] **7e — JSON column type + path queries** *(v0.1.16)*: `JSON` / `JSONB` columns stored as canonical text; `json_extract` / `json_type` / `json_array_length` / `json_object_keys`; `$.key`, `[N]`, chained JSONPath subset - [x] **7g.1 — `sqlrite-ask` crate** *(v0.1.18)*: foundational natural-language → SQL via the [Anthropic API](https://docs.anthropic.com/) (Sonnet 4.6 by default), prompt-cached schema dump, sync `ureq` HTTP. - [x] **7g.2 — REPL `.ask` + dep-direction flip** *(v0.1.19)*: `.ask ` meta-command with `Run? [Y/n]` confirmation. The wiring required dropping the engine dep from `sqlrite-ask` (cargo cycle) — `sqlrite-ask` is now pure over `&str` schemas; the `Connection`/`Database` integration moved to the engine's new `ask` feature. Public surface for callers: `use sqlrite::{Connection, ConnectionAskExt}`. diff --git a/benchmarks/README.md b/benchmarks/README.md index 38a2563..1892ff0 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -170,14 +170,20 @@ Even at 10k scale, the gap is large: ### W10 — vector top-10 (cosine), brute-force vs HNSW -10k 384-dim vectors. Two variants per the plan: brute-force (no index) and HNSW (`CREATE INDEX … USING hnsw`). SQLRite-only — `sqlite-vec` extension wiring is a follow-up (`rusqlite[bundled]` doesn't ship it; loading a pre-compiled `.dylib` at runtime is non-trivial and was out of scope for v1). +10k 384-dim vectors. Two variants per the plan: brute-force (no index) and HNSW (`CREATE INDEX … USING hnsw (embedding) WITH (metric = 'cosine')`, per SQLR-28). SQLRite-only — `sqlite-vec` extension wiring is a follow-up (`rusqlite[bundled]` doesn't ship it; loading a pre-compiled `.dylib` at runtime is non-trivial and was out of scope for v1). -| Variant | SQLRite median | Throughput | +The **W10.v1** numbers below were taken before SQLR-23 (parser-bound) and SQLR-28 (HNSW probe was L2-only, so the HNSW variant silently fell through to brute-force on cosine queries) — they are retained for historical context only. **W10.v3** ships with the cosine-built index + cosine-aware optimizer probe; republish under SQLR-25. + +| Variant | SQLRite median (v1, retired) | Throughput | |---|---|---| | brute-force | ~122 ms | ~8 ops/s | | hnsw | ~132 ms | ~7 ops/s | -**Read this as:** at 10k vectors × 384 dim, **HNSW barely beats brute-force**. That's not the index's fault — both numbers are dominated by the **per-iter SQL parse cost** (the 384-element bracket-array literal in the `ORDER BY` clause is ~4 KB of SQL the parser walks every iteration; the actual cosine work is ~3.8M FP ops ≈ a few ms). At a much larger corpus (millions of vectors) HNSW would dominate; at 10k the parser cost masks the algorithmic win. A future "prepare-vector-query-once" path or VECTOR-bind binding would surface the real HNSW vs brute-force gap. +**Read this as (v1, retired):** at 10k vectors × 384 dim, **HNSW barely beats brute-force**. Two reasons compounded: +1. **Per-iter SQL parse cost** — the 384-element bracket-array literal in the `ORDER BY` clause was ~4 KB of SQL the parser walked every iteration. Fixed in SQLR-23 (`Value::Vector` bind). +2. **Cosine queries silently brute-forced on the HNSW path** — the optimizer's `try_hnsw_probe` was L2-only; cosine queries never hit the graph. Fixed in SQLR-28 (per-index metric + matching probe). + +W10.v3 measures the *actual* HNSW-vs-brute-force gap with both fixes in place. ### W11 — BM25 top-10 diff --git a/benchmarks/src/workloads/vector.rs b/benchmarks/src/workloads/vector.rs index 05b8302..02b6ce2 100644 --- a/benchmarks/src/workloads/vector.rs +++ b/benchmarks/src/workloads/vector.rs @@ -3,8 +3,10 @@ //! ```sql //! CREATE TABLE vecs (id INTEGER PRIMARY KEY, embedding VECTOR(384)); //! -- 10k 384-dim vectors, deterministic per-id. -//! -- HNSW variant adds: -//! CREATE INDEX vecs_hnsw ON vecs USING hnsw (embedding); +//! -- HNSW variant adds (SQLR-28: cosine-built index, matched to the +//! -- query's vec_distance_cosine): +//! CREATE INDEX vecs_hnsw ON vecs USING hnsw (embedding) +//! WITH (metric = 'cosine'); //! //! -- Hot loop: //! SELECT id FROM vecs @@ -12,12 +14,12 @@ //! LIMIT 10; //! ``` //! -//! Two criterion groups land per driver: `W10.v1/brute-force` (no HNSW +//! Two criterion groups land per driver: `W10.v3/brute-force` (no HNSW //! index — every probe full-scans + bounded-heap top-k) and -//! `W10.v1/hnsw` (with the HNSW index, optimizer probes the graph -//! per [`docs/supported-sql.md`](../../docs/supported-sql.md) "HNSW -//! indexes"). The gap between the two is the headline number for -//! "did Phase 7d's ANN actually deliver?" +//! `W10.v3/hnsw` (with the cosine-built HNSW index, optimizer probes +//! the graph per [`docs/supported-sql.md`](../../docs/supported-sql.md) +//! "HNSW indexes"). The gap between the two is the headline number +//! for "did Phase 7d's ANN actually deliver?" //! //! ## Comparator //! @@ -40,10 +42,18 @@ use crate::{Driver, Value, WorkloadId}; /// inlined as a 4 KB bracket-array literal in the SQL string. The /// brute-force-vs-HNSW gap should widen materially because the /// per-iter parser cost no longer dominates. +/// +/// SQLR-28 — bumped again to v3: the HNSW variant now creates the +/// index `WITH (metric = 'cosine')`, matching the hot-loop SQL's +/// `vec_distance_cosine`. v1/v2 used the optimizer's L2-only probe, +/// which silently fell through to brute-force on a cosine query — +/// the HNSW variant was never actually exercising the graph. Numbers +/// from before v3 are not comparable to v3 numbers and have been +/// retired. pub const W10: WorkloadId = WorkloadId { id: "W10", name: "vector-top10", - version: "v2", + version: "v3", }; /// `(label, with_hnsw_index)` — two variants per driver. @@ -71,9 +81,13 @@ pub fn setup( let dataset = vector_dataset(); insert_rows(driver, &mut conn, &dataset)?; if with_hnsw { + // SQLR-28: build the graph for cosine — matches the hot-loop + // SQL's vec_distance_cosine. Without the metric clause the + // index defaults to L2 and the optimizer's metric gate falls + // through to brute-force, which is exactly the bug v3 fixes. driver.execute( &mut conn, - "CREATE INDEX vecs_hnsw ON vecs USING hnsw (embedding)", + "CREATE INDEX vecs_hnsw ON vecs USING hnsw (embedding) WITH (metric = 'cosine')", )?; } Ok((conn, dataset)) diff --git a/docs/phase-7-plan.md b/docs/phase-7-plan.md index 1409bf2..1d01c4b 100644 --- a/docs/phase-7-plan.md +++ b/docs/phase-7-plan.md @@ -163,6 +163,7 @@ SELECT id, title FROM docs ORDER BY embedding <-> [0.1, ...] LIMIT 10; > - **✅ 7d.1 — Pure HNSW algorithm** *(~700 LOC, shipped in v0.1.13).* `src/sql/hnsw.rs` standalone module: insert + search + layer assignment + beam search per layer + L2/cosine/dot distance dispatch. No SQL integration yet — vectors are passed in via a `get_vec` closure so the algorithm doesn't depend on table types. Tests verify recall@k ≥ 0.95 vs brute-force on randomly-generated vector sets; deterministic via a fixed RNG seed. > - **✅ 7d.2 — SQL integration** *(~500 LOC).* `CREATE INDEX … USING hnsw (col)` parser + engine, INSERT wiring (also calls `hnsw.insert()` incrementally), query optimizer hook (recognizes `ORDER BY vec_distance_l2(col, literal) LIMIT k` and probes the HNSW instead of full-scanning). HNSW lives in memory only at this point; the **CREATE INDEX SQL persists in `sqlrite_master` and reopen rebuilds the graph from current rows** — partial persistence ahead of 7d.3. DELETE/UPDATE on HNSW-indexed tables refused with helpful error pointing at 7d.3. > - **✅ 7d.3 — Persistence** *(~600 LOC).* New `KIND_HNSW` cell tag and `HnswNodeCell` encoding (varint node_id + per-layer neighbor lists). Each HNSW index gets its own page tree parallel to secondary indexes. Open path loads cells directly into `HnswIndex::from_persisted_nodes` — no algorithm runs, exact bit-for-bit reproduction. Also unblocks DELETE / UPDATE on HNSW-indexed tables: those mark the index `needs_rebuild`, save rebuilds from current rows before staging. ~2× the original 300-LOC estimate because the cell encoding + tests + rebuild path together added more than expected. +> - **✅ 7d.4 (SQLR-28) — Per-index distance metric.** Q2's "deferred per-index metric knob" lands as `CREATE INDEX … USING hnsw (col) WITH (metric = '')`. The metric is stored on `HnswIndexEntry` and round-tripped via the synthesized CREATE INDEX SQL in `sqlrite_master` (no file-format bump — pre-SQLR-28 rows omit the WITH clause and decode as L2). The optimizer's `try_hnsw_probe` widens to all three `vec_distance_*` functions but only fires when the query function matches the index's metric; mismatches fall through to brute-force. Surfaced by the SQLR-23 v2 bench: W10 uses cosine, the optimizer was L2-only, and the HNSW variant had been silently brute-forcing the entire time. SQLR-25 (republish v2 numbers) was the gating consumer. > > Each 7d.x ships as its own PR + release wave. The user-facing value lands at 7d.2; 7d.3 closes the persistence loop. 7d.1 is foundational but ships a tested algorithmic primitive on its own — useful as documentation of the engine's "from scratch" theme. @@ -368,12 +369,12 @@ Q1–Q10 were resolved by the project owner on 2026-04-26. Each question keeps i ### Q2. HNSW parameters: fixed defaults or per-index configurable? -> **Decided: fixed defaults** (`M=16, ef_construction=200, ef_search=50`). +> **Decided: fixed defaults** (`M=16, ef_construction=200, ef_search=50`) for the algorithmic knobs. **Distance metric** *did* land as a per-index `WITH (metric = '')` clause in **SQLR-28 / sub-phase 7d.4** — see the 7d split note above. Was deferred from the original 7d.2 cut; surfaced as a gap by the SQLR-23 v2 bench, where W10's cosine query had been silently brute-forcing because the optimizer hook was L2-only. - **Fixed:** `M=16, ef_construction=200, ef_search=50`. Simpler API, less to test. Matches sqlite-vec's defaults. - **Configurable:** `CREATE INDEX … USING hnsw (col) WITH (m=32, ef_construction=400)`. Power-user knobs, more code, more test matrix. -**Recommendation:** fixed defaults for MVP. Configurable can land as a follow-up if anyone actually asks. +**Recommendation:** fixed defaults for MVP. Configurable can land as a follow-up if anyone actually asks. (`metric` already came back as a follow-up; `m` / `ef_*` haven't been requested yet.) ### Q3. JSON storage format diff --git a/docs/supported-sql.md b/docs/supported-sql.md index 4d9d9cc..54beadf 100644 --- a/docs/supported-sql.md +++ b/docs/supported-sql.md @@ -113,15 +113,18 @@ These are full-citizen indexes — they're visible via `.tables`-adjacent catalo ### HNSW indexes (Phase 7d) ```sql -CREATE INDEX ON USING hnsw (); +CREATE INDEX ON
USING hnsw () + [WITH (metric = '')]; ``` -Builds an [HNSW](https://arxiv.org/abs/1603.09320) approximate-nearest-neighbor index over a `VECTOR(N)` column. The query optimizer recognizes `ORDER BY vec_distance_l2(col, literal) LIMIT k` (or the cosine / dot variants) on an HNSW-indexed column and probes the graph instead of full-scanning. SQLR-23 — the second arg can be either an inline `[...]` literal *or* a bound `Value::Vector(...)` parameter via `Statement::query_with_params`; the optimizer recognizes both, so prepared-statement KNN queries still take the graph shortcut. +Builds an [HNSW](https://arxiv.org/abs/1603.09320) approximate-nearest-neighbor index over a `VECTOR(N)` column. The query optimizer recognizes `ORDER BY vec_distance_l2(col, literal) LIMIT k` (or the cosine / dot variants) on an HNSW-indexed column **whose metric matches the query's distance function**, and probes the graph instead of full-scanning. SQLR-23 — the second arg can be either an inline `[...]` literal *or* a bound `Value::Vector(...)` parameter via `Statement::query_with_params`; the optimizer recognizes both, so prepared-statement KNN queries still take the graph shortcut. -- Recall@10 ≥ 0.95 at default parameters (`M=16`, `ef_construction=200`, `ef_search=50`). Parameters aren't tunable from SQL yet — see Q2 of [`docs/phase-7-plan.md`](phase-7-plan.md). -- The index is built incrementally on `INSERT`. `DELETE` / `UPDATE` mark the index `needs_rebuild`; the next save rebuilds from current rows. -- Persisted as a `KIND_HNSW` cell tree alongside the regular page hierarchy — open path loads the graph bit-for-bit, no algorithm runs. -- Without an HNSW index, the same `ORDER BY vec_distance_… LIMIT k` query still works — it just brute-force-scans every row (Phase 7c's bounded-heap top-k optimization keeps the memory footprint to O(k)). +The `WITH (metric = '…')` clause picks the distance the graph is built for. Three values are recognized: `'l2'` (Euclidean — the default, also accepts `'euclidean'`), `'cosine'`, and `'dot'` (negated dot-product — also accepts `'inner_product'` / `'ip'`). Omitting the clause is equivalent to `metric = 'l2'`, so pre-SQLR-28 catalogs round-trip unchanged. **The metric is not a query-time choice** — the graph topology depends on the metric used during INSERT (neighbour pruning is metric-specific), so a query whose `vec_distance_*` function doesn't match the index's metric falls through to brute-force rather than getting a wrong answer back from the graph. If you need both L2 and cosine probes on the same column, create two indexes. + +- Recall@10 ≥ 0.95 at default parameters (`M=16`, `ef_construction=200`, `ef_search=50`). The `M` / `ef_*` knobs aren't tunable from SQL yet — see Q2 of [`docs/phase-7-plan.md`](phase-7-plan.md). +- The index is built incrementally on `INSERT`. `DELETE` / `UPDATE` mark the index `needs_rebuild`; the next save rebuilds from current rows under the same metric. +- Persisted as a `KIND_HNSW` cell tree alongside the regular page hierarchy — open path loads the graph bit-for-bit, no algorithm runs. The metric travels through the synthesized CREATE INDEX SQL in `sqlrite_master`; no file-format bump. +- Without an HNSW index — or with a metric mismatch — the same `ORDER BY vec_distance_… LIMIT k` query still works; it just brute-force-scans every row (Phase 7c's bounded-heap top-k optimization keeps the memory footprint to O(k)). ### FTS indexes (Phase 8) diff --git a/src/connection.rs b/src/connection.rs index 3eec9e2..8d72c5a 100644 --- a/src/connection.rs +++ b/src/connection.rs @@ -54,8 +54,8 @@ use std::collections::VecDeque; use std::path::Path; use std::sync::Arc; +use crate::sql::dialect::SqlriteDialect; use sqlparser::ast::Statement as AstStatement; -use sqlparser::dialect::SQLiteDialect; use sqlparser::parser::Parser; use crate::error::{Result, SQLRiteError}; @@ -320,7 +320,7 @@ struct CachedPlan { impl CachedPlan { fn compile(sql: &str) -> Result { - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, sql).map_err(SQLRiteError::from)?; let Some(mut stmt) = ast.pop() else { return Err(SQLRiteError::General("no statement to prepare".to_string())); @@ -1115,6 +1115,158 @@ mod tests { assert_eq!(rows[0].get::(0).unwrap(), 1); } + /// SQLR-28 — cosine probe: an HNSW index built `WITH (metric = + /// 'cosine')` must serve `ORDER BY vec_distance_cosine(col, [...])` + /// from the graph. Self-query: querying for one of the corpus's + /// own vectors must come back as the nearest under cosine + /// distance. + #[test] + fn cosine_self_query_through_hnsw_optimizer() { + let mut conn = Connection::open_in_memory().unwrap(); + conn.execute("CREATE TABLE v (id INTEGER PRIMARY KEY, e VECTOR(4));") + .unwrap(); + let corpus: [(i64, [f32; 4]); 5] = [ + (1, [1.0, 0.0, 0.0, 0.0]), + (2, [0.0, 1.0, 0.0, 0.0]), + (3, [0.0, 0.0, 1.0, 0.0]), + (4, [0.0, 0.0, 0.0, 1.0]), + (5, [0.5, 0.5, 0.5, 0.5]), + ]; + for (id, vec) in corpus { + conn.execute(&format!( + "INSERT INTO v (id, e) VALUES ({id}, [{}, {}, {}, {}]);", + vec[0], vec[1], vec[2], vec[3] + )) + .unwrap(); + } + conn.execute("CREATE INDEX v_hnsw ON v USING hnsw (e) WITH (metric = 'cosine');") + .unwrap(); + + // Self-query for id=2's vector — expected nearest under cosine + // distance is id=2 itself (cos distance 0). + let rows = conn + .prepare("SELECT id FROM v ORDER BY vec_distance_cosine(e, [0.0, 1.0, 0.0, 0.0]) ASC LIMIT 1") + .unwrap() + .query_with_params(&[]) + .unwrap() + .collect_all() + .unwrap(); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].get::(0).unwrap(), 2); + } + + /// SQLR-28 — dot probe: same shape as the cosine test, but the + /// index is built `WITH (metric = 'dot')` and the query uses + /// `vec_distance_dot`. Confirms the third metric variant lights up + /// the graph shortcut, not just l2 / cosine. + #[test] + fn dot_self_query_through_hnsw_optimizer() { + let mut conn = Connection::open_in_memory().unwrap(); + conn.execute("CREATE TABLE v (id INTEGER PRIMARY KEY, e VECTOR(3));") + .unwrap(); + // Data: distinguishable magnitudes so the dot metric resolves + // a clear winner. `vec_distance_dot(a, b) = -(a·b)` — smaller + // (more negative) is closer. + let corpus: [(i64, [f32; 3]); 4] = [ + (1, [1.0, 0.0, 0.0]), + (2, [2.0, 0.0, 0.0]), + (3, [0.0, 1.0, 0.0]), + (4, [0.0, 0.0, 1.0]), + ]; + for (id, vec) in corpus { + conn.execute(&format!( + "INSERT INTO v (id, e) VALUES ({id}, [{}, {}, {}]);", + vec[0], vec[1], vec[2] + )) + .unwrap(); + } + conn.execute("CREATE INDEX v_hnsw ON v USING hnsw (e) WITH (metric = 'dot');") + .unwrap(); + + // Query [3, 0, 0]: dot products are 3, 6, 0, 0 → distances + // -3, -6, 0, 0. id=2 has the smallest (most negative) distance. + let rows = conn + .prepare("SELECT id FROM v ORDER BY vec_distance_dot(e, [3.0, 0.0, 0.0]) ASC LIMIT 1") + .unwrap() + .query_with_params(&[]) + .unwrap() + .collect_all() + .unwrap(); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].get::(0).unwrap(), 2); + } + + /// SQLR-28 — metric mismatch must NOT take the graph shortcut. + /// An L2-built index queried with `vec_distance_cosine` falls + /// through to brute-force, which still returns the correct + /// answer. We confirm the answer is correct; the slow-path + /// behaviour itself is implicit (no error, no panic, no wrong + /// result), which is the user-visible contract that matters. + #[test] + fn metric_mismatch_falls_back_to_brute_force() { + let mut conn = Connection::open_in_memory().unwrap(); + conn.execute("CREATE TABLE v (id INTEGER PRIMARY KEY, e VECTOR(2));") + .unwrap(); + let half_sqrt2 = std::f32::consts::FRAC_1_SQRT_2; + let corpus: [(i64, [f32; 2]); 3] = [ + (1, [1.0, 0.0]), + (2, [half_sqrt2, half_sqrt2]), + (3, [0.0, 1.0]), + ]; + for (id, vec) in corpus { + conn.execute(&format!( + "INSERT INTO v (id, e) VALUES ({id}, [{}, {}]);", + vec[0], vec[1] + )) + .unwrap(); + } + // Default L2 index — no WITH clause. + conn.execute("CREATE INDEX v_hnsw_l2 ON v USING hnsw (e);") + .unwrap(); + + // Query with cosine. Index can't help; brute-force still + // returns the correct nearest by cosine: id=1 (cos dist 0). + let rows = conn + .prepare("SELECT id FROM v ORDER BY vec_distance_cosine(e, [1.0, 0.0]) ASC LIMIT 1") + .unwrap() + .query_with_params(&[]) + .unwrap() + .collect_all() + .unwrap(); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].get::(0).unwrap(), 1); + } + + /// SQLR-28 — a typo in the metric name must error at CREATE INDEX + /// time. Falling back to L2 silently is the bug we're fixing here, + /// not the behaviour to preserve. + #[test] + fn unknown_metric_name_is_rejected() { + let mut conn = Connection::open_in_memory().unwrap(); + conn.execute("CREATE TABLE v (id INTEGER PRIMARY KEY, e VECTOR(2));") + .unwrap(); + let err = conn + .execute("CREATE INDEX bad ON v USING hnsw (e) WITH (metric = 'cosin');") + .unwrap_err(); + let msg = format!("{err}"); + assert!(msg.contains("unknown HNSW metric"), "got: {msg}"); + } + + /// SQLR-28 — WITH options on a non-HNSW index must error rather + /// than be silently ignored. An option that has no effect on the + /// resulting index is a footgun. + #[test] + fn with_metric_on_btree_is_rejected() { + let mut conn = Connection::open_in_memory().unwrap(); + conn.execute("CREATE TABLE t (a INTEGER PRIMARY KEY, b TEXT);") + .unwrap(); + let err = conn + .execute("CREATE INDEX bad ON t (b) WITH (metric = 'cosine');") + .unwrap_err(); + let msg = format!("{err}"); + assert!(msg.contains("doesn't support any options"), "got: {msg}"); + } + #[test] fn prepare_cached_executes_the_same_as_prepare() { let mut conn = Connection::open_in_memory().unwrap(); diff --git a/src/sql/db/database.rs b/src/sql/db/database.rs index 642d8de..f5db61d 100644 --- a/src/sql/db/database.rs +++ b/src/sql/db/database.rs @@ -198,8 +198,8 @@ impl Database { #[cfg(test)] mod tests { use super::*; + use crate::sql::dialect::SqlriteDialect; use crate::sql::parser::create::CreateQuery; - use sqlparser::dialect::SQLiteDialect; use sqlparser::parser::Parser; #[test] @@ -220,7 +220,7 @@ mod tests { last_name TEXT NOT NULl, email TEXT NOT NULL UNIQUE );"; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, query_statement).unwrap(); if ast.len() > 1 { panic!("Expected a single query statement, but there are more then 1.") @@ -246,7 +246,7 @@ mod tests { last_name TEXT NOT NULl, email TEXT NOT NULL UNIQUE );"; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, query_statement).unwrap(); if ast.len() > 1 { panic!("Expected a single query statement, but there are more then 1.") diff --git a/src/sql/db/table.rs b/src/sql/db/table.rs index 0b90b3b..392cc80 100644 --- a/src/sql/db/table.rs +++ b/src/sql/db/table.rs @@ -1,7 +1,7 @@ use crate::error::{Result, SQLRiteError}; use crate::sql::db::secondary_index::{IndexOrigin, SecondaryIndex}; use crate::sql::fts::PostingList; -use crate::sql::hnsw::HnswIndex; +use crate::sql::hnsw::{DistanceMetric, HnswIndex}; use crate::sql::parser::create::{CreateQuery, ParsedColumn}; use std::collections::{BTreeMap, HashMap}; use std::fmt; @@ -150,10 +150,11 @@ pub struct Table { pub primary_key: String, } -/// One HNSW index attached to a table. Phase 7d.2 only supports L2 -/// distance; cosine and dot are 7d.x follow-ups (would require either -/// distinct USING methods like `hnsw_cosine` or a `WITH (metric = …)` -/// clause — see `docs/phase-7-plan.md` for the deferred decision). +/// One HNSW index attached to a table. The distance metric is fixed +/// at CREATE INDEX time via `USING hnsw (col) WITH (metric = '')` +/// (`l2` / `cosine` / `dot`); omitting the WITH clause defaults to L2, +/// matching the pre-SQLR-28 behaviour for round-tripping older +/// `sqlrite_master` rows that didn't carry a metric. #[derive(Debug, Clone)] pub struct HnswIndexEntry { /// User-supplied name from `CREATE INDEX …`. Unique across @@ -161,6 +162,13 @@ pub struct HnswIndexEntry { pub name: String, /// The VECTOR column this index covers. pub column_name: String, + /// Distance metric the graph was built for. The optimizer's HNSW + /// shortcut only fires when the query's `vec_distance_*` function + /// matches this metric — picking a non-matching distance falls + /// through to brute-force, since the graph topology is metric- + /// specific (an L2-pruned graph isn't a valid cosine search graph + /// in general, and vice versa). + pub metric: DistanceMetric, /// The graph itself. pub index: HnswIndex, /// Phase 7d.3 — true iff a DELETE or UPDATE-on-vector-col has @@ -1628,7 +1636,7 @@ pub fn parse_vector_literal(s: &str) -> Result> { #[cfg(test)] mod tests { use super::*; - use sqlparser::dialect::SQLiteDialect; + use crate::sql::dialect::SqlriteDialect; use sqlparser::parser::Parser; #[test] @@ -1766,7 +1774,7 @@ mod tests { active BOOL, score REAL );"; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, query_statement).unwrap(); if ast.len() > 1 { panic!("Expected a single query statement, but there are more then 1.") @@ -1802,7 +1810,7 @@ mod tests { first_name TEXT NOT NULL, last_name TEXT NOT NULl );"; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, query_statement).unwrap(); if ast.len() > 1 { panic!("Expected a single query statement, but there are more then 1.") diff --git a/src/sql/dialect.rs b/src/sql/dialect.rs new file mode 100644 index 0000000..d25269d --- /dev/null +++ b/src/sql/dialect.rs @@ -0,0 +1,100 @@ +//! SQLRite SQL dialect. +//! +//! Wraps sqlparser's `SQLiteDialect` so we get every SQLite-specific +//! tokenizer/parser quirk (delimited identifiers, NOTNULL operator, +//! `LIMIT a, b`, `MATCH`/`REGEXP` infix, …) and overrides only what we +//! need for SQLRite's vector extensions: +//! +//! - `supports_create_index_with_clause = true` — lets the parser +//! accept `CREATE INDEX … USING hnsw (col) WITH (metric = 'cosine')`. +//! sqlparser's `SQLiteDialect` returns `false` from this method, so +//! the WITH clause would otherwise be parked in `index_options` (or +//! error). The PostgreSQL dialect already turns it on; we copy that +//! behaviour here without taking the rest of the pgsql parser +//! divergences. +//! +//! Add new dialect overrides here as the surface grows; everything not +//! explicitly listed defers to the base SQLite dialect. +use sqlparser::ast::{Expr, Statement}; +use sqlparser::dialect::{Dialect, SQLiteDialect}; +use sqlparser::parser::{Parser, ParserError}; + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub struct SqlriteDialect { + inner: SQLiteDialect, +} + +impl SqlriteDialect { + pub const fn new() -> Self { + Self { + inner: SQLiteDialect {}, + } + } +} + +impl Dialect for SqlriteDialect { + fn is_delimited_identifier_start(&self, ch: char) -> bool { + self.inner.is_delimited_identifier_start(ch) + } + + fn identifier_quote_style(&self, identifier: &str) -> Option { + self.inner.identifier_quote_style(identifier) + } + + fn is_identifier_start(&self, ch: char) -> bool { + self.inner.is_identifier_start(ch) + } + + fn is_identifier_part(&self, ch: char) -> bool { + self.inner.is_identifier_part(ch) + } + + fn supports_filter_during_aggregation(&self) -> bool { + self.inner.supports_filter_during_aggregation() + } + + fn supports_start_transaction_modifier(&self) -> bool { + self.inner.supports_start_transaction_modifier() + } + + fn supports_in_empty_list(&self) -> bool { + self.inner.supports_in_empty_list() + } + + fn supports_limit_comma(&self) -> bool { + self.inner.supports_limit_comma() + } + + fn supports_asc_desc_in_column_definition(&self) -> bool { + self.inner.supports_asc_desc_in_column_definition() + } + + fn supports_dollar_placeholder(&self) -> bool { + self.inner.supports_dollar_placeholder() + } + + fn supports_notnull_operator(&self) -> bool { + self.inner.supports_notnull_operator() + } + + fn parse_statement(&self, parser: &mut Parser) -> Option> { + self.inner.parse_statement(parser) + } + + fn parse_infix( + &self, + parser: &mut Parser, + expr: &Expr, + precedence: u8, + ) -> Option> { + self.inner.parse_infix(parser, expr, precedence) + } + + /// SQLRite-specific extension: `CREATE INDEX … USING hnsw (col) + /// WITH (metric = 'cosine')` is the canonical way to pick a + /// non-L2 distance metric for an HNSW index. See + /// `docs/supported-sql.md` and `try_hnsw_probe`. + fn supports_create_index_with_clause(&self) -> bool { + true + } +} diff --git a/src/sql/executor.rs b/src/sql/executor.rs index fb28068..8f57f3c 100644 --- a/src/sql/executor.rs +++ b/src/sql/executor.rs @@ -903,6 +903,7 @@ pub fn execute_create_index(stmt: &Statement, db: &mut Database) -> Result Result IndexMethod::Btree, }; + // Parse `WITH (key = value, …)` options (SQLR-28). The only key + // recognized today is `metric` for HNSW indexes — `'l2'` / + // `'cosine'` / `'dot'`. The clause is rejected on non-HNSW indexes + // so a typo doesn't silently sit on a btree index where it can't + // do anything useful. + let hnsw_metric = parse_hnsw_with_options(with, &index_name, method)?; + let table_name_str = table_name.to_string(); let column_name = match &columns[0].column.expr { Expr::Identifier(ident) => ident.value.clone(), @@ -1030,6 +1038,7 @@ pub fn execute_create_index(stmt: &Statement, db: &mut Database) -> Result create_fts_index( @@ -1397,6 +1406,7 @@ fn create_hnsw_index( column_name: &str, datatype: &DataType, unique: bool, + metric: DistanceMetric, existing: &[(i64, Value)], ) -> Result { // HNSW only makes sense on VECTOR columns. Reject anything else @@ -1416,15 +1426,18 @@ fn create_hnsw_index( )); } - // Build the in-memory graph. Distance metric is L2 by default - // (Phase 7d.2 doesn't yet expose a knob for picking cosine/dot — - // see `docs/phase-7-plan.md` for the deferral). + // Build the in-memory graph. The distance metric was picked at + // CREATE INDEX time (defaults to L2 if no `WITH (metric = …)` + // clause was supplied). The graph topology is metric-specific — + // L2 neighbour pruning ≠ cosine neighbour pruning — so the + // optimizer's HNSW shortcut only fires when the query's + // `vec_distance_*` function matches this value (SQLR-28). // // Seed: hash the index name so different indexes get different // graph topologies, but the same index always gets the same one // — useful when debugging recall / index size. let seed = hash_str_to_seed(index_name); - let mut idx = HnswIndex::new(DistanceMetric::L2, seed); + let mut idx = HnswIndex::new(metric, seed); // Snapshot the (rowid, vector) pairs into a side map so the // get_vec closure below can serve them by id without re-borrowing @@ -1463,6 +1476,7 @@ fn create_hnsw_index( table_mut.hnsw_indexes.push(HnswIndexEntry { name: index_name.to_string(), column_name: column_name.to_string(), + metric, index: idx, // Freshly built — no DELETE/UPDATE has invalidated it yet. needs_rebuild: false, @@ -1470,6 +1484,97 @@ fn create_hnsw_index( Ok(index_name.to_string()) } +/// Parses the `WITH (metric = '', …)` options bag on a CREATE +/// INDEX statement. Returns the chosen metric (or `None` if no +/// `metric` key was supplied) on HNSW indexes; raises a +/// user-visible error on: +/// +/// - WITH options on a non-HNSW index (btree / fts have no knobs we +/// understand here), +/// - unknown option keys, +/// - unknown metric names (typo guard — silently falling back to L2 +/// would hide the user's intent and re-introduce the SQLR-28 bug). +fn parse_hnsw_with_options( + with: &[Expr], + index_name: &str, + method: IndexMethod, +) -> Result> { + if with.is_empty() { + return Ok(None); + } + if !matches!(method, IndexMethod::Hnsw) { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}' has a WITH (...) clause but its index method \ + doesn't support any options — only `USING hnsw` recognises `WITH (metric = ...)`" + ))); + } + + let mut metric: Option = None; + for opt in with { + let Expr::BinaryOp { left, op, right } = opt else { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': unsupported WITH option {opt:?} \ + (expected `key = 'value'`)" + ))); + }; + if !matches!(op, BinaryOperator::Eq) { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': WITH options must use `=` (got {op:?})" + ))); + } + let key = match left.as_ref() { + Expr::Identifier(ident) => ident.value.clone(), + other => { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': WITH option key must be a bare identifier, \ + got {other:?}" + ))); + } + }; + let value = match right.as_ref() { + Expr::Value(v) => match &v.value { + AstValue::SingleQuotedString(s) => s.clone(), + AstValue::DoubleQuotedString(s) => s.clone(), + other => { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': WITH option '{key}' value must be \ + a quoted string, got {other:?}" + ))); + } + }, + Expr::Identifier(ident) => ident.value.clone(), + other => { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': WITH option '{key}' value must be a \ + quoted string, got {other:?}" + ))); + } + }; + + if key.eq_ignore_ascii_case("metric") { + let parsed = DistanceMetric::from_sql_name(&value).ok_or_else(|| { + SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': unknown HNSW metric '{value}' \ + (try 'l2', 'cosine', or 'dot')" + )) + })?; + if metric.is_some() { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': metric specified more than once in WITH (...)" + ))); + } + metric = Some(parsed); + } else { + return Err(SQLRiteError::General(format!( + "CREATE INDEX '{index_name}': unknown WITH option '{key}' \ + (only 'metric' is recognised on HNSW indexes)" + ))); + } + } + + Ok(metric) +} + /// Builds a Phase 8b FTS inverted index and attaches it to the table. /// Mirrors [`create_hnsw_index`] in shape: validate column type, /// tokenize each existing row's text into the in-memory posting list, @@ -1654,22 +1759,25 @@ fn try_extract_equality(expr: &Expr) -> Option<(String, sqlparser::ast::Value)> /// Recognizes the HNSW-probable query pattern and probes the graph /// if a matching index exists. /// -/// Looks for ORDER BY `vec_distance_l2(, )` -/// where the table has an HNSW index attached to ``. On a match, -/// returns the top-k rowids straight from the graph (O(log N)). On -/// any miss — different function name, no matching index, query -/// dimension wrong, etc. — returns `None` and the caller falls through +/// Looks for ORDER BY `vec_distance_(, )` where the table has an HNSW index attached to +/// `` *built for that same distance metric*. On a match, returns +/// the top-k rowids straight from the graph (O(log N)). On any miss — +/// different function name, no matching index, query dimension wrong, +/// metric mismatch, etc. — returns `None` and the caller falls through /// to the bounded-heap brute-force path (7c) or the full sort (7b), /// preserving correct results regardless of whether the HNSW pathway /// kicked in. /// -/// Phase 7d.2 caveats: -/// - Only `vec_distance_l2` is recognized. Cosine and dot fall through -/// to brute-force because we don't yet expose a per-index distance -/// knob (deferred to Phase 7d.x — see `docs/phase-7-plan.md`). +/// Caveats: +/// - The index's metric and the query's `vec_distance_*` function must +/// agree. An L2-built graph silently doesn't help cosine queries +/// (different neighbour pruning policy → potentially different +/// topology), so we don't pretend to. Pick the metric at CREATE +/// INDEX time via `WITH (metric = '')` (SQLR-28). /// - Only ASCENDING order makes sense for "k nearest" — DESC ORDER BY -/// `vec_distance_l2(...) LIMIT k` would mean "k farthest", which -/// isn't what the index is built for. We don't bother to detect +/// `vec_distance_*(...) LIMIT k` would mean "k farthest", which isn't +/// what the index is built for. We don't bother to detect /// `ascending == false` here; the optimizer just skips and the /// fallback path handles it correctly (slower). fn try_hnsw_probe(table: &Table, order_expr: &Expr, k: usize) -> Option> { @@ -1677,7 +1785,8 @@ fn try_hnsw_probe(table: &Table, order_expr: &Expr, k: usize) -> Option return None; } - // Pattern-match: order expr must be a function call vec_distance_l2(a, b). + // Pattern-match: order expr must be a function call + // vec_distance_(a, b). let func = match order_expr { Expr::Function(f) => f, _ => return None, @@ -1686,9 +1795,12 @@ fn try_hnsw_probe(table: &Table, order_expr: &Expr, k: usize) -> Option [ObjectNamePart::Identifier(ident)] => ident.value.to_lowercase(), _ => return None, }; - if fname != "vec_distance_l2" { - return None; - } + let query_metric = match fname.as_str() { + "vec_distance_l2" => DistanceMetric::L2, + "vec_distance_cosine" => DistanceMetric::Cosine, + "vec_distance_dot" => DistanceMetric::Dot, + _ => return None, + }; // Extract the two args as raw Exprs. let arg_list = match &func.args { @@ -1721,11 +1833,14 @@ fn try_hnsw_probe(table: &Table, order_expr: &Expr, k: usize) -> Option }, }; - // Find the HNSW index on this column. + // Find the HNSW index on this column AND with a matching metric. + // Multiple indexes on the same column are allowed in principle + // (cosine-built + L2-built), and a query picks whichever metric + // its `vec_distance_*` function names. let entry = table .hnsw_indexes .iter() - .find(|e| e.column_name == col_name)?; + .find(|e| e.column_name == col_name && e.metric == query_metric)?; // Dimension sanity check — the query vector must match the // indexed column's declared dimension. If it doesn't, the brute- @@ -3296,8 +3411,8 @@ mod tests { // ----------------------------------------------------------------- use crate::sql::db::database::Database; + use crate::sql::dialect::SqlriteDialect; use crate::sql::parser::select::SelectQuery; - use sqlparser::dialect::SQLiteDialect; use sqlparser::parser::Parser; /// Builds a `docs(id INTEGER PK, score REAL)` table with N rows of @@ -3334,7 +3449,7 @@ mod tests { /// `select_topk` / `sort_rowids` directly without the rest of the /// process_command pipeline. fn parse_select(sql: &str) -> SelectQuery { - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, sql).expect("parse"); let stmt = ast.pop().expect("one statement"); SelectQuery::new(&stmt).expect("select-query") diff --git a/src/sql/hnsw.rs b/src/sql/hnsw.rs index cf01970..5665d35 100644 --- a/src/sql/hnsw.rs +++ b/src/sql/hnsw.rs @@ -67,6 +67,40 @@ pub enum DistanceMetric { } impl DistanceMetric { + /// Parses the metric name from the `CREATE INDEX … WITH + /// (metric = '')` clause. Case-insensitive. Returns `None` + /// for unknown values; the parser surfaces that as a user-visible + /// error so a typo doesn't silently fall back to L2. + pub fn from_sql_name(name: &str) -> Option { + match name.to_ascii_lowercase().as_str() { + "l2" | "euclidean" => Some(DistanceMetric::L2), + "cosine" => Some(DistanceMetric::Cosine), + "dot" | "inner_product" | "ip" => Some(DistanceMetric::Dot), + _ => None, + } + } + + /// The canonical SQL-surface name for this metric. Used when + /// synthesizing CREATE INDEX SQL back into `sqlrite_master`. + pub fn sql_name(self) -> &'static str { + match self { + DistanceMetric::L2 => "l2", + DistanceMetric::Cosine => "cosine", + DistanceMetric::Dot => "dot", + } + } + + /// The `vec_distance_*` SQL function whose result this metric + /// orders by. The optimizer's HNSW shortcut only fires when the + /// query's ORDER BY expression names this exact function. + pub fn matching_distance_fn(self) -> &'static str { + match self { + DistanceMetric::L2 => "vec_distance_l2", + DistanceMetric::Cosine => "vec_distance_cosine", + DistanceMetric::Dot => "vec_distance_dot", + } + } + /// Computes the configured distance between two equal-dimension /// vectors. Returns `f32::INFINITY` for the cosine/zero-magnitude /// edge case; HNSW treats infinity as "worst possible candidate" and diff --git a/src/sql/mod.rs b/src/sql/mod.rs index 29688d1..abd0ae3 100644 --- a/src/sql/mod.rs +++ b/src/sql/mod.rs @@ -1,5 +1,6 @@ pub mod agg; pub mod db; +pub mod dialect; pub mod executor; pub mod fts; pub mod hnsw; @@ -13,9 +14,10 @@ use parser::insert::InsertQuery; use parser::select::SelectQuery; use sqlparser::ast::{AlterTableOperation, ObjectType, Statement}; -use sqlparser::dialect::SQLiteDialect; use sqlparser::parser::{Parser, ParserError}; +use crate::sql::dialect::SqlriteDialect; + use crate::error::{Result, SQLRiteError}; use crate::sql::db::database::Database; use crate::sql::db::table::Table; @@ -87,7 +89,7 @@ pub fn process_command(query: &str, db: &mut Database) -> Result { /// to stdout.** The REPL is responsible for printing whatever it wants /// from the returned struct. pub fn process_command_with_render(query: &str, db: &mut Database) -> Result { - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, query).map_err(SQLRiteError::from)?; if ast.len() > 1 { @@ -529,7 +531,7 @@ mod tests { id INTEGER PRIMARY KEY, name TEXT );"; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, query_statement).unwrap(); if ast.len() > 1 { panic!("Expected a single query statement, but there are more then 1.") @@ -563,7 +565,7 @@ mod tests { let query_statement = "CREATE TABLE users ( name TEXT );"; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, query_statement).unwrap(); if ast.len() > 1 { panic!("Expected a single query statement, but there are more then 1.") diff --git a/src/sql/pager/mod.rs b/src/sql/pager/mod.rs index 7c6623b..38e4beb 100644 --- a/src/sql/pager/mod.rs +++ b/src/sql/pager/mod.rs @@ -59,13 +59,14 @@ use std::collections::{BTreeMap, HashMap}; use std::path::Path; use std::sync::{Arc, Mutex}; -use sqlparser::dialect::SQLiteDialect; +use crate::sql::dialect::SqlriteDialect; use sqlparser::parser::Parser; use crate::error::{Result, SQLRiteError}; use crate::sql::db::database::Database; use crate::sql::db::secondary_index::{IndexOrigin, SecondaryIndex}; use crate::sql::db::table::{Column, DataType, Row, Table, Value}; +use crate::sql::hnsw::DistanceMetric; use crate::sql::pager::cell::Cell; use crate::sql::pager::header::DbHeader; use crate::sql::pager::index_cell::IndexCell; @@ -403,9 +404,11 @@ fn save_database_with_mode(db: &mut Database, path: &Path, compact: bool) -> Res master_rows.push(CatalogEntry { kind: "index".into(), name: entry.name.clone(), - sql: format!( - "CREATE INDEX {} ON {} USING hnsw ({})", - entry.name, table.tb_name, entry.column_name + sql: synthesize_hnsw_create_index_sql( + &entry.name, + &table.tb_name, + &entry.column_name, + entry.metric, ), rootpage, last_rowid: 0, @@ -648,7 +651,7 @@ fn render_default_literal(value: &Value) -> String { /// Reverses `table_to_create_sql`: feeds the SQL back through `sqlparser` /// and produces our internal column list. Returns `(table_name, columns)`. fn parse_create_sql(sql: &str) -> Result<(String, Vec)> { - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, sql).map_err(SQLRiteError::from)?; let stmt = ast.pop().ok_or_else(|| { SQLRiteError::Internal("sqlrite_master row held an empty SQL string".to_string()) @@ -869,7 +872,7 @@ fn load_index_rows(pager: &Pager, idx: &mut SecondaryIndex, root_page: u32) -> R fn parse_create_index_sql(sql: &str) -> Result<(String, String, bool)> { use sqlparser::ast::{CreateIndex, Expr, Statement}; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, sql).map_err(SQLRiteError::from)?; let Some(Statement::CreateIndex(CreateIndex { table_name, @@ -909,7 +912,7 @@ fn parse_create_index_sql(sql: &str) -> Result<(String, String, bool)> { fn create_index_sql_uses_hnsw(sql: &str) -> bool { use sqlparser::ast::{CreateIndex, IndexType, Statement}; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let Ok(mut ast) = Parser::parse_sql(&dialect, sql) else { return false; }; @@ -924,7 +927,7 @@ fn create_index_sql_uses_hnsw(sql: &str) -> bool { fn create_index_sql_uses_fts(sql: &str) -> bool { use sqlparser::ast::{CreateIndex, IndexType, Statement}; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let Ok(mut ast) = Parser::parse_sql(&dialect, sql) else { return false; }; @@ -952,7 +955,7 @@ fn rebuild_fts_index(db: &mut Database, pager: &Pager, row: &IndexCatalogRow) -> use crate::sql::fts::PostingList; use sqlparser::ast::Statement; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, &row.sql).map_err(SQLRiteError::from)?; let Some(stmt @ Statement::CreateIndex(_)) = ast.pop() else { return Err(SQLRiteError::Internal(format!( @@ -990,7 +993,7 @@ fn rebuild_fts_index(db: &mut Database, pager: &Pager, row: &IndexCatalogRow) -> fn parse_fts_create_index_sql(sql: &str) -> Result<(String, String)> { use sqlparser::ast::{CreateIndex, Expr, Statement}; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, sql).map_err(SQLRiteError::from)?; let Some(Statement::CreateIndex(CreateIndex { table_name, @@ -1036,10 +1039,10 @@ fn parse_fts_create_index_sql(sql: &str) -> Result<(String, String)> { fn rebuild_hnsw_index(db: &mut Database, pager: &Pager, row: &IndexCatalogRow) -> Result<()> { use crate::sql::db::table::HnswIndexEntry; use crate::sql::executor::execute_create_index; - use crate::sql::hnsw::{DistanceMetric, HnswIndex}; + use crate::sql::hnsw::HnswIndex; use sqlparser::ast::Statement; - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, &row.sql).map_err(SQLRiteError::from)?; let Some(stmt @ Statement::CreateIndex(_)) = ast.pop() else { return Err(SQLRiteError::Internal(format!( @@ -1054,13 +1057,16 @@ fn rebuild_hnsw_index(db: &mut Database, pager: &Pager, row: &IndexCatalogRow) - return Ok(()); } - // Persistence path — read the cell tree, deserialize. + // Persistence path — read the cell tree, deserialize. The metric + // travels through the synthesized CREATE INDEX SQL stored in + // `sqlrite_master`; pre-SQLR-28 rows omit the WITH clause and + // decode as L2, which matches what those graphs were built with. + let (tbl_name, col_name, metric) = parse_hnsw_create_index_sql(&row.sql)?; let nodes = load_hnsw_nodes(pager, row.rootpage)?; - let index = HnswIndex::from_persisted_nodes(DistanceMetric::L2, 0xC0FFEE, nodes); + let index = HnswIndex::from_persisted_nodes(metric, 0xC0FFEE, nodes); // Parse the CREATE INDEX to know which table + column to attach to // — same shape as the row-walk path; we just don't execute it. - let (tbl_name, col_name) = parse_hnsw_create_index_sql(&row.sql)?; let table_mut = db.get_table_mut(tbl_name.clone()).map_err(|_| { SQLRiteError::Internal(format!( "HNSW index '{}' references unknown table '{tbl_name}'", @@ -1070,6 +1076,7 @@ fn rebuild_hnsw_index(db: &mut Database, pager: &Pager, row: &IndexCatalogRow) - table_mut.hnsw_indexes.push(HnswIndexEntry { name: row.name.clone(), column_name: col_name, + metric, index, needs_rebuild: false, }); @@ -1112,19 +1119,22 @@ fn load_hnsw_nodes(pager: &Pager, root_page: u32) -> Result Result<(String, String)> { - use sqlparser::ast::{CreateIndex, Expr, Statement}; - - let dialect = SQLiteDialect {}; +/// Pulls `(table_name, column_name, metric)` out of a CREATE INDEX +/// SQL string of the form `CREATE INDEX … USING hnsw (col) [WITH +/// (metric = '')]`. Used by the persistence path on open to know +/// where to attach the loaded graph and which distance metric to +/// rebuild it under. Pre-SQLR-28 rows omit the WITH clause and +/// default to L2. +fn parse_hnsw_create_index_sql(sql: &str) -> Result<(String, String, DistanceMetric)> { + use crate::sql::hnsw::DistanceMetric; + use sqlparser::ast::{BinaryOperator, CreateIndex, Expr, Statement, Value as AstValue}; + + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, sql).map_err(SQLRiteError::from)?; let Some(Statement::CreateIndex(CreateIndex { table_name, columns, + with, .. })) = ast.pop() else { @@ -1148,7 +1158,34 @@ fn parse_hnsw_create_index_sql(sql: &str) -> Result<(String, String)> { ))); } }; - Ok((table_name.to_string(), col)) + + // Pull the metric off the parsed WITH (...) bag. The user-facing + // CREATE INDEX path validates this in detail (typo'd metric names, + // unknown keys, etc.); here on the persistence read-path we trust + // what we previously wrote and surface a clean Internal error if + // it ever doesn't match. + let mut metric = DistanceMetric::L2; + for opt in &with { + if let Expr::BinaryOp { left, op, right } = opt { + if matches!(op, BinaryOperator::Eq) { + if let (Expr::Identifier(key), Expr::Value(v)) = (left.as_ref(), right.as_ref()) + && key.value.eq_ignore_ascii_case("metric") + { + if let AstValue::SingleQuotedString(s) | AstValue::DoubleQuotedString(s) = + &v.value + { + metric = DistanceMetric::from_sql_name(s).ok_or_else(|| { + SQLRiteError::Internal(format!( + "sqlrite_master HNSW row carries unknown metric '{s}'" + )) + })?; + } + } + } + } + } + + Ok((table_name.to_string(), col, metric)) } /// Phase 7d.3 — rebuilds in-place any HnswIndexEntry whose @@ -1163,23 +1200,25 @@ fn parse_hnsw_create_index_sql(sql: &str) -> Result<(String, String)> { /// MVP, more sophisticated incremental delete strategies (soft-delete /// + tombstones, neighbor reconnection) are future polish. fn rebuild_dirty_hnsw_indexes(db: &mut Database) { - use crate::sql::hnsw::{DistanceMetric, HnswIndex}; + use crate::sql::hnsw::HnswIndex; for table in db.tables.values_mut() { - // Snapshot which (index_name, column) pairs need rebuilding, - // before we go grabbing column data — keeps the borrow - // structure simple. - let dirty: Vec<(String, String)> = table + // Snapshot which (index_name, column, metric) triples need + // rebuilding, before we go grabbing column data — keeps the + // borrow structure simple. The per-entry metric matters here: + // rebuilding a cosine-built graph as L2 (or vice versa) would + // silently corrupt the topology and break the SQLR-28 probe. + let dirty: Vec<(String, String, DistanceMetric)> = table .hnsw_indexes .iter() .filter(|e| e.needs_rebuild) - .map(|e| (e.name.clone(), e.column_name.clone())) + .map(|e| (e.name.clone(), e.column_name.clone(), e.metric)) .collect(); if dirty.is_empty() { continue; } - for (idx_name, col_name) in dirty { + for (idx_name, col_name, metric) in dirty { // Snapshot every (rowid, vec) for this column. let mut vectors: Vec<(i64, Vec)> = Vec::new(); { @@ -1195,7 +1234,7 @@ fn rebuild_dirty_hnsw_indexes(db: &mut Database) { let snapshot: std::collections::HashMap> = vectors.iter().cloned().collect(); - let mut new_idx = HnswIndex::new(DistanceMetric::L2, 0xC0FFEE); + let mut new_idx = HnswIndex::new(metric, 0xC0FFEE); // Sort by id so the rebuild is deterministic across runs. vectors.sort_by_key(|(id, _)| *id); for (id, v) in &vectors { @@ -1211,6 +1250,26 @@ fn rebuild_dirty_hnsw_indexes(db: &mut Database) { } } +/// Synthesises the CREATE INDEX SQL stored back into `sqlrite_master` +/// for an HNSW index. The metric travels through the SQL via an +/// optional `WITH (metric = '')` clause; L2 indexes omit the clause +/// for byte-identical round-trip with pre-SQLR-28 catalogs. +fn synthesize_hnsw_create_index_sql( + index_name: &str, + table_name: &str, + column_name: &str, + metric: DistanceMetric, +) -> String { + if matches!(metric, DistanceMetric::L2) { + format!("CREATE INDEX {index_name} ON {table_name} USING hnsw ({column_name})") + } else { + format!( + "CREATE INDEX {index_name} ON {table_name} USING hnsw ({column_name}) WITH (metric = '{}')", + metric.sql_name() + ) + } +} + /// Phase 8b — rebuild every FTS index a DELETE / UPDATE-on-text-col /// marked dirty. Mirrors [`rebuild_dirty_hnsw_indexes`]; runs at save /// time under `&mut Database`. Cheap on a clean DB (the `dirty` snapshot @@ -2166,6 +2225,58 @@ mod tests { cleanup(&path); } + /// SQLR-28 — the HNSW metric must round-trip across save+reopen. + /// Without this, the SQL re-synthesised into `sqlrite_master` + /// would drop the metric and a cosine-built graph would reload + /// as L2, silently breaking subsequent cosine probes. + #[test] + fn round_trip_preserves_hnsw_cosine_metric() { + use crate::sql::hnsw::DistanceMetric; + let path = tmp_path("hnsw_metric_roundtrip"); + + { + let mut db = Database::new("test".to_string()); + process_command( + "CREATE TABLE docs (id INTEGER PRIMARY KEY, e VECTOR(2));", + &mut db, + ) + .unwrap(); + for v in &["[1.0, 0.0]", "[0.0, 1.0]", "[0.7071, 0.7071]"] { + process_command(&format!("INSERT INTO docs (e) VALUES ({v});"), &mut db).unwrap(); + } + process_command( + "CREATE INDEX ix_cos ON docs USING hnsw (e) WITH (metric = 'cosine');", + &mut db, + ) + .unwrap(); + save_database(&mut db, &path).expect("save"); + } + + let mut loaded = open_database(&path, "test".to_string()).expect("open"); + { + let table = loaded.get_table("docs".to_string()).expect("docs"); + assert_eq!(table.hnsw_indexes.len(), 1); + assert_eq!( + table.hnsw_indexes[0].metric, + DistanceMetric::Cosine, + "metric should round-trip through CREATE INDEX SQL" + ); + assert_eq!(table.hnsw_indexes[0].index.distance, DistanceMetric::Cosine); + } + + // Cosine probe still finds the self-vector after reopen — the + // optimizer's metric gate should match the loaded entry's + // metric, so this should hit the graph shortcut. + let resp = process_command( + "SELECT id FROM docs ORDER BY vec_distance_cosine(e, [1.0, 0.0]) ASC LIMIT 1;", + &mut loaded, + ) + .unwrap(); + assert!(resp.contains("1 row returned"), "got: {resp}"); + + cleanup(&path); + } + #[test] fn round_trip_rebuilds_fts_index_from_create_sql() { // Phase 8c: FTS indexes now persist their posting lists as diff --git a/src/sql/params.rs b/src/sql/params.rs index 0b9ad23..f2c20ed 100644 --- a/src/sql/params.rs +++ b/src/sql/params.rs @@ -174,11 +174,11 @@ fn format_vector_inner(v: &[f32]) -> String { #[cfg(test)] mod tests { use super::*; - use sqlparser::dialect::SQLiteDialect; + use crate::sql::dialect::SqlriteDialect; use sqlparser::parser::Parser; fn parse_one(sql: &str) -> Statement { - let mut ast = Parser::parse_sql(&SQLiteDialect {}, sql).unwrap(); + let mut ast = Parser::parse_sql(&SqlriteDialect::new(), sql).unwrap(); ast.pop().unwrap() } diff --git a/src/sql/parser/create.rs b/src/sql/parser/create.rs index 1cef587..ed0812e 100644 --- a/src/sql/parser/create.rs +++ b/src/sql/parser/create.rs @@ -347,7 +347,7 @@ mod tests { ); let expected_table_name = String::from("contacts"); - let dialect = SQLiteDialect {}; + let dialect = SqlriteDialect::new(); let mut ast = Parser::parse_sql(&dialect, &sql_input).unwrap(); assert!(ast.len() == 1, "ast has more then one Statement");