Skip to content

Commit fd4d390

Browse files
authored
Merge pull request #3 from sqliteai/hash-text-v2
v1.0.0: switch to TEXT hash primary key and harden sync/model switching
2 parents 1e011b7 + 04ef2aa commit fd4d390

14 files changed

Lines changed: 877 additions & 221 deletions

README.md

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to
3636
- **Hybrid Search**: Combines vector similarity (cosine distance) with FTS5 full-text search for superior retrieval
3737
- **Smart Chunking**: Markdown-aware parsing preserves semantic boundaries
3838
- **Intelligent Sync**: Content-hash change detection skips unchanged files, atomically replaces modified ones, and cleans up deleted ones
39-
- **Transactional Safety**: Every sync operation runs inside a SAVEPOINT transaction - either fully succeeds or fully rolls back, no partially-indexed content
39+
- **Transactional Safety**: Text/file ingests run inside SAVEPOINT transactions, and directory sync uses transactional cleanup plus per-file transactional updates so failed files do not leave partial rows behind
4040
- **Efficient Storage**: Binary embeddings with configurable dimensions
4141
- **Embedding Cache**: Automatically caches computed embeddings, so re-indexing the same text skips redundant API calls and computation
4242
- **Flexible Embedding**: Use local models (llama.cpp) or [vectors.space](https://vectors.space) remote API
@@ -61,6 +61,9 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to
6161

6262
## Getting Started
6363

64+
> [!IMPORTANT]
65+
> Databases created with sqlite-memory versions earlier than `1.0.0` must be rebuilt before use with `1.0.0+`, because the internal schema changed.
66+
6467
### Prerequisites
6568

6669
- SQLite
@@ -74,7 +77,7 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to
7477
```sql
7578
-- Load extensions (sync is optional)
7679
.load ./vector
77-
.load ./sync
80+
.load ./cloudsync
7881
.load ./memory
7982

8083
-- Configure embedding model (choose one):
@@ -84,8 +87,8 @@ SELECT memory_set_model('local', '/path/to/nomic-embed-text-v1.5.Q8_0.gguf');
8487

8588
-- Option 2: Remote embedding via vectors.space (requires free API key from https://vectors.space)
8689
-- The provider name 'openai' selects the vectors.space OpenAI-compatible endpoint.
87-
-- SELECT memory_set_model('openai', 'text-embedding-3-small');
8890
-- SELECT memory_set_apikey('your-vectorspace-api-key');
91+
-- SELECT memory_set_model('openai', 'text-embedding-3-small');
8992

9093
-- Add some knowledge
9194
SELECT memory_add_text('SQLite is a C-language library that implements a small, fast,
@@ -160,7 +163,7 @@ All `memory_add_*` functions use content-hash change detection to avoid redundan
160163
1. **Cleanup**: Removes database entries for files that no longer exist on disk
161164
2. **Scan**: Recursively processes all matching files - adding new ones, replacing modified ones, and skipping unchanged ones
162165

163-
Every sync operation is wrapped in a SQLite SAVEPOINT transaction. If anything fails mid-sync (embedding error, disk issue, etc.), the entire operation rolls back cleanly. There is no risk of partially-indexed files or orphaned entries.
166+
`memory_add_text()` and `memory_add_file()` each run inside a SQLite SAVEPOINT transaction. `memory_add_directory()` performs its cleanup pass transactionally and then processes each file in its own transaction. If one file fails, that file rolls back cleanly and previously committed files remain valid; there are no partially indexed rows or orphaned chunk/FTS entries for the failed file.
164167

165168
This makes all sync functions safe to call repeatedly - for example, on a cron schedule or at agent startup - with minimal overhead.
166169

@@ -258,8 +261,8 @@ FROM dbmem_content;
258261
-- Delete by context
259262
SELECT memory_delete_context('old-project');
260263

261-
-- Delete specific memory
262-
SELECT memory_delete(1234567890);
264+
-- Delete specific memory by hash
265+
SELECT memory_delete('9e3779b97f4a7c15');
263266

264267
-- Clear all memories
265268
SELECT memory_clear();
@@ -279,8 +282,11 @@ cd sqlite-memory
279282
# Build (full build with local + remote engines)
280283
make
281284

282-
# Run tests
285+
# Run parser/core unit tests + extension loading smoke test
283286
make test
287+
288+
# Run the full SQL extension unit suite
289+
make test DEFINES="-DTEST_SQLITE_EXTENSION"
284290
```
285291

286292
### Build Configurations

src/dbmem-embed.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ void dbmem_local_engine_free (dbmem_local_engine_t *engine);
2929

3030
dbmem_remote_engine_t *dbmem_remote_engine_init (void *ctx, const char *provider, const char *model, char err_msg[DBMEM_ERRBUF_SIZE]);
3131
int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *text, int text_len, embedding_result_t *result);
32+
int dbmem_remote_engine_set_apikey (dbmem_remote_engine_t *engine, const char *api_key, char err_msg[DBMEM_ERRBUF_SIZE]);
3233
void dbmem_remote_engine_free (dbmem_remote_engine_t *engine);
3334

3435
// Custom provider (always available, defined in sqlite-memory.c)

src/dbmem-lembed.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,15 @@ void dbmem_logger (enum ggml_log_level level, const char *text, void *user_data)
100100

101101
// MARK: -
102102

103+
// Forward an error message to the context that owns this local engine.
// Silently does nothing when `engine` is NULL or has no context attached,
// so call sites can report errors without guarding the pointers themselves.
static void dbmem_local_set_error(dbmem_local_engine_t *engine, const char *message) {
104+
// No engine or no attached context: there is nowhere to record the error.
if (!engine || !engine->context) return;
105+
dbmem_context_set_error(engine->context, message);
106+
}
107+
103108
dbmem_local_engine_t *dbmem_local_engine_init (void *ctx, const char *model_path, char err_msg[DBMEM_ERRBUF_SIZE]) {
104109
dbmem_local_engine_t *engine = (dbmem_local_engine_t *)dbmemory_zeroalloc(sizeof(dbmem_local_engine_t));
105110
if (!engine) return NULL;
111+
engine->context = (dbmem_context *)ctx;
106112

107113
// set logger
108114
llama_log_set(dbmem_logger, engine);
@@ -212,7 +218,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
212218
// Tokenize
213219
int n_tokens = llama_tokenize(engine->vocab, text, text_len, engine->tokens, engine->tokens_capacity, true, true);
214220
if (n_tokens < 0) {
215-
dbmem_context_set_error(engine->context, "Tokenization failed (text too long?)");
221+
dbmem_local_set_error(engine, "Tokenization failed (text too long?)");
216222
return -1;
217223
}
218224

@@ -242,7 +248,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
242248
// Encode
243249
int ret = llama_encode(engine->ctx, batch);
244250
if (ret != 0) {
245-
dbmem_context_set_error(engine->context, "Llama_encode failed");
251+
dbmem_local_set_error(engine, "Llama_encode failed");
246252
return -1;
247253
}
248254

@@ -255,7 +261,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
255261
}
256262

257263
if (!emb_ptr) {
258-
dbmem_context_set_error(engine->context, "Failed to get embeddings");
264+
dbmem_local_set_error(engine, "Failed to get embeddings");
259265
return -1;
260266
}
261267

@@ -301,5 +307,5 @@ void dbmem_local_engine_free (dbmem_local_engine_t *engine) {
301307
}
302308

303309
llama_backend_free();
310+
dbmemory_free(engine);
304311
}
305-

src/dbmem-parser.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
// One section of the source buffer: its byte range, whether it begins with a
// heading block, and the stripped plain text extracted for it.
typedef struct {
2929
size_t start; // Byte offset in source buffer
3030
size_t end; // Byte end in source buffer
31+
int is_heading; // True if this section starts with a heading block
3132
char *text; // Stripped plain text (allocated)
3233
size_t text_len; // Length of stripped text
3334
} section_t;
@@ -113,8 +114,6 @@ static size_t find_split (const char *text, size_t len, size_t max_chars) {
113114

114115
// Push a section to dynamic array
115116
static int section_push (parse_ctx_t *ctx, size_t start, size_t end, int is_heading) {
116-
UNUSED_PARAM(is_heading);
117-
118117
if (ctx->sec_count >= ctx->sec_cap) {
119118
size_t new_cap = ctx->sec_cap ? ctx->sec_cap * 2 : 16;
120119
section_t *tmp = (section_t *)dbmemory_realloc(ctx->sections, new_cap * sizeof(section_t));
@@ -126,6 +125,7 @@ static int section_push (parse_ctx_t *ctx, size_t start, size_t end, int is_head
126125
section_t *s = &ctx->sections[ctx->sec_count++];
127126
s->start = start;
128127
s->end = end;
128+
s->is_heading = is_heading;
129129
s->text = NULL;
130130
s->text_len = 0;
131131

@@ -607,7 +607,7 @@ static int parse_sections (const char *buffer, size_t buffer_size, bool skip_sem
607607
for (size_t i = 0; i < ctx->sec_count; i++) {
608608
section_t *s = &ctx->sections[i];
609609
// First section or heading starts new section
610-
if (write_idx == 0) {
610+
if (write_idx == 0 || s->is_heading) {
611611
ctx->sections[write_idx++] = *s;
612612
} else {
613613
// Extend previous section to include this one

src/dbmem-rembed.c

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ static size_t cacert_len = sizeof(cacert_pem) - 1;
2626

2727
#ifndef DBMEM_OMIT_CURL
2828
static size_t dbmem_remote_receive_data(void *contents, size_t size, size_t nmemb, void *xdata);
29+
static struct curl_slist *dbmem_remote_build_headers (const char *api_key);
2930
#endif
3031

3132
struct dbmem_remote_engine_t {
@@ -67,6 +68,27 @@ struct dbmem_remote_engine_t {
6768
#include <stdbool.h>
6869
#include <stddef.h>
6970

71+
#ifndef DBMEM_OMIT_CURL
72+
// Build the HTTP header list for remote embedding requests:
//   "Authorization: Bearer <api_key>" followed by "Content-Type: application/json".
// Returns the new list, or NULL if either append fails (a partially built
// list is freed before returning). The caller owns the returned list and
// releases it with curl_slist_free_all().
// NOTE(review): api_key is formatted with %s and is not NULL-checked here;
// callers are expected to pass a non-NULL key - confirm at every call site.
static struct curl_slist *dbmem_remote_build_headers (const char *api_key) {
73+
char auth_header[512];
74+
struct curl_slist *headers = NULL;
75+
struct curl_slist *next = NULL;
76+
77+
// NOTE(review): the snprintf result is not checked, so a key too long for
// the 512-byte buffer is silently truncated in the Authorization header.
snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", api_key);
78+
headers = curl_slist_append(headers, auth_header);
79+
if (!headers) return NULL;
80+
81+
// Keep the append result in a separate variable so the existing list can
// still be freed if this second append fails.
next = curl_slist_append(headers, "Content-Type: application/json");
82+
if (!next) {
83+
curl_slist_free_all(headers);
84+
return NULL;
85+
}
86+
headers = next;
87+
88+
return headers;
89+
}
90+
#endif
91+
7092
static bool text_needs_json_escape (const char *text, size_t *len) {
7193
size_t original_len = *len;
7294
size_t required_len = 0;
@@ -263,11 +285,7 @@ dbmem_remote_engine_t *dbmem_remote_engine_init (void *ctx, const char *provider
263285
#endif
264286

265287
// set up headers
266-
char auth_header[512];
267-
snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", api_key);
268-
struct curl_slist *headers = NULL;
269-
headers = curl_slist_append(headers, auth_header);
270-
if (headers) headers = curl_slist_append(headers, "Content-Type: application/json");
288+
struct curl_slist *headers = dbmem_remote_build_headers(api_key);
271289
if (!headers) {
272290
snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Failed to allocate HTTP headers");
273291
curl_easy_cleanup(curl);
@@ -522,6 +540,36 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t
522540
return 0;
523541
}
524542

543+
// Replace the API key used by an existing remote engine.
//
// With curl (DBMEM_OMIT_CURL not defined): builds a fresh header list for
// the new key, installs it on engine->curl, then frees the previous list.
// Without curl: stores a duplicate of the key in engine->api_key and frees
// the previous copy.
//
// err_msg is optional; when non-NULL it receives a description on failure.
// Returns SQLITE_MISUSE if engine or api_key is NULL, SQLITE_NOMEM if the
// headers or the key copy cannot be allocated, SQLITE_OK otherwise.
// On failure the engine's existing headers / key are left untouched.
int dbmem_remote_engine_set_apikey (dbmem_remote_engine_t *engine, const char *api_key, char err_msg[DBMEM_ERRBUF_SIZE]) {
544+
if (!engine || !api_key) {
545+
if (err_msg) snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Invalid remote engine or API key");
546+
return SQLITE_MISUSE;
547+
}
548+
549+
#ifndef DBMEM_OMIT_CURL
550+
struct curl_slist *headers = dbmem_remote_build_headers(api_key);
551+
if (!headers) {
552+
if (err_msg) snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Failed to allocate HTTP headers");
553+
return SQLITE_NOMEM;
554+
}
555+
556+
// Install the new list before freeing the old one, so the handle is
// repointed before the previous list is released.
// NOTE(review): the curl_easy_setopt result is ignored and engine->curl is
// assumed to be a valid handle - confirm that init guarantees this.
curl_easy_setopt(engine->curl, CURLOPT_HTTPHEADER, headers);
557+
if (engine->headers) curl_slist_free_all(engine->headers);
558+
engine->headers = headers;
559+
#else
560+
// Duplicate first; the old key is only released once the copy succeeded.
char *copy = dbmem_strdup(api_key);
561+
if (!copy) {
562+
if (err_msg) snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Unable to duplicate API key (insufficient memory)");
563+
return SQLITE_NOMEM;
564+
}
565+
566+
if (engine->api_key) dbmemory_free(engine->api_key);
567+
engine->api_key = copy;
568+
#endif
569+
570+
return SQLITE_OK;
571+
}
572+
525573
void dbmem_remote_engine_free (dbmem_remote_engine_t *engine) {
526574
if (!engine) return;
527575

0 commit comments

Comments
 (0)