From: Howard Chu Date: Tue, 28 Jun 2011 05:39:24 +0000 (-0700) Subject: Checkpoint X-Git-Tag: OPENLDAP_REL_ENG_2_4_27~148^2~188 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=d620d4368a7ee17d60f2b381d4c80b22c68ba8e2;p=openldap Checkpoint --- d620d4368a7ee17d60f2b381d4c80b22c68ba8e2 diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c new file mode 100644 index 0000000000..4b27da3ab2 --- /dev/null +++ b/libraries/libmdb/mdb.c @@ -0,0 +1,2460 @@ +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_FILE_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mdb.h" + +#define DEBUG + +#ifdef DEBUG +# define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); } while(0) +#else +# define DPRINTF(...) +#endif + +#define PAGESIZE 4096 +#define MDB_MINKEYS 4 +#define MDB_MAGIC 0xBEEFC0DE +#define MDB_VERSION 1 +#define MAXKEYSIZE 255 + +#define P_INVALID 0xFFFFFFFF + +#define F_ISSET(w, f) (((w) & (f)) == (f)) + +typedef ulong pgno_t; +typedef uint16_t indx_t; + +/* Common header for all page types. Overflow pages + * occupy a number of contiguous pages with no + * headers on any page after the first. + */ +typedef struct MDB_page { /* represents a page of storage */ + pgno_t mp_pgno; /* page number */ +#define P_BRANCH 0x01 /* branch page */ +#define P_LEAF 0x02 /* leaf page */ +#define P_OVERFLOW 0x04 /* overflow page */ +#define P_META 0x08 /* meta page */ +#define P_HEAD 0x10 /* header page */ +#define P_DIRTY 0x20 /* dirty page */ + uint32_t mp_flags; +#define mp_lower mp_pb.pb.pb_lower +#define mp_upper mp_pb.pb.pb_upper +#define mp_pages mp_pb.pb_pages + union page_bounds { + struct { + indx_t pb_lower; /* lower bound of free space */ + indx_t pb_upper; /* upper bound of free space */ + } pb; + uint32_t pb_pages; /* number of overflow pages */ + } mp_pb; + indx_t mp_ptrs[1]; /* dynamic size */ +} MDB_page; + +#define PAGEHDRSZ offsetof(MDB_page, mp_ptrs) + +#define NUMKEYS(p) (((p)->mp_lower - PAGEHDRSZ) >> 1) +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) +#define PAGEFILL(env, p) (1000 * ((env)->me_head.mh_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_head.mh_psize - PAGEHDRSZ)) +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) + +typedef struct MDB_head { /* header page content */ + uint32_t mh_magic; + uint32_t mh_version; + uint32_t mh_flags; + uint32_t mh_psize; /* page size */ + void *mh_address; /* address for fixed mapping */ + size_t mh_mapsize; /* size of mmap region */ +} MDB_head; + +typedef struct MDB_rootstat { + pgno_t mr_root; /* page number of root page */ + MDB_stat mr_stat; +} MDB_rootstat; + +typedef struct MDB_meta { /* meta (footer) page content */ + pgno_t mm_root; /* page number of root page */ + MDB_stat mm_stat; + pgno_t mm_prev_meta; /* previous meta page number */ +#define MDB_TOMBSTONE 0x01 /* file is replaced */ + uint32_t mm_flags; +#define mm_revisions mm_stat.ms_revisions +#define mm_created_at mm_stat.ms_created_at + uint8_t mm_hash[SHA_DIGEST_LENGTH]; +} MDB_meta; + +typedef struct MDB_dhead { /* a dirty page */ + SIMPLEQ_ENTRY(MDB_dpage) md_next; /* queue of dirty pages */ + MDB_page *md_parent; + int md_pi; /* parent index */ + int md_num; +} MDB_dhead; + +typedef struct MDB_dpage { + MDB_dhead h; + MDB_page p; +} MDB_dpage; + +SIMPLEQ_HEAD(dirty_queue, MDB_dpage); + +typedef struct MDB_pageparent { + MDB_page *mp_page; + MDB_page *mp_parent; + int mp_pi; +} MDB_pageparent; + +static MDB_dpage *mdb_newpage(MDB_txn *txn, MDB_page *parent, int parent_idx, int num); +static int mdb_touch(MDB_txn *txn, MDB_pageparent *mp); + +typedef struct MDB_ppage { /* ordered list of pages */ + SLIST_ENTRY(MDB_ppage) mp_entry; + MDB_page *mp_page; + unsigned int mp_ki; /* cursor index on page */ +} MDB_ppage; +SLIST_HEAD(page_stack, MDB_ppage); + +#define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->mc_stack) +#define CURSOR_TOP(c) SLIST_FIRST(&(c)->mc_stack) +#define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->mc_stack, mp_entry) +#define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->mc_stack, p, mp_entry) + +struct MDB_cursor { + MDB_db *mc_db; + MDB_txn *mc_txn; + struct page_stack mc_stack; /* stack of parent pages */ + short mc_initialized; /* 1 if initialized */ + short mc_eof; /* 1 if end is reached */ +}; + +#define METAHASHLEN offsetof(MDB_meta, mm_hash) +#define METADATA(p) ((void *)((char *)p + PAGEHDRSZ)) + +typedef struct MDB_node { +#define mn_pgno mn_p.np_pgno +#define mn_dsize mn_p.np_dsize + union { + pgno_t np_pgno; /* child page number */ + uint32_t np_dsize; /* leaf data size */ + } mn_p; + uint16_t mn_ksize; /* key size */ +#define F_BIGDATA 0x01 /* data put on overflow page */ + uint8_t mn_flags; + char mn_data[1]; +} MDB_node; + +struct MDB_txn { + pgno_t mt_root; /* current / new root page */ + pgno_t mt_next_pgno; /* next unallocated page */ + pgno_t mt_first_pgno; + MDB_env *mt_env; + struct dirty_queue *mt_dirty_queue; /* modified pages */ +#define MDB_TXN_RDONLY 0x01 /* read-only transaction */ +#define MDB_TXN_ERROR 0x02 /* an error has occurred */ + unsigned int mt_flags; +}; + +/* Must be same as MDB_db, minus md_root/md_stat */ +typedef struct MDB_db0 { + unsigned int md_flags; + MDB_cmp_func *md_cmp; /* user compare function */ + MDB_rel_func *md_rel; /* user relocate function */ + MDB_db *md_parent; /* parent tree */ + MDB_env *md_env; +} MDB_db0; + +struct MDB_db { + unsigned int md_flags; + MDB_cmp_func *md_cmp; /* user compare function */ + MDB_rel_func *md_rel; /* user relocate function */ + MDB_db *md_parent; /* parent tree */ + MDB_env *md_env; + pgno_t md_root; /* page number of root page */ + MDB_stat md_stat; +}; + +struct MDB_env { + int me_fd; +#define MDB_FIXPADDING 0x01 /* internal */ + uint32_t me_flags; + char *me_path; + char *me_map; + MDB_head me_head; + MDB_db0 me_db; /* first DB, overlaps with meta */ + MDB_meta me_meta; + MDB_txn *me_txn; /* current write transaction */ + size_t me_mapsize; + off_t me_size; /* current file size */ +}; + +#define NODESIZE offsetof(MDB_node, mn_data) + +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) +#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) +#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i])) +#define NODEKEY(node) (void *)((node)->mn_data) +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) +#define NODEPGNO(node) ((node)->mn_pgno) +#define NODEDSZ(node) ((node)->mn_dsize) + +#define MDB_COMMIT_PAGES 64 /* max number of pages to write in one commit */ +#define MDB_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */ + +static int mdb_search_page_root(MDB_db *db, + MDB_val *key, + MDB_cursor *cursor, int modify, + MDB_pageparent *mpp); +static int mdb_search_page(MDB_db *db, + MDB_txn *txn, MDB_val *key, + MDB_cursor *cursor, int modify, + MDB_pageparent *mpp); + +static int mdbenv_write_header(MDB_env *env); +static int mdbenv_read_header(MDB_env *env); +static int mdb_check_meta_page(MDB_page *p); +static int mdbenv_read_meta(MDB_env *env, pgno_t *p_next); +static int mdbenv_write_meta(MDB_env *env, pgno_t root, + unsigned int flags); +static MDB_page *mdbenv_get_page(MDB_env *env, pgno_t pgno); + +static MDB_node *mdb_search_node(MDB_db *db, MDB_page *mp, + MDB_val *key, int *exactp, unsigned int *kip); +static int mdb_add_node(MDB_db *bt, MDB_page *mp, + indx_t indx, MDB_val *key, MDB_val *data, + pgno_t pgno, uint8_t flags); +static void mdb_del_node(MDB_db *bt, MDB_page *mp, + indx_t indx); +static int mdb_read_data(MDB_db *bt, MDB_page *mp, + MDB_node *leaf, MDB_val *data); + +static int mdb_rebalance(MDB_db *bt, MDB_pageparent *mp); +static int mdb_update_key(MDB_db *bt, MDB_page *mp, + indx_t indx, MDB_val *key); +static int mdb_move_node(MDB_db *bt, + MDB_pageparent *src, indx_t srcindx, + MDB_pageparent *dst, indx_t dstindx); +static int mdb_merge(MDB_db *bt, MDB_pageparent *src, + MDB_pageparent *dst); +static int mdb_split(MDB_db *bt, MDB_page **mpp, + unsigned int *newindxp, MDB_val *newkey, + MDB_val *newdata, pgno_t newpgno); +static MDB_dpage *mdbenv_new_page(MDB_env *env, uint32_t flags, int num); + +static void cursor_pop_page(MDB_cursor *cursor); +static MDB_ppage *cursor_push_page(MDB_cursor *cursor, + MDB_page *mp); + +static int mdb_set_key(MDB_db *bt, MDB_page *mp, + MDB_node *node, MDB_val *key); +static int mdb_sibling(MDB_cursor *cursor, int move_right); +static int mdb_cursor_next(MDB_cursor *cursor, + MDB_val *key, MDB_val *data); +static int mdb_cursor_set(MDB_cursor *cursor, + MDB_val *key, MDB_val *data, int *exactp); +static int mdb_cursor_first(MDB_cursor *cursor, + MDB_val *key, MDB_val *data); + +static size_t mdb_leaf_size(MDB_db *bt, MDB_val *key, + MDB_val *data); +static size_t mdb_branch_size(MDB_db *bt, MDB_val *key); + +static pgno_t mdbenv_compact_tree(MDB_env *env, pgno_t pgno, + MDB_env *envc); + +static int memncmp(const void *s1, size_t n1, + const void *s2, size_t n2); +static int memnrcmp(const void *s1, size_t n1, + const void *s2, size_t n2); + +static int +memncmp(const void *s1, size_t n1, const void *s2, size_t n2) +{ + if (n1 < n2) { + if (memcmp(s1, s2, n1) == 0) + return -1; + } + else if (n1 > n2) { + if (memcmp(s1, s2, n2) == 0) + return 1; + } + return memcmp(s1, s2, n1); +} + +static int +memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2) +{ + const unsigned char *p1; + const unsigned char *p2; + + if (n1 == 0) + return n2 == 0 ? 0 : -1; + + if (n2 == 0) + return n1 == 0 ? 0 : 1; + + p1 = (const unsigned char *)s1 + n1 - 1; + p2 = (const unsigned char *)s2 + n2 - 1; + + while (*p1 == *p2) { + if (p1 == s1) + return (p2 == s2) ? 0 : -1; + if (p2 == s2) + return (p1 == p2) ? 0 : 1; + p1--; + p2--; + } + return *p1 - *p2; +} + +int +mdb_cmp(MDB_db *db, const MDB_val *a, const MDB_val *b) +{ + return db->md_cmp(a, b); +} + +static int +_mdb_cmp(MDB_db *db, const MDB_val *key1, const MDB_val *key2) +{ + if (F_ISSET(db->md_flags, MDB_REVERSEKEY)) + return memnrcmp(key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size); + else + return memncmp((char *)key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size); +} + +/* Allocate new page(s) for writing */ +static MDB_dpage * +mdb_newpage(MDB_txn *txn, MDB_page *parent, int parent_idx, int num) +{ + MDB_dpage *dp; + + if ((dp = malloc(txn->mt_env->me_head.mh_psize * num + sizeof(MDB_dhead))) == NULL) + return NULL; + dp->h.md_num = num; + dp->h.md_parent = parent; + dp->h.md_pi = parent_idx; + SIMPLEQ_INSERT_TAIL(txn->mt_dirty_queue, dp, h.md_next); + dp->p.mp_pgno = txn->mt_next_pgno; + txn->mt_next_pgno += num; + + return dp; +} + +/* Touch a page: make it dirty and re-insert into tree with updated pgno. + */ +static int +mdb_touch(MDB_txn *txn, MDB_pageparent *pp) +{ + int rc; + MDB_page *mp = pp->mp_page; + pgno_t pgno; + assert(txn != NULL); + assert(pp != NULL); + + if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + MDB_dpage *dp; + DPRINTF("touching page %lu -> %lu", mp->mp_pgno, txn->mt_next_pgno); + if ((dp = mdb_newpage(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL) + return ENOMEM; + pgno = dp->p.mp_pgno; + bcopy(mp, &dp->p, txn->mt_env->me_head.mh_psize); + mp = &dp->p; + mp->mp_pgno = pgno; + mp->mp_flags |= P_DIRTY; + + /* Update the page number to new touched page. */ + if (pp->mp_parent != NULL) + NODEPGNO(NODEPTR(pp->mp_parent, pp->mp_pi)) = mp->mp_pgno; + pp->mp_page = mp; + } + return 0; +} + +int +mdbenv_sync(MDB_env *env) +{ + int rc = 0; + if (!F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (fsync(env->me_fd)) + rc = errno; + } + return rc; +} + +int +mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) +{ + MDB_txn *txn; + int rc; + + if (!rdonly && env->me_txn != NULL) { + DPRINTF("write transaction already begun"); + return EBUSY; + } + + if ((txn = calloc(1, sizeof(*txn))) == NULL) { + DPRINTF("calloc: %s", strerror(errno)); + return ENOMEM; + } + + if (rdonly) { + txn->mt_flags |= MDB_TXN_RDONLY; + } else { + txn->mt_dirty_queue = calloc(1, sizeof(*txn->mt_dirty_queue)); + if (txn->mt_dirty_queue == NULL) { + free(txn); + return ENOMEM; + } + SIMPLEQ_INIT(txn->mt_dirty_queue); + +#if 0 + DPRINTF("taking write lock on txn %p", txn); + if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) { + DPRINTF("flock: %s", strerror(errno)); + errno = EBUSY; + free(txn->dirty_queue); + free(txn); + return NULL; + } +#endif + env->me_txn = txn; + } + + txn->mt_env = env; + + if ((rc = mdbenv_read_meta(env, &txn->mt_next_pgno)) != MDB_SUCCESS) { + mdb_txn_abort(txn); + return rc; + } + + txn->mt_first_pgno = txn->mt_next_pgno; + txn->mt_root = env->me_meta.mm_root; + DPRINTF("begin transaction on mdbenv %p, root page %lu", env, txn->mt_root); + + *ret = txn; + return MDB_SUCCESS; +} + +void +mdb_txn_abort(MDB_txn *txn) +{ + MDB_dpage *dp; + MDB_env *env; + + if (txn == NULL) + return; + + env = txn->mt_env; + DPRINTF("abort transaction on mdbenv %p, root page %lu", env, txn->mt_root); + + if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + /* Discard all dirty pages. + */ + while (!SIMPLEQ_EMPTY(txn->mt_dirty_queue)) { + dp = SIMPLEQ_FIRST(txn->mt_dirty_queue); + SIMPLEQ_REMOVE_HEAD(txn->mt_dirty_queue, h.md_next); + free(dp); + } + +#if 0 + DPRINTF("releasing write lock on txn %p", txn); + txn->bt->txn = NULL; + if (flock(txn->bt->fd, LOCK_UN) != 0) { + DPRINTF("failed to unlock fd %d: %s", + txn->bt->fd, strerror(errno)); + } +#endif + free(txn->mt_dirty_queue); + } + + if (txn == env->me_txn) + env->me_txn = NULL; + free(txn); +} + +int +mdb_txn_commit(MDB_txn *txn) +{ + int n, done; + ssize_t rc; + off_t size; + MDB_dpage *dp; + MDB_env *env; + struct iovec iov[MDB_COMMIT_PAGES]; + + assert(txn != NULL); + assert(txn->mt_env != NULL); + + env = txn->mt_env; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + DPRINTF("attempt to commit read-only transaction"); + mdb_txn_abort(txn); + return EPERM; + } + + if (txn != env->me_txn) { + DPRINTF("attempt to commit unknown transaction"); + mdb_txn_abort(txn); + return EINVAL; + } + + if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { + DPRINTF("error flag is set, can't commit"); + mdb_txn_abort(txn); + return EINVAL; + } + + if (SIMPLEQ_EMPTY(txn->mt_dirty_queue)) + goto done; + + if (F_ISSET(env->me_flags, MDB_FIXPADDING)) { + size = lseek(env->me_fd, 0, SEEK_END); + size += env->me_head.mh_psize - (size % env->me_head.mh_psize); + DPRINTF("extending to multiple of page size: %lu", size); + if (ftruncate(env->me_fd, size) != 0) { + n = errno; + DPRINTF("ftruncate: %s", strerror(errno)); + mdb_txn_abort(txn); + return n; + } + env->me_flags ^= MDB_FIXPADDING; + } + + DPRINTF("committing transaction on mdbenv %p, root page %lu", + env, txn->mt_root); + + /* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done. + */ + do { + n = 0; + done = 1; + SIMPLEQ_FOREACH(dp, txn->mt_dirty_queue, h.md_next) { + DPRINTF("committing page %lu", dp->p.mp_pgno); + iov[n].iov_len = env->me_head.mh_psize * dp->h.md_num; + iov[n].iov_base = &dp->p; + /* clear dirty flag */ + dp->p.mp_flags &= ~P_DIRTY; + if (++n >= MDB_COMMIT_PAGES) { + done = 0; + break; + } + } + + if (n == 0) + break; + + DPRINTF("committing %u dirty pages", n); + rc = writev(env->me_fd, iov, n); + if (rc != (ssize_t)env->me_head.mh_psize*n) { + n = errno; + if (rc > 0) + DPRINTF("short write, filesystem full?"); + else + DPRINTF("writev: %s", strerror(errno)); + mdb_txn_abort(txn); + return n; + } + + /* Drop the dirty pages. + */ + while (!SIMPLEQ_EMPTY(txn->mt_dirty_queue)) { + dp = SIMPLEQ_FIRST(txn->mt_dirty_queue); + SIMPLEQ_REMOVE_HEAD(txn->mt_dirty_queue, h.md_next); + free(dp); + if (--n == 0) + break; + } + } while (!done); + + if ((n = mdbenv_sync(env)) != 0 || + (n = mdbenv_write_meta(env, txn->mt_root, 0)) != MDB_SUCCESS || + (n = mdbenv_sync(env)) != 0) { + mdb_txn_abort(txn); + return n; + } + txn = NULL; + env->me_txn = NULL; + +done: + mdb_txn_abort(txn); + + return MDB_SUCCESS; +} + +static int +mdbenv_write_header(MDB_env *env) +{ + struct stat sb; + MDB_head *h; + MDB_page *p; + ssize_t rc; + unsigned int psize; + + DPRINTF("writing header page"); + assert(env != NULL); + + psize = sysconf(_SC_PAGE_SIZE); + + if ((p = calloc(1, psize)) == NULL) + return ENOMEM; + p->mp_flags = P_HEAD; + + env->me_head.mh_psize = psize; + env->me_head.mh_flags = env->me_flags & 0xffff; + + h = METADATA(p); + bcopy(&env->me_head, h, sizeof(*h)); + + rc = write(env->me_fd, p, env->me_head.mh_psize); + free(p); + if (rc != (ssize_t)env->me_head.mh_psize) { + int err = errno; + if (rc > 0) + DPRINTF("short write, filesystem full?"); + return err; + } + + return MDB_SUCCESS; +} + +static int +mdbenv_read_header(MDB_env *env) +{ + char page[PAGESIZE]; + MDB_page *p; + MDB_head *h; + int rc; + + assert(env != NULL); + + /* We don't know the page size yet, so use a minimum value. + */ + + if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) { + return ENOENT; + } else if (rc != PAGESIZE) { + if (rc > 0) + errno = EINVAL; + DPRINTF("read: %s", strerror(errno)); + return errno; + } + + p = (MDB_page *)page; + + if (!F_ISSET(p->mp_flags, P_HEAD)) { + DPRINTF("page %lu not a header page", p->mp_pgno); + return EINVAL; + } + + h = METADATA(p); + if (h->mh_magic != MDB_MAGIC) { + DPRINTF("header has invalid magic"); + return EINVAL; + } + + if (h->mh_version != MDB_VERSION) { + DPRINTF("database is version %u, expected version %u", + h->mh_version, MDB_VERSION); + return EINVAL; + } + + bcopy(h, &env->me_head, sizeof(*h)); + return 0; +} + +static int +mdbenv_write_meta(MDB_env *env, pgno_t root, unsigned int flags) +{ + MDB_dpage *dp; + MDB_meta *meta; + ssize_t rc; + + DPRINTF("writing meta page for root page %lu", root); + + assert(env != NULL); + assert(env->me_txn != NULL); + + if ((dp = mdbenv_new_page(env, P_META, 1)) == NULL) + return ENOMEM; + + env->me_meta.mm_prev_meta = env->me_meta.mm_root; + env->me_meta.mm_root = root; + env->me_meta.mm_flags = flags; + env->me_meta.mm_created_at = time(0); + env->me_meta.mm_revisions++; + SHA1((unsigned char *)&env->me_meta, METAHASHLEN, env->me_meta.mm_hash); + + /* Copy the meta data changes to the new meta page. */ + meta = METADATA(&dp->p); + bcopy(&env->me_meta, meta, sizeof(*meta)); + + rc = write(env->me_fd, &dp->p, env->me_head.mh_psize); + SIMPLEQ_REMOVE_HEAD(env->me_txn->mt_dirty_queue, h.md_next); + free(dp); + if (rc != (ssize_t)env->me_head.mh_psize) { + int err = errno; + if (rc > 0) + DPRINTF("short write, filesystem full?"); + return err; + } + + if ((env->me_size = lseek(env->me_fd, 0, SEEK_END)) == -1) { + DPRINTF("failed to update file size: %s", strerror(errno)); + env->me_size = 0; + } + + return MDB_SUCCESS; +} + +/* Returns true if page p is a valid meta page, false otherwise. + */ +static int +mdb_check_meta_page(MDB_page *p) +{ + MDB_meta *m; + unsigned char hash[SHA_DIGEST_LENGTH]; + + m = METADATA(p); + if (!F_ISSET(p->mp_flags, P_META)) { + DPRINTF("page %lu not a meta page", p->mp_pgno); + return EINVAL; + } + + if (m->mm_root >= p->mp_pgno && m->mm_root != P_INVALID) { + DPRINTF("page %lu points to an invalid root page", p->mp_pgno); + return EINVAL; + } + + SHA1((unsigned char *)m, METAHASHLEN, hash); + if (bcmp(hash, m->mm_hash, SHA_DIGEST_LENGTH) != 0) { + DPRINTF("page %lu has an invalid digest", p->mp_pgno); + return EINVAL; + } + + return 0; +} + +static int +mdbenv_read_meta(MDB_env *env, pgno_t *p_next) +{ + MDB_page *mp; + MDB_meta *meta; + pgno_t meta_pgno, next_pgno; + off_t size; + + assert(env != NULL); + + if ((size = lseek(env->me_fd, 0, SEEK_END)) == -1) + goto fail; + + DPRINTF("mdbenv_read_meta: size = %lu", size); + + if (size < env->me_size) { + DPRINTF("file has shrunk!"); + errno = EIO; + goto fail; + } + + if (size == env->me_head.mh_psize) { /* there is only the header */ + if (p_next != NULL) + *p_next = 1; + return MDB_SUCCESS; /* new file */ + } + + next_pgno = size / env->me_head.mh_psize; + if (next_pgno == 0) { + DPRINTF("corrupt file"); + errno = EIO; + goto fail; + } + + meta_pgno = next_pgno - 1; + + if (size % env->me_head.mh_psize != 0) { + DPRINTF("filesize not a multiple of the page size!"); + env->me_flags |= MDB_FIXPADDING; + next_pgno++; + } + + if (p_next != NULL) + *p_next = next_pgno; + + if (size == env->me_size) { + DPRINTF("size unchanged, keeping current meta page"); + if (F_ISSET(env->me_meta.mm_flags, MDB_TOMBSTONE)) { + DPRINTF("file is dead"); + errno = ESTALE; + return MDB_FAIL; + } else + return MDB_SUCCESS; + } else { + env->me_size = size; + } + + while (meta_pgno > 0) { + if ((mp = mdbenv_get_page(env, meta_pgno)) == NULL) + break; + if (!mdb_check_meta_page(mp)) { + meta = METADATA(mp); + DPRINTF("flags = 0x%x", meta->mm_flags); + if (F_ISSET(meta->mm_flags, MDB_TOMBSTONE)) { + DPRINTF("file is dead"); + errno = ESTALE; + return MDB_FAIL; + } else { + /* Make copy of last meta page. */ + bcopy(meta, &env->me_meta, sizeof(env->me_meta)); + return MDB_SUCCESS; + } + } + --meta_pgno; /* scan backwards to first valid meta page */ + } + + errno = EIO; +fail: + if (p_next != NULL) + *p_next = P_INVALID; + return MDB_FAIL; +} + +int +mdbenv_create(MDB_env **env, size_t size) +{ + MDB_env *e; + + if (!size) return EINVAL; + + e = calloc(1, sizeof(*e)); + if (!e) return ENOMEM; + + e->me_head.mh_magic = MDB_MAGIC; + e->me_head.mh_version = MDB_VERSION; + e->me_mapsize = e->me_head.mh_mapsize = size; + e->me_db.md_env = e; + *env = e; + return MDB_SUCCESS; +} + +int +mdbenv_open2(MDB_env *env, unsigned int flags) +{ + int i, newenv = 0; + + i = fcntl(env->me_fd, F_GETFL, 0); + if (fcntl(env->me_fd, F_SETFL, i | O_APPEND) == -1) + return errno; + + env->me_flags = flags; + env->me_flags &= ~MDB_FIXPADDING; + env->me_meta.mm_root = P_INVALID; + + if ((i = mdbenv_read_header(env)) != 0) { + if (i != ENOENT) + return i; + DPRINTF("new mdbenv"); + newenv = 1; + } + + i = MAP_SHARED; + if (env->me_head.mh_address && (flags & MDB_FIXEDMAP)) + i |= MAP_FIXED; + env->me_map = mmap(env->me_head.mh_address, env->me_mapsize, PROT_READ, i, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) + return errno; + + if (newenv) { + env->me_head.mh_mapsize = env->me_mapsize; + if (flags & MDB_FIXEDMAP) + env->me_head.mh_address = env->me_map; + i = mdbenv_write_header(env); + if (i != MDB_SUCCESS) { + munmap(env->me_map, env->me_mapsize); + return i; + } + } + + if ((i = mdbenv_read_meta(env, NULL)) != 0) + return i; + + DPRINTF("opened database version %u, pagesize %u", + env->me_head.mh_version, env->me_head.mh_psize); + DPRINTF("timestamp: %s", ctime(&env->me_meta.mm_created_at)); + DPRINTF("depth: %u", env->me_meta.mm_stat.ms_depth); + DPRINTF("entries: %lu", env->me_meta.mm_stat.ms_entries); + DPRINTF("revisions: %lu", env->me_meta.mm_stat.ms_revisions); + DPRINTF("branch pages: %lu", env->me_meta.mm_stat.ms_branch_pages); + DPRINTF("leaf pages: %lu", env->me_meta.mm_stat.ms_leaf_pages); + DPRINTF("overflow pages: %lu", env->me_meta.mm_stat.ms_overflow_pages); + DPRINTF("root: %lu", env->me_meta.mm_root); + DPRINTF("previous me_meta.me_page: %lu", env->me_meta.mm_prev_meta); + + return MDB_SUCCESS; +} + +int +mdbenv_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) +{ + int oflags, rc; + + if (F_ISSET(flags, MDB_RDONLY)) + oflags = O_RDONLY; + else + oflags = O_RDWR | O_CREAT | O_APPEND; + + if ((env->me_fd = open(path, oflags, mode)) == -1) + return errno; + + if ((rc = mdbenv_open2(env, flags)) != MDB_SUCCESS) { + close(env->me_fd); + env->me_fd = -1; + } else { + env->me_path = strdup(path); + DPRINTF("opened dbenv %p", env); + } + + return rc; +} + +void +mdbenv_close(MDB_env *env) +{ + if (env == NULL) + return; + + free(env->me_path); + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); + } + close(env->me_fd); +} + +/* Search for key within a leaf page, using binary search. + * Returns the smallest entry larger or equal to the key. + * If exactp is non-null, stores whether the found entry was an exact match + * in *exactp (1 or 0). + * If kip is non-null, stores the index of the found entry in *kip. + * If no entry larger or equal to the key is found, returns NULL. + */ +static MDB_node * +mdb_search_node(MDB_db *bt, MDB_page *mp, MDB_val *key, + int *exactp, unsigned int *kip) +{ + unsigned int i = 0; + int low, high; + int rc = 0; + MDB_node *node; + MDB_val nodekey; + + DPRINTF("searching %lu keys in %s page %lu", + NUMKEYS(mp), + IS_LEAF(mp) ? "leaf" : "branch", + mp->mp_pgno); + + assert(NUMKEYS(mp) > 0); + + bzero(&nodekey, sizeof(nodekey)); + + low = IS_LEAF(mp) ? 0 : 1; + high = NUMKEYS(mp) - 1; + while (low <= high) { + i = (low + high) >> 1; + node = NODEPTR(mp, i); + + nodekey.mv_size = node->mn_ksize; + nodekey.mv_data = NODEKEY(node); + + if (bt->md_cmp) + rc = bt->md_cmp(key, &nodekey); + else + rc = _mdb_cmp(bt, key, &nodekey); + + if (IS_LEAF(mp)) + DPRINTF("found leaf index %u [%.*s], rc = %i", + i, (int)nodekey.mv_size, (char *)nodekey.mv_data, rc); + else + DPRINTF("found branch index %u [%.*s -> %lu], rc = %i", + i, (int)node->mn_ksize, (char *)NODEKEY(node), + node->mn_pgno, rc); + + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + + if (rc > 0) { /* Found entry is less than the key. */ + i++; /* Skip to get the smallest entry larger than key. */ + if (i >= NUMKEYS(mp)) + /* There is no entry larger or equal to the key. */ + return NULL; + } + if (exactp) + *exactp = (rc == 0); + if (kip) /* Store the key index if requested. */ + *kip = i; + + return NODEPTR(mp, i); +} + +static void +cursor_pop_page(MDB_cursor *cursor) +{ + MDB_ppage *top; + + top = CURSOR_TOP(cursor); + CURSOR_POP(cursor); + + DPRINTF("popped page %lu off cursor %p", top->mp_page->mp_pgno, cursor); + + free(top); +} + +static MDB_ppage * +cursor_push_page(MDB_cursor *cursor, MDB_page *mp) +{ + MDB_ppage *ppage; + + DPRINTF("pushing page %lu on cursor %p", mp->mp_pgno, cursor); + + if ((ppage = calloc(1, sizeof(*ppage))) == NULL) + return NULL; + ppage->mp_page = mp; + CURSOR_PUSH(cursor, ppage); + return ppage; +} + +static MDB_page * +mdbenv_get_page(MDB_env *env, pgno_t pgno) +{ + MDB_page *p = NULL; + MDB_txn *txn = env->me_txn; + + if (txn && pgno >= txn->mt_first_pgno && + !SIMPLEQ_EMPTY(txn->mt_dirty_queue)) { + MDB_dpage *dp; + SIMPLEQ_FOREACH(dp, txn->mt_dirty_queue, h.md_next) { + if (dp->p.mp_pgno == pgno) { + p = &dp->p; + break; + } + } + } else { + p = (MDB_page *)(env->me_map + env->me_head.mh_psize * pgno); + } + return p; +} + +static int +mdb_search_page_root(MDB_db *bt, MDB_val *key, + MDB_cursor *cursor, int modify, MDB_pageparent *mpp) +{ + MDB_page *mp = mpp->mp_page; + int rc; + + if (cursor && cursor_push_page(cursor, mp) == NULL) + return MDB_FAIL; + + while (IS_BRANCH(mp)) { + unsigned int i = 0; + MDB_node *node; + + DPRINTF("branch page %lu has %lu keys", mp->mp_pgno, NUMKEYS(mp)); + assert(NUMKEYS(mp) > 1); + DPRINTF("found index 0 to page %lu", NODEPGNO(NODEPTR(mp, 0))); + + if (key == NULL) /* Initialize cursor to first page. */ + i = 0; + else { + int exact; + node = mdb_search_node(bt, mp, key, &exact, &i); + if (node == NULL) + i = NUMKEYS(mp) - 1; + else if (!exact) { + assert(i > 0); + i--; + } + } + + if (key) + DPRINTF("following index %u for key %.*s", + i, (int)key->mv_size, (char *)key->mv_data); + assert(i >= 0 && i < NUMKEYS(mp)); + node = NODEPTR(mp, i); + + if (cursor) + CURSOR_TOP(cursor)->mp_ki = i; + + mpp->mp_parent = mp; + if ((mp = mdbenv_get_page(bt->md_env, NODEPGNO(node))) == NULL) + return MDB_FAIL; + mpp->mp_pi = i; + mpp->mp_page = mp; + + if (cursor && cursor_push_page(cursor, mp) == NULL) + return MDB_FAIL; + + if (modify && (rc = mdb_touch(bt->md_env->me_txn, mpp))) + return rc; + mp = mpp->mp_page; + } + + if (!IS_LEAF(mp)) { + DPRINTF("internal error, index points to a %02X page!?", + mp->mp_flags); + return MDB_FAIL; + } + + DPRINTF("found leaf page %lu for key %.*s", mp->mp_pgno, + key ? (int)key->mv_size : 0, key ? (char *)key->mv_data : NULL); + + return MDB_SUCCESS; +} + +/* Search for the page a given key should be in. + * Stores a pointer to the found page in *mpp. + * If key is NULL, search for the lowest page (used by mdb_cursor_first). + * If cursor is non-null, pushes parent pages on the cursor stack. + * If modify is true, visited pages are updated with new page numbers. + */ +static int +mdb_search_page(MDB_db *db, MDB_txn *txn, MDB_val *key, + MDB_cursor *cursor, int modify, MDB_pageparent *mpp) +{ + int rc; + pgno_t root; + + /* Can't modify pages outside a transaction. */ + if (txn == NULL && modify) { + return EINVAL; + } + + /* Choose which root page to start with. If a transaction is given + * use the root page from the transaction, otherwise read the last + * committed root page. + */ + if (txn == NULL) { + if ((rc = mdbenv_read_meta(db->md_env, NULL)) != MDB_SUCCESS) + return rc; + root = db->md_env->me_meta.mm_root; + } else if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { + DPRINTF("transaction has failed, must abort"); + return EINVAL; + } else + root = txn->mt_root; + + if (root == P_INVALID) { /* Tree is empty. */ + DPRINTF("tree is empty"); + return ENOENT; + } + + if ((mpp->mp_page = mdbenv_get_page(db->md_env, root)) == NULL) + return MDB_FAIL; + + DPRINTF("root page has flags 0x%X", mpp->mp_page->mp_flags); + + if (modify && !F_ISSET(mpp->mp_page->mp_flags, P_DIRTY)) { + mpp->mp_parent = NULL; + mpp->mp_pi = 0; + if ((rc = mdb_touch(txn, mpp))) + return rc; + txn->mt_root = mpp->mp_page->mp_pgno; + } + + return mdb_search_page_root(db, key, cursor, modify, mpp); +} + +static int +mdb_read_data(MDB_db *db, MDB_page *mp, MDB_node *leaf, + MDB_val *data) +{ + MDB_page *omp; /* overflow mpage */ + size_t psz; + size_t max; + size_t sz = 0; + pgno_t pgno; + + bzero(data, sizeof(*data)); + max = db->md_env->me_head.mh_psize - PAGEHDRSZ; + + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = leaf->mn_dsize; + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } + + /* Read overflow data. + */ + data->mv_size = leaf->mn_dsize; + bcopy(NODEDATA(leaf), &pgno, sizeof(pgno)); + if ((omp = mdbenv_get_page(db->md_env, pgno)) == NULL) { + DPRINTF("read overflow page %lu failed", pgno); + return MDB_FAIL; + } + data->mv_data = omp; + + return MDB_SUCCESS; +} + +int +mdb_get(MDB_db *db, MDB_txn *txn, + MDB_val *key, MDB_val *data) +{ + int rc, exact; + MDB_node *leaf; + MDB_pageparent mpp; + + assert(key); + assert(data); + DPRINTF("===> get key [%.*s]", (int)key->mv_size, (char *)key->mv_data); + + if (db == NULL) + return EINVAL; + + if (txn != NULL && db->md_env != txn->mt_env) { + return EINVAL; + } + + if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + return EINVAL; + } + + if ((rc = mdb_search_page(db, txn, key, NULL, 0, &mpp)) != MDB_SUCCESS) + return rc; + + leaf = mdb_search_node(db, mpp.mp_page, key, &exact, NULL); + if (leaf && exact) + rc = mdb_read_data(db, mpp.mp_page, leaf, data); + else { + rc = ENOENT; + } + + return rc; +} + +static int +mdb_sibling(MDB_cursor *cursor, int move_right) +{ + int rc; + MDB_node *indx; + MDB_ppage *parent, *top; + MDB_page *mp; + + top = CURSOR_TOP(cursor); + if ((parent = SLIST_NEXT(top, mp_entry)) == NULL) { + return ENOENT; /* root has no siblings */ + } + + DPRINTF("parent page is page %lu, index %u", + parent->mp_page->mp_pgno, parent->mp_ki); + + cursor_pop_page(cursor); + if (move_right ? (parent->mp_ki + 1 >= NUMKEYS(parent->mp_page)) + : (parent->mp_ki == 0)) { + DPRINTF("no more keys left, moving to %s sibling", + move_right ? "right" : "left"); + if ((rc = mdb_sibling(cursor, move_right)) != MDB_SUCCESS) + return rc; + parent = CURSOR_TOP(cursor); + } else { + if (move_right) + parent->mp_ki++; + else + parent->mp_ki--; + DPRINTF("just moving to %s index key %u", + move_right ? "right" : "left", parent->mp_ki); + } + assert(IS_BRANCH(parent->mp_page)); + +#if 0 + indx = NODEPTR(parent->mp_page, parent->mp_ki); + if ((mp = mdbenv_get_page(cursor->bt, indx->n_pgno)) == NULL) + return MDB_FAIL; + mp->parent = parent->mp_page; + mp->parent_index = parent->mp_ki; +#endif + + cursor_push_page(cursor, mp); + + return MDB_SUCCESS; +} + +static int +mdb_set_key(MDB_db *bt, MDB_page *mp, MDB_node *node, + MDB_val *key) +{ + if (key == NULL) + return 0; + + key->mv_size = node->mn_ksize; + key->mv_data = NODEKEY(node); + + return 0; +} + +static int +mdb_cursor_next(MDB_cursor *cursor, MDB_val *key, MDB_val *data) +{ + MDB_ppage *top; + MDB_page *mp; + MDB_node *leaf; + + if (cursor->mc_eof) { + return ENOENT; + } + + assert(cursor->mc_initialized); + + top = CURSOR_TOP(cursor); + mp = top->mp_page; + + DPRINTF("cursor_next: top page is %lu in cursor %p", mp->mp_pgno, cursor); + + if (top->mp_ki + 1 >= NUMKEYS(mp)) { + DPRINTF("=====> move to next sibling page"); + if (mdb_sibling(cursor, 1) != MDB_SUCCESS) { + cursor->mc_eof = 1; + return ENOENT; + } + top = CURSOR_TOP(cursor); + mp = top->mp_page; + DPRINTF("next page is %lu, key index %u", mp->mp_pgno, top->mp_ki); + } else + top->mp_ki++; + + DPRINTF("==> cursor points to page %lu with %lu keys, key index %u", + mp->mp_pgno, NUMKEYS(mp), top->mp_ki); + + assert(IS_LEAF(mp)); + leaf = NODEPTR(mp, top->mp_ki); + + if (data && mdb_read_data(cursor->mc_db, mp, leaf, data) != MDB_SUCCESS) + return MDB_FAIL; + + return mdb_set_key(cursor->mc_db, mp, leaf, key); +} + +static int +mdb_cursor_set(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + int *exactp) +{ + int rc; + MDB_node *leaf; + MDB_ppage *top; + MDB_pageparent mpp; + + assert(cursor); + assert(key); + assert(key->mv_size > 0); + + rc = mdb_search_page(cursor->mc_db, cursor->mc_txn, key, cursor, 0, &mpp); + if (rc != MDB_SUCCESS) + return rc; + assert(IS_LEAF(mpp.mp_page)); + + top = CURSOR_TOP(cursor); + leaf = mdb_search_node(cursor->mc_db, mpp.mp_page, key, exactp, &top->mp_ki); + if (exactp != NULL && !*exactp) { + /* MDB_CURSOR_EXACT specified and not an exact match. */ + return ENOENT; + } + + if (leaf == NULL) { + DPRINTF("===> inexact leaf not found, goto sibling"); + if ((rc = mdb_sibling(cursor, 1)) != MDB_SUCCESS) + return rc; /* no entries matched */ + top = CURSOR_TOP(cursor); + top->mp_ki = 0; + mpp.mp_page = top->mp_page; + assert(IS_LEAF(mpp.mp_page)); + leaf = NODEPTR(mpp.mp_page, 0); + } + + cursor->mc_initialized = 1; + cursor->mc_eof = 0; + + if (data && (rc = mdb_read_data(cursor->mc_db, mpp.mp_page, leaf, data)) != MDB_SUCCESS) + return rc; + + rc = mdb_set_key(cursor->mc_db, mpp.mp_page, leaf, key); + if (rc == MDB_SUCCESS) { + DPRINTF("==> cursor placed on key %.*s", + (int)key->mv_size, (char *)key->mv_data); + ; + } + + return rc; +} + +static int +mdb_cursor_first(MDB_cursor *cursor, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_pageparent mpp; + MDB_node *leaf; + + rc = mdb_search_page(cursor->mc_db, cursor->mc_txn, NULL, cursor, 0, &mpp); + if (rc != MDB_SUCCESS) + return rc; + assert(IS_LEAF(mpp.mp_page)); + + leaf = NODEPTR(mpp.mp_page, 0); + cursor->mc_initialized = 1; + cursor->mc_eof = 0; + + if (data && (rc = mdb_read_data(cursor->mc_db, mpp.mp_page, leaf, data)) != MDB_SUCCESS) + return rc; + + return mdb_set_key(cursor->mc_db, mpp.mp_page, leaf, key); +} + +int +mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op) +{ + int rc; + int exact = 0; + + assert(cursor); + + switch (op) { + case MDB_CURSOR: + case MDB_CURSOR_EXACT: + while (CURSOR_TOP(cursor) != NULL) + cursor_pop_page(cursor); + if (key == NULL || key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + rc = EINVAL; + } else if (op == MDB_CURSOR_EXACT) + rc = mdb_cursor_set(cursor, key, data, &exact); + else + rc = mdb_cursor_set(cursor, key, data, NULL); + break; + case MDB_NEXT: + if (!cursor->mc_initialized) + rc = mdb_cursor_first(cursor, key, data); + else + rc = mdb_cursor_next(cursor, key, data); + break; + case MDB_FIRST: + while (CURSOR_TOP(cursor) != NULL) + cursor_pop_page(cursor); + rc = mdb_cursor_first(cursor, key, data); + break; + default: + DPRINTF("unhandled/unimplemented cursor operation %u", op); + rc = EINVAL; + break; + } + + return rc; +} + +/* Allocate a page and initialize it + */ +static MDB_dpage * +mdbenv_new_page(MDB_env *env, uint32_t flags, int num) +{ + MDB_dpage *dp; + + assert(env != NULL); + assert(env->me_txn != NULL); + + DPRINTF("allocating new mpage %lu, page size %u", + env->me_txn->mt_next_pgno, env->me_head.mh_psize); + if ((dp = mdb_newpage(env->me_txn, NULL, 0, num)) == NULL) + return NULL; + dp->p.mp_flags = flags; + dp->p.mp_lower = PAGEHDRSZ; + dp->p.mp_upper = env->me_head.mh_psize; + + if (IS_BRANCH(&dp->p)) + env->me_meta.mm_stat.ms_branch_pages++; + else if (IS_LEAF(&dp->p)) + env->me_meta.mm_stat.ms_leaf_pages++; + else if (IS_OVERFLOW(&dp->p)) { + env->me_meta.mm_stat.ms_overflow_pages += num; + dp->p.mp_pages = num; + } + + return dp; +} + +static size_t +mdb_leaf_size(MDB_db *db, MDB_val *key, MDB_val *data) +{ + size_t sz; + + sz = LEAFSIZE(key, data); + if (data->mv_size >= db->md_env->me_head.mh_psize / MDB_MINKEYS) { + /* put on overflow page */ + sz -= data->mv_size - sizeof(pgno_t); + } + + return sz + sizeof(indx_t); +} + +static size_t +mdb_branch_size(MDB_db *db, MDB_val *key) +{ + size_t sz; + + sz = INDXSIZE(key); + if (sz >= db->md_env->me_head.mh_psize / MDB_MINKEYS) { + /* put on overflow page */ + /* not implemented */ + /* sz -= key->size - sizeof(pgno_t); */ + } + + return sz + sizeof(indx_t); +} + +static int +mdb_add_node(MDB_db *db, MDB_page *mp, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, uint8_t flags) +{ + unsigned int i; + size_t node_size = NODESIZE; + indx_t ofs; + MDB_node *node; + MDB_dpage *ofp = NULL; /* overflow page */ + + assert(mp->mp_upper >= mp->mp_lower); + + DPRINTF("add node [%.*s] to %s page %lu at index %i, key size %zu", + key ? (int)key->mv_size : 0, key ? (char *)key->mv_data : NULL, + IS_LEAF(mp) ? "leaf" : "branch", + mp->mp_pgno, indx, key ? key->mv_size : 0); + + if (key != NULL) + node_size += key->mv_size; + + if (IS_LEAF(mp)) { + assert(data); + node_size += data->mv_size; + if (F_ISSET(flags, F_BIGDATA)) { + /* Data already on overflow page. */ + node_size -= data->mv_size - sizeof(pgno_t); + } else if (data->mv_size >= db->md_env->me_head.mh_psize / MDB_MINKEYS) { + int ovpages = PAGEHDRSZ + data->mv_size + db->md_env->me_head.mh_psize - 1; + ovpages /= db->md_env->me_head.mh_psize; + /* Put data on overflow page. */ + DPRINTF("data size is %zu, put on overflow page", + data->mv_size); + node_size -= data->mv_size - sizeof(pgno_t); + if ((ofp = mdbenv_new_page(db->md_env, P_OVERFLOW, ovpages)) == NULL) + return MDB_FAIL; + DPRINTF("allocated overflow page %lu", ofp->p.mp_pgno); + flags |= F_BIGDATA; + } + } + + if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { + DPRINTF("not enough room in page %lu, got %lu ptrs", + mp->mp_pgno, NUMKEYS(mp)); + DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower, + mp->mp_upper - mp->mp_lower); + DPRINTF("node size = %zu", node_size); + return ENOSPC; + } + + /* Move higher pointers up one slot. */ + for (i = NUMKEYS(mp); i > indx; i--) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + + /* Adjust free space offsets. */ + ofs = mp->mp_upper - node_size; + assert(ofs >= mp->mp_lower + sizeof(indx_t)); + mp->mp_ptrs[indx] = ofs; + mp->mp_upper = ofs; + mp->mp_lower += sizeof(indx_t); + + /* Write the node data. */ + node = NODEPTR(mp, indx); + node->mn_ksize = (key == NULL) ? 0 : key->mv_size; + node->mn_flags = flags; + if (IS_LEAF(mp)) + node->mn_dsize = data->mv_size; + else + node->mn_pgno = pgno; + + if (key) + bcopy(key->mv_data, NODEKEY(node), key->mv_size); + + if (IS_LEAF(mp)) { + assert(key); + if (ofp == NULL) { + if (F_ISSET(flags, F_BIGDATA)) + bcopy(data->mv_data, node->mn_data + key->mv_size, + sizeof(pgno_t)); + else + bcopy(data->mv_data, node->mn_data + key->mv_size, + data->mv_size); + } else { + bcopy(&ofp->p.mp_pgno, node->mn_data + key->mv_size, + sizeof(pgno_t)); + bcopy(data->mv_data, METADATA(&ofp->p), data->mv_size); + } + } + + return MDB_SUCCESS; +} + +static void +mdb_del_node(MDB_db *db, MDB_page *mp, indx_t indx) +{ + unsigned int sz; + indx_t i, j, numkeys, ptr; + MDB_node *node; + char *base; + + DPRINTF("delete node %u on %s page %lu", indx, + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); + assert(indx < NUMKEYS(mp)); + + node = NODEPTR(mp, indx); + sz = NODESIZE + node->mn_ksize; + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += NODEDSZ(node); + } + + ptr = mp->mp_ptrs[indx]; + numkeys = NUMKEYS(mp); + for (i = j = 0; i < numkeys; i++) { + if (i != indx) { + mp->mp_ptrs[j] = mp->mp_ptrs[i]; + if (mp->mp_ptrs[i] < ptr) + mp->mp_ptrs[j] += sz; + j++; + } + } + + base = (char *)mp + mp->mp_upper; + bcopy(base, base + sz, ptr - mp->mp_upper); + + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += sz; +} + +int +mdb_cursor_open(MDB_db *db, MDB_txn *txn, MDB_cursor **ret) +{ + MDB_cursor *cursor; + + if (db == NULL || ret == NULL) + return EINVAL; + + if (txn != NULL && db->md_env != txn->mt_env) { + return EINVAL; + } + + if ((cursor = calloc(1, sizeof(*cursor))) != NULL) { + SLIST_INIT(&cursor->mc_stack); + cursor->mc_db = db; + cursor->mc_txn = txn; + } + + *ret = cursor; + + return MDB_SUCCESS; +} + +void +mdb_cursor_close(MDB_cursor *cursor) +{ + if (cursor != NULL) { + while (!CURSOR_EMPTY(cursor)) + cursor_pop_page(cursor); + +/* btree_close(cursor->bt); */ + free(cursor); + } +} + +static int +mdb_update_key(MDB_db *db, MDB_page *mp, indx_t indx, + MDB_val *key) +{ + indx_t ptr, i, numkeys; + int delta; + size_t len; + MDB_node *node; + char *base; + + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; + DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %lu", + indx, ptr, + (int)node->mn_ksize, (char *)NODEKEY(node), + (int)key->mv_size, (char *)key->mv_data, + mp->mp_pgno); + + delta = key->mv_size - node->mn_ksize; + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + DPRINTF("OUCH! Not enough room, delta = %d", delta); + return ENOSPC; + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } + + base = (char *)mp + mp->mp_upper; + len = ptr - mp->mp_upper + NODESIZE; + bcopy(base, base - delta, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); + node->mn_ksize = key->mv_size; + } + + bcopy(key->mv_data, NODEKEY(node), key->mv_size); + + return MDB_SUCCESS; +} + +/* Move a node from src to dst. + */ +static int +mdb_move_node(MDB_db *bt, MDB_pageparent *src, indx_t srcindx, + MDB_pageparent *dst, indx_t dstindx) +{ + int rc; + MDB_node *srcnode; + MDB_val key, data; + + srcnode = NODEPTR(src->mp_page, srcindx); + DPRINTF("moving %s node %u [%.*s] on page %lu to node %u on page %lu", + IS_LEAF(src->mp_page) ? "leaf" : "branch", + srcindx, + (int)srcnode->mn_ksize, (char *)NODEKEY(srcnode), + src->mp_page->mp_pgno, + dstindx, dst->mp_page->mp_pgno); + + /* Mark src and dst as dirty. */ + if ((rc = mdb_touch(bt->md_env->me_txn, src)) || + (rc = mdb_touch(bt->md_env->me_txn, dst))) + return rc;; + + /* Add the node to the destination page. + */ + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_add_node(bt, dst->mp_page, dstindx, &key, &data, NODEPGNO(srcnode), + srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + + /* Delete the node from the source page. + */ + mdb_del_node(bt, src->mp_page, srcindx); + + /* Update the parent separators. + */ + if (srcindx == 0 && src->mp_pi != 0) { + DPRINTF("update separator for source page %lu to [%.*s]", + src->mp_page->mp_pgno, (int)key.mv_size, (char *)key.mv_data); + if ((rc = mdb_update_key(bt, src->mp_parent, src->mp_pi, + &key)) != MDB_SUCCESS) + return rc; + } + + if (srcindx == 0 && IS_BRANCH(src->mp_page)) { + MDB_val nullkey; + nullkey.mv_size = 0; + assert(mdb_update_key(bt, src->mp_page, 0, &nullkey) == MDB_SUCCESS); + } + + if (dstindx == 0 && dst->mp_pi != 0) { + DPRINTF("update separator for destination page %lu to [%.*s]", + dst->mp_page->mp_pgno, (int)key.mv_size, (char *)key.mv_data); + if ((rc = mdb_update_key(bt, dst->mp_parent, dst->mp_pi, + &key)) != MDB_SUCCESS) + return rc; + } + + if (dstindx == 0 && IS_BRANCH(dst->mp_page)) { + MDB_val nullkey; + nullkey.mv_size = 0; + assert(mdb_update_key(bt, dst->mp_page, 0, &nullkey) == MDB_SUCCESS); + } + + return MDB_SUCCESS; +} + +static int +mdb_merge(MDB_db *bt, MDB_pageparent *src, MDB_pageparent *dst) +{ + int rc; + indx_t i; + MDB_node *srcnode; + MDB_val key, data; + MDB_pageparent mpp; + MDB_dhead *dh; + + DPRINTF("merging page %lu and %lu", src->mp_page->mp_pgno, dst->mp_page->mp_pgno); + + assert(src->mp_parent); /* can't merge root page */ + assert(dst->mp_parent); + assert(bt->md_env->me_txn != NULL); + + /* Mark src and dst as dirty. */ + if ((rc = mdb_touch(bt->md_env->me_txn, src)) || + (rc = mdb_touch(bt->md_env->me_txn, dst))) + return rc; + + /* Move all nodes from src to dst. + */ + for (i = 0; i < NUMKEYS(src->mp_page); i++) { + srcnode = NODEPTR(src->mp_page, i); + + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_add_node(bt, dst->mp_page, NUMKEYS(dst->mp_page), &key, + &data, NODEPGNO(srcnode), srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + } + + DPRINTF("dst page %lu now has %lu keys (%.1f%% filled)", + dst->mp_page->mp_pgno, NUMKEYS(dst->mp_page), (float)PAGEFILL(bt->md_env, dst->mp_page) / 10); + + /* Unlink the src page from parent. + */ + mdb_del_node(bt, src->mp_parent, src->mp_pi); + if (src->mp_pi == 0) { + key.mv_size = 0; + if ((rc = mdb_update_key(bt, src->mp_parent, 0, &key)) != MDB_SUCCESS) + return rc; + } + + if (IS_LEAF(src->mp_page)) + bt->md_env->me_meta.mm_stat.ms_leaf_pages--; + else + bt->md_env->me_meta.mm_stat.ms_branch_pages--; + + mpp.mp_page = src->mp_parent; + dh = (MDB_dhead *)src->mp_parent; + dh--; + mpp.mp_parent = dh->md_parent; + mpp.mp_pi = dh->md_pi; + + return mdb_rebalance(bt, &mpp); +} + +#define FILL_THRESHOLD 250 + +static int +mdb_rebalance(MDB_db *db, MDB_pageparent *mpp) +{ + MDB_node *node; + MDB_page *root; + MDB_pageparent npp; + indx_t si = 0, di = 0; + + assert(db != NULL); + assert(db->md_env->me_txn != NULL); + assert(mpp != NULL); + + DPRINTF("rebalancing %s page %lu (has %lu keys, %.1f%% full)", + IS_LEAF(mpp->mp_page) ? "leaf" : "branch", + mpp->mp_page->mp_pgno, NUMKEYS(mpp->mp_page), (float)PAGEFILL(db->md_env, mpp->mp_page) / 10); + + if (PAGEFILL(db->md_env, mpp->mp_page) >= FILL_THRESHOLD) { + DPRINTF("no need to rebalance page %lu, above fill threshold", + mpp->mp_page->mp_pgno); + return MDB_SUCCESS; + } + + if (mpp->mp_parent == NULL) { + if (NUMKEYS(mpp->mp_page) == 0) { + DPRINTF("tree is completely empty"); + db->md_env->me_txn->mt_root = P_INVALID; + db->md_env->me_meta.mm_stat.ms_depth--; + db->md_env->me_meta.mm_stat.ms_leaf_pages--; + } else if (IS_BRANCH(mpp->mp_page) && NUMKEYS(mpp->mp_page) == 1) { + DPRINTF("collapsing root page!"); + db->md_env->me_txn->mt_root = NODEPGNO(NODEPTR(mpp->mp_page, 0)); + if ((root = mdbenv_get_page(db->md_env, db->md_env->me_txn->mt_root)) == NULL) + return MDB_FAIL; + db->md_env->me_meta.mm_stat.ms_depth--; + db->md_env->me_meta.mm_stat.ms_branch_pages--; + } else + DPRINTF("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. + */ + assert(NUMKEYS(mpp->mp_parent) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. + */ + + /* Find neighbors. + */ + if (mpp->mp_pi == 0) { + /* We're the leftmost leaf in our parent. + */ + DPRINTF("reading right neighbor"); + node = NODEPTR(mpp->mp_parent, mpp->mp_pi + 1); + if ((npp.mp_page = mdbenv_get_page(db->md_env, NODEPGNO(node))) == NULL) + return MDB_FAIL; + npp.mp_pi = mpp->mp_pi + 1; + si = 0; + di = NUMKEYS(mpp->mp_page); + } else { + /* There is at least one neighbor to the left. + */ + DPRINTF("reading left neighbor"); + node = NODEPTR(mpp->mp_parent, mpp->mp_pi - 1); + if ((npp.mp_page = mdbenv_get_page(db->md_env, NODEPGNO(node))) == NULL) + return MDB_FAIL; + npp.mp_pi = mpp->mp_pi - 1; + si = NUMKEYS(npp.mp_page) - 1; + di = 0; + } + npp.mp_parent = mpp->mp_parent; + + DPRINTF("found neighbor page %lu (%lu keys, %.1f%% full)", + npp.mp_page->mp_pgno, NUMKEYS(npp.mp_page), (float)PAGEFILL(db->md_env, npp.mp_page) / 10); + + /* If the neighbor page is above threshold and has at least two + * keys, move one key from it. + * + * Otherwise we should try to merge them. + */ + if (PAGEFILL(db->md_env, npp.mp_page) >= FILL_THRESHOLD && NUMKEYS(npp.mp_page) >= 2) + return mdb_move_node(db, &npp, si, mpp, di); + else { /* FIXME: if (has_enough_room()) */ + if (mpp->mp_pi == 0) + return mdb_merge(db, &npp, mpp); + else + return mdb_merge(db, mpp, &npp); + } +} + +int +mdb_del(MDB_db *bt, MDB_txn *txn, + MDB_val *key, MDB_val *data) +{ + int rc, exact; + unsigned int ki; + MDB_node *leaf; + MDB_pageparent mpp; + + DPRINTF("========> delete key %.*s", (int)key->mv_size, (char *)key->mv_data); + + assert(key != NULL); + + if (bt == NULL || txn == NULL) + return EINVAL; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + return EINVAL; + } + + if (bt->md_env->me_txn != txn) { + return EINVAL; + } + + if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + return EINVAL; + } + + if ((rc = mdb_search_page(bt, txn, key, NULL, 1, &mpp)) != MDB_SUCCESS) + return rc; + + leaf = mdb_search_node(bt, mpp.mp_page, key, &exact, &ki); + if (leaf == NULL || !exact) { + return ENOENT; + } + + if (data && (rc = mdb_read_data(bt, NULL, leaf, data)) != MDB_SUCCESS) + return rc; + + mdb_del_node(bt, mpp.mp_page, ki); + bt->md_env->me_meta.mm_stat.ms_entries--; + rc = mdb_rebalance(bt, &mpp); + if (rc != MDB_SUCCESS) + txn->mt_flags |= MDB_TXN_ERROR; + + return rc; +} + +/* Split page <*mpp>, and insert in either left or + * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and + * *newindxp with the actual values after split, ie if *mpp and *newindxp + * refer to a node in the new right sibling page. + */ +static int +mdb_split(MDB_db *bt, MDB_page **mpp, unsigned int *newindxp, + MDB_val *newkey, MDB_val *newdata, pgno_t newpgno) +{ + uint8_t flags; + int rc = MDB_SUCCESS, ins_new = 0; + indx_t newindx; + pgno_t pgno = 0; + unsigned int i, j, split_indx; + MDB_node *node; + MDB_page *pright, *p; + MDB_val sepkey, rkey, rdata; + MDB_page *copy; + MDB_dpage *mdp, *rdp, *pdp; + MDB_dhead *dh; + + assert(bt != NULL); + assert(bt->md_env != NULL); + + dh = ((MDB_dhead *)*mpp) - 1; + mdp = (MDB_dpage *)dh; + newindx = *newindxp; + + DPRINTF("-----> splitting %s page %lu and adding [%.*s] at index %i", + IS_LEAF(&mdp->p) ? "leaf" : "branch", mdp->p.mp_pgno, + (int)newkey->mv_size, (char *)newkey->mv_data, *newindxp); + + if (mdp->h.md_parent == NULL) { + if ((pdp = mdbenv_new_page(bt->md_env, P_BRANCH, 1)) == NULL) + return MDB_FAIL; + mdp->h.md_pi = 0; + mdp->h.md_parent = &pdp->p; + bt->md_env->me_txn->mt_root = pdp->p.mp_pgno; + DPRINTF("root split! new root = %lu", pdp->p.mp_pgno); + bt->md_env->me_meta.mm_stat.ms_depth++; + + /* Add left (implicit) pointer. */ + if (mdb_add_node(bt, &pdp->p, 0, NULL, NULL, + mdp->p.mp_pgno, 0) != MDB_SUCCESS) + return MDB_FAIL; + } else { + DPRINTF("parent branch page is %lu", mdp->h.md_parent->mp_pgno); + } + + /* Create a right sibling. */ + if ((rdp = mdbenv_new_page(bt->md_env, mdp->p.mp_flags, 1)) == NULL) + return MDB_FAIL; + rdp->h.md_parent = mdp->h.md_parent; + rdp->h.md_pi = mdp->h.md_pi + 1; + DPRINTF("new right sibling: page %lu", rdp->p.mp_pgno); + + /* Move half of the keys to the right sibling. */ + if ((copy = malloc(bt->md_env->me_head.mh_psize)) == NULL) + return MDB_FAIL; + bcopy(&mdp->p, copy, bt->md_env->me_head.mh_psize); + bzero(&mdp->p.mp_ptrs, bt->md_env->me_head.mh_psize - PAGEHDRSZ); + mdp->p.mp_lower = PAGEHDRSZ; + mdp->p.mp_upper = bt->md_env->me_head.mh_psize; + + split_indx = NUMKEYS(copy) / 2 + 1; + + /* First find the separating key between the split pages. + */ + bzero(&sepkey, sizeof(sepkey)); + if (newindx == split_indx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = NODEPTR(copy, split_indx); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + + DPRINTF("separator is [%.*s]", (int)sepkey.mv_size, (char *)sepkey.mv_data); + + /* Copy separator key to the parent. + */ + if (SIZELEFT(rdp->h.md_parent) < mdb_branch_size(bt, &sepkey)) { + rc = mdb_split(bt, &rdp->h.md_parent, &rdp->h.md_pi, + &sepkey, NULL, rdp->p.mp_pgno); + + /* Right page might now have changed parent. + * Check if left page also changed parent. + */ + if (rdp->h.md_parent != mdp->h.md_parent && + mdp->h.md_pi >= NUMKEYS(mdp->h.md_parent)) { + mdp->h.md_parent = rdp->h.md_parent; + mdp->h.md_pi = rdp->h.md_pi - 1; + } + } else { + rc = mdb_add_node(bt, rdp->h.md_parent, rdp->h.md_pi, + &sepkey, NULL, rdp->p.mp_pgno, 0); + } + if (rc != MDB_SUCCESS) { + free(copy); + return MDB_FAIL; + } + + for (i = j = 0; i <= NUMKEYS(copy); j++) { + if (i < split_indx) { + /* Re-insert in left sibling. */ + pdp = mdp; + } else { + /* Insert in right sibling. */ + if (i == split_indx) + /* Reset insert index for right sibling. */ + j = (i == newindx && ins_new); + pdp = rdp; + } + + if (i == newindx && !ins_new) { + /* Insert the original entry that caused the split. */ + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(&mdp->p)) { + rdata.mv_data = newdata->mv_data; + rdata.mv_size = newdata->mv_size; + } else + pgno = newpgno; + flags = 0; + + ins_new = 1; + + /* Update page and index for the new key. */ + *newindxp = j; + *mpp = &pdp->p; + } else if (i == NUMKEYS(copy)) { + break; + } else { + node = NODEPTR(copy, i); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(&mdp->p)) { + rdata.mv_data = NODEDATA(node); + rdata.mv_size = node->mn_dsize; + } else + pgno = node->mn_pgno; + flags = node->mn_flags; + + i++; + } + + if (!IS_LEAF(&mdp->p) && j == 0) { + /* First branch index doesn't need key data. */ + rkey.mv_size = 0; + } + + rc = mdb_add_node(bt, &pdp->p, j, &rkey, &rdata, pgno,flags); + } + + free(copy); + return rc; +} + +int +mdb_put(MDB_db *bt, MDB_txn *txn, + MDB_val *key, MDB_val *data, unsigned int flags) +{ + int rc = MDB_SUCCESS, exact; + unsigned int ki; + MDB_node *leaf; + MDB_pageparent mpp; + + assert(key != NULL); + assert(data != NULL); + + if (bt == NULL || txn == NULL) + return EINVAL; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + return EINVAL; + } + + if (bt->md_env->me_txn != txn) { + return EINVAL; + } + + if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + return EINVAL; + } + + DPRINTF("==> put key %.*s, size %zu, data size %zu", + (int)key->mv_size, (char *)key->mv_data, key->mv_size, data->mv_size); + + rc = mdb_search_page(bt, txn, key, NULL, 1, &mpp); + if (rc == MDB_SUCCESS) { + leaf = mdb_search_node(bt, mpp.mp_page, key, &exact, &ki); + if (leaf && exact) { + if (F_ISSET(flags, MDB_NOOVERWRITE)) { + DPRINTF("duplicate key %.*s", + (int)key->mv_size, (char *)key->mv_data); + return EEXIST; + } + mdb_del_node(bt, mpp.mp_page, ki); + } + if (leaf == NULL) { /* append if not found */ + ki = NUMKEYS(mpp.mp_page); + DPRINTF("appending key at index %i", ki); + } + } else if (rc == ENOENT) { + MDB_dpage *dp; + /* new file, just write a root leaf page */ + DPRINTF("allocating new root leaf page"); + if ((dp = mdbenv_new_page(bt->md_env, P_LEAF, 1)) == NULL) { + return ENOMEM; + } + mpp.mp_page = &dp->p; + txn->mt_root = mpp.mp_page->mp_pgno; + bt->md_env->me_meta.mm_stat.ms_depth++; + ki = 0; + } + else + goto done; + + assert(IS_LEAF(mpp.mp_page)); + DPRINTF("there are %lu keys, should insert new key at index %i", + NUMKEYS(mpp.mp_page), ki); + + if (SIZELEFT(mpp.mp_page) < mdb_leaf_size(bt, key, data)) { + rc = mdb_split(bt, &mpp.mp_page, &ki, key, data, P_INVALID); + } else { + /* There is room already in this leaf page. */ + rc = mdb_add_node(bt, mpp.mp_page, ki, key, data, 0, 0); + } + + if (rc != MDB_SUCCESS) + txn->mt_flags |= MDB_TXN_ERROR; + else + bt->md_env->me_meta.mm_stat.ms_entries++; + +done: + return rc; +} + +static pgno_t +mdbenv_compact_tree(MDB_env *env, pgno_t pgno, MDB_env *envc) +{ + ssize_t rc; + indx_t i; + pgno_t *pnext, next; + MDB_node *node; + MDB_page *p; + MDB_page *mp; + + /* Get the page and make a copy of it. + */ + if ((mp = mdbenv_get_page(env, pgno)) == NULL) + return P_INVALID; + if ((p = malloc(env->me_head.mh_psize)) == NULL) + return P_INVALID; + bcopy(mp, p, env->me_head.mh_psize); + + /* Go through all nodes in the (copied) page and update the + * page pointers. + */ + if (F_ISSET(p->mp_flags, P_BRANCH)) { + for (i = 0; i < NUMKEYS(p); i++) { + node = NODEPTR(p, i); + node->mn_pgno = mdbenv_compact_tree(env, node->mn_pgno, envc); + if (node->mn_pgno == P_INVALID) { + free(p); + return P_INVALID; + } + } + } else if (F_ISSET(p->mp_flags, P_LEAF)) { + for (i = 0; i < NUMKEYS(p); i++) { + node = NODEPTR(p, i); + if (F_ISSET(node->mn_flags, F_BIGDATA)) { + bcopy(NODEDATA(node), &next, sizeof(next)); + next = mdbenv_compact_tree(env, next, envc); + if (next == P_INVALID) { + free(p); + return P_INVALID; + } + bcopy(&next, NODEDATA(node), sizeof(next)); + } + } + } else if (F_ISSET(p->mp_flags, P_OVERFLOW)) { + ; /* handled below */ + } else + assert(0); + + pgno = p->mp_pgno = envc->me_txn->mt_next_pgno++; + rc = write(envc->me_fd, p, env->me_head.mh_psize); + free(p); + if (rc != (ssize_t)env->me_head.mh_psize) + return P_INVALID; + + if (F_ISSET(mp->mp_flags, P_OVERFLOW)) { + if (mp->mp_pages > 1) { + size_t len = (mp->mp_pages-1) * env->me_head.mh_psize; + envc->me_txn->mt_next_pgno += mp->mp_pages-1; + rc = write(envc->me_fd, mp+1, len); + if (rc != len) + return P_INVALID; + } + } + return pgno; +} + +int +mdbenv_compact(MDB_env *env) +{ + char *compact_path = NULL; + MDB_env *envc; + MDB_txn *txn, *txnc = NULL; + int fd, rc; + pgno_t root; + + assert(env != NULL); + + if (env->me_path == NULL) { + return EINVAL; + } + + DPRINTF("compacting mdbenv %p with path %s", env, env->me_path); + + rc = mdb_txn_begin(env, 0, &txn); + if (rc) return rc; + + asprintf(&compact_path, "%s.compact.XXXXXX", env->me_path); + fd = mkstemp(compact_path); + if (fd == -1) { + rc = errno; + free(compact_path); + mdb_txn_abort(txn); + return rc; + } + + rc = mdbenv_create(&envc, env->me_mapsize); + if (rc) goto failed; + + envc->me_fd = fd; + rc = mdbenv_open2(envc, env->me_flags); + if (rc) goto failed; + + bcopy(&env->me_meta, &envc->me_meta, sizeof(env->me_meta)); + envc->me_meta.mm_stat.ms_revisions = 0; + + rc = mdb_txn_begin(envc, 0, &txnc); + if (rc) goto failed; + + if (env->me_meta.mm_root != P_INVALID) { + root = mdbenv_compact_tree(env, env->me_meta.mm_root, envc); + if (root == P_INVALID) + goto failed; + if ((rc = mdbenv_write_meta(envc, root, 0)) != MDB_SUCCESS) + goto failed; + } + + fsync(fd); + + DPRINTF("renaming %s to %s", compact_path, env->me_path); + if (rename(compact_path, env->me_path) != 0) { + rc = errno; + goto failed; + } + + /* Write a "tombstone" meta page so other processes can pick up + * the change and re-open the file. + */ + if (mdbenv_write_meta(env, P_INVALID, MDB_TOMBSTONE) != MDB_SUCCESS) + goto failed; + + mdb_txn_abort(txn); + mdb_txn_abort(txnc); + free(compact_path); + mdbenv_close(envc); + return 0; + +failed: + mdb_txn_abort(txn); + mdb_txn_abort(txnc); + unlink(compact_path); + free(compact_path); + mdbenv_close(envc); + return rc; +} + +int +mdbenv_get_flags(MDB_env *env, unsigned int *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = (env->me_flags & ~MDB_FIXPADDING); + return MDB_SUCCESS; +} + +int +mdbenv_get_path(MDB_env *env, const char **arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_path; + return MDB_SUCCESS; +} + +int +mdbenv_stat(MDB_env *env, MDB_stat **arg) +{ + if (env == NULL || arg == NULL) + return EINVAL; + + *arg = &env->me_meta.mm_stat; + + return MDB_SUCCESS; +} + +int mdb_open(MDB_env *env, MDB_txn *txn, const char *name, unsigned int flags, MDB_db **db) +{ + if (!name) { + *db = (MDB_db *)&env->me_db; + return MDB_SUCCESS; + } + return EINVAL; +} + +int mdb_stat(MDB_db *db, MDB_stat **arg) +{ + if (db == NULL || arg == NULL) + return EINVAL; + + *arg = &db->md_stat; + + return MDB_SUCCESS; +} + +void mdb_close(MDB_db *db) +{ +} diff --git a/libraries/libmdb/mdb.h b/libraries/libmdb/mdb.h new file mode 100644 index 0000000000..b401bebaa1 --- /dev/null +++ b/libraries/libmdb/mdb.h @@ -0,0 +1,87 @@ +#ifndef _MDB_H_ +#define _MDB_H_ + +#include + +struct MDB_cursor; +struct MDB_txn; +struct MDB_db; +struct MDB_env; + +typedef struct MDB_cursor MDB_cursor; +typedef struct MDB_txn MDB_txn; +typedef struct MDB_db MDB_db; +typedef struct MDB_env MDB_env; + +typedef struct MDB_val { + void *mv_data; + size_t mv_size; +} MDB_val; + +typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); +typedef void (MDB_rel_func)(void *ptr, void *oldptr); + +#define MDB_NOOVERWRITE 1 + +typedef enum MDB_cursor_op { /* cursor operations */ + MDB_CURSOR, /* position at given key */ + MDB_CURSOR_EXACT, /* position at key, or fail */ + MDB_FIRST, + MDB_NEXT, + MDB_LAST, /* not implemented */ + MDB_PREV /* not implemented */ +} MDB_cursor_op; + +/* return codes */ +#define MDB_FAIL -1 +#define MDB_SUCCESS 0 + +/* DB flags */ +#define MDB_REVERSEKEY 0x02 /* use reverse string keys */ +#define MDB_NOSYNC 0x10000 /* don't fsync after commit */ +#define MDB_RDONLY 0x20000 /* read only */ + +/* environment flags */ +#define MDB_FIXEDMAP 0x01 /* mmap at a fixed address */ + +typedef struct MDB_stat { + time_t ms_created_at; + unsigned int ms_psize; + unsigned int ms_depth; + unsigned long ms_branch_pages; + unsigned long ms_leaf_pages; + unsigned long ms_overflow_pages; + unsigned long ms_revisions; + unsigned long ms_entries; +} MDB_stat; + +int mdbenv_create(MDB_env **env, size_t size); +int mdbenv_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode); +int mdbenv_stat(MDB_env *env, MDB_stat **stat); +void mdbenv_close(MDB_env *env); +int mdbenv_get_flags(MDB_env *env, unsigned int *flags); +int mdbenv_get_path(MDB_env *env, const char **path); +int mdbenv_sync(MDB_env *env); +int mdbenv_compact(MDB_env *env); + +int mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **txn); +int mdb_txn_commit(MDB_txn *txn); +void mdb_txn_abort(MDB_txn *txn); + +int mdb_open(MDB_env *env, MDB_txn *txn, const char *name, unsigned int flags, MDB_db **db); +int mdb_stat(MDB_db *db, MDB_stat **stat); +void mdb_close(MDB_db *db); + +int mdb_get(MDB_db *db, MDB_txn *txn, MDB_val *key, MDB_val *data); +int mdb_put(MDB_db *db, MDB_txn *txn, MDB_val *key, MDB_val *data, + unsigned int flags); +int mdb_del(MDB_db *db, MDB_txn *txn, MDB_val *key, MDB_val *data); + +int mdb_cursor_open(MDB_db *db, MDB_txn *txn, MDB_cursor **cursor); +void mdb_cursor_close(MDB_cursor *cursor); +int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); + +int mdb_cmp(MDB_db *db, const MDB_val *a, const MDB_val *b); + +#endif /* _MDB_H_ */ diff --git a/libraries/libmdb/mtest.c b/libraries/libmdb/mtest.c new file mode 100644 index 0000000000..bc3df268c9 --- /dev/null +++ b/libraries/libmdb/mtest.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include "mdb.h" + +int main(int argc,char * argv[]) +{ + int i = 0, rc; + MDB_env *env; + MDB_db *db; + MDB_val key, data; + MDB_txn *txn; + MDB_stat *mst; + int count; + int *values; + char sval[32]; + + srandom(time(NULL)); + + count = random()%512; + values = (int *)malloc(count*sizeof(int)); + + for(i = 0;i -1; i-= (random()%5)) { + rc = mdb_txn_begin(env, 0, &txn); + sprintf(sval, "%03x ", values[i]); + rc = mdb_del(db, txn, &key, NULL); + rc = mdb_txn_commit(txn); + } + free(values); + rc = mdbenv_stat(env, &mst); + mdb_close(db); + mdbenv_close(env); + + return 0; +}