* BerkeleyDB API, but much simplified.
*/
/*
- * Copyright 2011 Howard Chu, Symas Corp.
+ * Copyright 2011-2012 Howard Chu, Symas Corp.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
#include <time.h>
#include <unistd.h>
+#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
+#include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
+#endif
+
#ifndef _WIN32
#include <pthread.h>
#ifdef __APPLE__
#endif
#endif
+#ifdef USE_VALGRIND
+#include <valgrind/memcheck.h>
+#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z)
+#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
+#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
+#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
+#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
+#else
+#define VGMEMP_CREATE(h,r,z)
+#define VGMEMP_ALLOC(h,a,s)
+#define VGMEMP_FREE(h,a)
+#define VGMEMP_DESTROY(h)
+#define VGMEMP_DEFINED(a,s)
+#endif
+
#ifndef BYTE_ORDER
# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
/* Solaris just defines one or the other */
#ifndef MDB_DSYNC
# define MDB_DSYNC O_DSYNC
#endif
+#endif
+
+/** Function for flushing the data of a file. Define this to fsync
+ * if fdatasync() is not supported.
+ */
+#ifndef MDB_FDATASYNC
+# define MDB_FDATASYNC fdatasync
#endif
/** A page number in the database.
/** @defgroup debug Debug Macros
* @{
*/
-#ifndef DEBUG
+#ifndef MDB_DEBUG
/** Enable debug output.
* Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
* read from and written to the database (used for free space management).
*/
-#define DEBUG 0
+#define MDB_DEBUG 0
#endif
#if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
# define DPRINTF (void) /* Vararg macros may be unsupported */
-#elif DEBUG
+#elif MDB_DEBUG
/** Print a debug message with printf formatting. */
# define DPRINTF(fmt, ...) /**< Requires 2 or more args */ \
fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__)
*/
#define MAXKEYSIZE 511
-#if DEBUG
+#if MDB_DEBUG
/** A key buffer.
* @ingroup debug
* This is used for printing a hex dump of a key's contents.
#endif
/** @defgroup lazylock Lazy Locking
- * Macros for locks that are't actually needed.
+ * Macros for locks that aren't actually needed.
* The DB view is always consistent because all writes are wrapped in
* the wmutex. Finer-grained locks aren't necessary.
* @{
* unlikely. If a collision occurs, the results are unpredictable.
*/
typedef struct MDB_txbody {
- /** Stamp identifying this as an MDB lock file. It must be set
+ /** Stamp identifying this as an MDB file. It must be set
* to #MDB_MAGIC. */
uint32_t mtb_magic;
/** Version number of this lock file. Must be set to #MDB_VERSION. */
/** Meta page content. */
typedef struct MDB_meta {
- /** Stamp identifying this as an MDB data file. It must be set
+ /** Stamp identifying this as an MDB file. It must be set
* to #MDB_MAGIC. */
uint32_t mm_magic;
/** Version number of this lock file. Must be set to #MDB_VERSION. */
/** The @ref mt_dbflag for this database */
unsigned char *mc_dbflag;
unsigned short mc_snum; /**< number of pushed pages */
- unsigned short mc_top; /**< index of top page, mc_snum-1 */
+ unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
/** @defgroup mdb_cursor Cursor Flags
* @ingroup internal
* Cursor state flags.
};
/** max number of pages to commit in one writev() call */
#define MDB_COMMIT_PAGES 64
+#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
+#undef MDB_COMMIT_PAGES
+#define MDB_COMMIT_PAGES IOV_MAX
+#endif
static MDB_page *mdb_page_alloc(MDB_cursor *mc, int num);
static MDB_page *mdb_page_new(MDB_cursor *mc, uint32_t flags, int num);
return strerror(err);
}
-#if DEBUG
+#if MDB_DEBUG
/** Display a key in hexadecimal and return the address of the result.
* @param[in] key the key to display
* @param[in] buf the buffer to write into. Should always be #DKBUF.
* printable characters, print it as-is instead of converting to hex.
*/
#if 1
+ buf[0] = '\0';
for (i=0; i<key->mv_size; i++)
ptr += sprintf(ptr, "%02x", *c++);
#else
#endif
return buf;
}
+
+/** Display all the keys in the page. */
+static void
+mdb_page_keys(MDB_page *mp)
+{
+ MDB_node *node;
+ unsigned int i, nkeys;
+ MDB_val key;
+ DKBUF;
+
+ nkeys = NUMKEYS(mp);
+ DPRINTF("numkeys %d", nkeys);
+ for (i=0; i<nkeys; i++) {
+ node = NODEPTR(mp, i);
+ key.mv_size = node->mn_ksize;
+ key.mv_data = node->mn_data;
+ DPRINTF("key %d: %s", i, DKEY(&key));
+ }
+}
#endif
int
static MDB_page *
mdb_page_malloc(MDB_cursor *mc) {
MDB_page *ret;
+ size_t sz = mc->mc_txn->mt_env->me_psize;
if (mc->mc_txn->mt_env->me_dpages) {
ret = mc->mc_txn->mt_env->me_dpages;
+ VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
+ VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
mc->mc_txn->mt_env->me_dpages = ret->mp_next;
} else {
- ret = malloc(mc->mc_txn->mt_env->me_psize);
+ ret = malloc(sz);
+ VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
}
return ret;
}
txn->mt_env->me_pghead = mop;
memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));
-#if DEBUG > 1
+#if MDB_DEBUG > 1
{
unsigned int i;
DPRINTF("IDL read txn %zu root %zu num %zu",
}
if (txn->mt_env->me_dpages && num == 1) {
np = txn->mt_env->me_dpages;
+ VGMEMP_ALLOC(txn->mt_env, np, txn->mt_env->me_psize);
+ VGMEMP_DEFINED(np, sizeof(np->mp_next));
txn->mt_env->me_dpages = np->mp_next;
} else {
- if ((np = malloc(txn->mt_env->me_psize * num )) == NULL)
+ size_t sz = txn->mt_env->me_psize * num;
+ if ((np = malloc(sz)) == NULL)
return NULL;
+ VGMEMP_ALLOC(txn->mt_env, np, sz);
}
if (pgno == P_INVALID) {
np->mp_pgno = txn->mt_next_pgno;
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
if (m2 == mc) continue;
m3 = &m2->mc_xcursor->mx_cursor;
+ if (m3->mc_snum < mc->mc_snum) continue;
if (m3->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
m3->mc_pg[mc->mc_top] = mp;
}
MDB_cursor *m2;
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
- if (m2 == mc) continue;
+ if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
m2->mc_pg[mc->mc_top] = mp;
}
{
int rc = 0;
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
- if (fdatasync(env->me_fd))
+ if (MDB_FDATASYNC(env->me_fd))
rc = ErrCode();
}
return rc;
dp = txn->mt_u.dirty_list[i].mptr;
if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
dp->mp_next = txn->mt_env->me_dpages;
+ VGMEMP_FREE(txn->mt_env, dp);
txn->mt_env->me_dpages = dp;
} else {
/* large pages just get freed directly */
+ VGMEMP_FREE(txn->mt_env, dp);
free(dp);
}
}
}
x = dst[0].mid;
for (; y<=src[0].mid; y++) {
- if (++x >= MDB_IDL_UM_MAX)
+ if (++x >= MDB_IDL_UM_MAX) {
+ mdb_txn_abort(txn);
return ENOMEM;
+ }
dst[x] = src[y];
}
dst[0].mid = x;
mdb_page_search(&mc, &key, 1);
mdb_midl_sort(txn->mt_free_pgs);
-#if DEBUG > 1
+#if MDB_DEBUG > 1
{
unsigned int i;
ID *idl = txn->mt_free_pgs;
dp = txn->mt_u.dirty_list[i].mptr;
if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
dp->mp_next = txn->mt_env->me_dpages;
+ VGMEMP_FREE(txn->mt_env, dp);
txn->mt_env->me_dpages = dp;
} else {
+ VGMEMP_FREE(txn->mt_env, dp);
free(dp);
}
txn->mt_u.dirty_list[i].mid = 0;
e->me_fd = INVALID_HANDLE_VALUE;
e->me_lfd = INVALID_HANDLE_VALUE;
e->me_mfd = INVALID_HANDLE_VALUE;
+ VGMEMP_CREATE(e,0,0);
*env = e;
return MDB_SUCCESS;
}
if (env == NULL)
return;
+ VGMEMP_DESTROY(env);
while (env->me_dpages) {
dp = env->me_dpages;
+ VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
env->me_dpages = dp->mp_next;
free(dp);
}
nkeys = NUMKEYS(mp);
+#if MDB_DEBUG
+ {
+ pgno_t pgno;
+ COPY_PGNO(pgno, mp->mp_pgno);
DPRINTF("searching %u keys in %s %spage %zu",
nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
- mp->mp_pgno);
+ pgno);
+ }
+#endif
assert(nkeys > 0);
nodekey.mv_data = NODEKEY(node);
rc = cmp(key, &nodekey);
-#if DEBUG
+#if MDB_DEBUG
if (IS_LEAF(mp))
DPRINTF("found leaf index %u [%s], rc = %i",
i, DKEY(&nodekey), rc);
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
- if (m3 == mc) continue;
+ if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
m3->mc_ki[i]++;
}
MDB_page *mp = mc->mc_pg[i];
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
- if (m2 == mc) continue;
+ if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
mdb_xcursor_init1(m2, leaf);
}
size_t sz;
sz = LEAFSIZE(key, data);
- if (data->mv_size >= env->me_psize / MDB_MINKEYS) {
+ if (sz >= env->me_psize / MDB_MINKEYS) {
/* put on overflow page */
sz -= data->mv_size - sizeof(pgno_t);
}
if (F_ISSET(flags, F_BIGDATA)) {
/* Data already on overflow page. */
node_size += sizeof(pgno_t);
- } else if (data->mv_size >= mc->mc_txn->mt_env->me_psize / MDB_MINKEYS) {
+ } else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_psize / MDB_MINKEYS) {
int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
/* Put data on overflow page. */
- DPRINTF("data size is %zu, put on overflow page",
- data->mv_size);
+ DPRINTF("data size is %zu, node would be %zu, put data on overflow page",
+ data->mv_size, node_size+data->mv_size);
node_size += sizeof(pgno_t);
if ((ofp = mdb_page_new(mc, P_OVERFLOW, ovpages)) == NULL)
return ENOMEM;
MDB_node *node;
char *base;
+#if MDB_DEBUG
+ {
+ pgno_t pgno;
+ COPY_PGNO(pgno, mp->mp_pgno);
DPRINTF("delete node %u on %s page %zu", indx,
- IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno);
+ IS_LEAF(mp) ? "leaf" : "branch", pgno);
+ }
+#endif
assert(indx < NUMKEYS(mp));
if (IS_LEAF2(mp)) {
MDB_xcursor *mx = mc->mc_xcursor;
if (node->mn_flags & F_SUBDATA) {
- MDB_db *db = NODEDATA(node);
- mx->mx_db = *db;
+ memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
mx->mx_cursor.mc_snum = 0;
mx->mx_cursor.mc_flags = C_SUB;
} else {
mc->mc_dbx = &txn->mt_dbxs[dbi];
mc->mc_dbflag = &txn->mt_dbflags[dbi];
mc->mc_snum = 0;
+ mc->mc_top = 0;
mc->mc_flags = 0;
if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
assert(mx != NULL);
node = NODEPTR(mp, indx);
ptr = mp->mp_ptrs[indx];
- DPRINTF("update key %u (ofs %u) [%.*s] to [%s] on page %zu",
- indx, ptr,
- (int)node->mn_ksize, (char *)NODEKEY(node),
- DKEY(key),
- mp->mp_pgno);
+#if MDB_DEBUG
+ {
+ MDB_val k2;
+ char kbuf2[(MAXKEYSIZE*2+1)];
+ k2.mv_data = NODEKEY(node);
+ k2.mv_size = node->mn_ksize;
+ DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu",
+ indx, ptr,
+ mdb_dkey(&k2, kbuf2),
+ DKEY(key),
+ mp->mp_pgno);
+ }
+#endif
delta = key->mv_size - node->mn_ksize;
if (delta) {
node->mn_ksize = key->mv_size;
}
- memcpy(NODEKEY(node), key->mv_data, key->mv_size);
+ if (key->mv_size)
+ memcpy(NODEKEY(node), key->mv_data, key->mv_size);
return MDB_SUCCESS;
}
int rc;
MDB_node *srcnode;
MDB_val key, data;
+ pgno_t srcpg;
+ unsigned short flags;
+
DKBUF;
/* Mark src and dst as dirty. */
key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
data.mv_size = 0;
data.mv_data = NULL;
+ srcpg = 0;
+ flags = 0;
} else {
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
+ assert(!((long)srcnode&1));
+ srcpg = NODEPGNO(srcnode);
+ flags = srcnode->mn_flags;
if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
unsigned int snum = csrc->mc_snum;
MDB_node *s2;
data.mv_size = NODEDSZ(srcnode);
data.mv_data = NODEDATA(srcnode);
}
+ if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
+ unsigned int snum = cdst->mc_snum;
+ MDB_node *s2;
+ MDB_val bkey;
+ /* must find the lowest key below dst */
+ mdb_page_search_root(cdst, NULL, 0);
+ s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
+ bkey.mv_size = NODEKSZ(s2);
+ bkey.mv_data = NODEKEY(s2);
+ cdst->mc_snum = snum--;
+ cdst->mc_top = snum;
+ rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &bkey);
+ }
+
DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu",
IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
csrc->mc_ki[csrc->mc_top],
/* Add the node to the destination page.
*/
- rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, NODEPGNO(srcnode),
- srcnode->mn_flags);
+ rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
if (rc != MDB_SUCCESS)
return rc;
} else {
for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
+ if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
+ unsigned int snum = csrc->mc_snum;
+ MDB_node *s2;
+ /* must find the lowest key below src */
+ mdb_page_search_root(csrc, NULL, 0);
+ s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
+ key.mv_size = NODEKSZ(s2);
+ key.mv_data = NODEKEY(s2);
+ csrc->mc_snum = snum--;
+ csrc->mc_top = snum;
+ } else {
+ key.mv_size = srcnode->mn_ksize;
+ key.mv_data = NODEKEY(srcnode);
+ }
- key.mv_size = srcnode->mn_ksize;
- key.mv_data = NODEKEY(srcnode);
data.mv_size = NODEDSZ(srcnode);
data.mv_data = NODEDATA(srcnode);
rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
dbi--;
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
- if (m2 == csrc) continue;
if (csrc->mc_flags & C_SUB)
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
+ if (m3 == csrc) continue;
+ if (m3->mc_snum < csrc->mc_snum) continue;
if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
m3->mc_pg[csrc->mc_top] = mp;
m3->mc_ki[csrc->mc_top] += nkeys;
unsigned int ptop;
MDB_cursor mn;
+#if MDB_DEBUG
+ {
+ pgno_t pgno;
+ COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
DPRINTF("rebalancing %s page %zu (has %u keys, %.1f%% full)",
IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
- mc->mc_pg[mc->mc_top]->mp_pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10);
+ pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10);
+ }
+#endif
if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD) {
+#if MDB_DEBUG
+ pgno_t pgno;
+ COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
DPRINTF("no need to rebalance page %zu, above fill threshold",
- mc->mc_pg[mc->mc_top]->mp_pgno);
+ pgno);
+#endif
return MDB_SUCCESS;
}
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
+ if (m3->mc_snum < mc->mc_snum) continue;
if (m3->mc_pg[0] == mp) {
m3->mc_snum = 0;
m3->mc_top = 0;
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
+ if (m3->mc_snum < mc->mc_snum) continue;
if (m3->mc_pg[0] == mp) {
m3->mc_pg[0] = mc->mc_pg[0];
}
ins_new = 1;
/* Update page and index for the new key. */
+ if (!newindx)
+ mc->mc_pg[mc->mc_top] = copy;
mc->mc_ki[mc->mc_top] = j;
} else if (i == nkeys) {
break;
mc->mc_txn->mt_env->me_psize - copy->mp_upper);
/* reset back to original page */
- if (newindx < split_indx) {
+ if (!newindx || (newindx < split_indx)) {
mc->mc_pg[mc->mc_top] = mp;
if (nflags & MDB_RESERVE) {
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
/* return tmp page to freelist */
copy->mp_next = mc->mc_txn->mt_env->me_dpages;
+ VGMEMP_FREE(mc->mc_txn->mt_env, copy);
mc->mc_txn->mt_env->me_dpages = copy;
done:
{