#include <time.h>
#include <unistd.h>
+#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
+#include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
+#endif
+
#ifndef _WIN32
#include <pthread.h>
#ifdef __APPLE__
#if defined(_WIN32) || defined(__APPLE__)
#define MNAME_LEN 32
+#else
+#define MNAME_LEN (sizeof(pthread_mutex_t))
#endif
/** @} */
#endif
/** @defgroup lazylock Lazy Locking
- * Macros for locks that are't actually needed.
+ * Macros for locks that aren't actually needed.
* The DB view is always consistent because all writes are wrapped in
* the wmutex. Finer-grained locks aren't necessary.
* @{
* unlikely. If a collision occurs, the results are unpredictable.
*/
typedef struct MDB_txbody {
- /** Stamp identifying this as an MDB lock file. It must be set
+ /** Stamp identifying this as an MDB file. It must be set
* to #MDB_MAGIC. */
uint32_t mtb_magic;
/** Version number of this lock file. Must be set to #MDB_VERSION. */
pthread_mutex_t mt2_wmutex;
#define mti_wmutex mt2.mt2_wmutex
#endif
- char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)];
+ char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
} mt2;
MDB_reader mti_readers[1];
} MDB_txninfo;
/** Meta page content. */
typedef struct MDB_meta {
- /** Stamp identifying this as an MDB data file. It must be set
+ /** Stamp identifying this as an MDB file. It must be set
* to #MDB_MAGIC. */
uint32_t mm_magic;
/** Version number of this lock file. Must be set to #MDB_VERSION. */
txnid_t mm_txnid; /**< txnid that committed this page */
} MDB_meta;
+ /** Buffer for a stack-allocated dirty page.
+ * The members define size and alignment, and silence type
+ * aliasing warnings. They are not used directly; that could
+ * mean incorrectly using several union members in parallel.
+ *
+ * Callers declare one of these as a local and cast its address
+ * to the page type they need, e.g. (MDB_page *)&pbuf.
+ */
+typedef union MDB_pagebuf {
+ char mb_raw[MDB_PAGESIZE]; /**< makes the union at least MDB_PAGESIZE bytes */
+ MDB_page mb_page; /**< gives the union the alignment of an MDB_page */
+ struct {
+ char mm_pad[PAGEHDRSZ]; /**< PAGEHDRSZ bytes placed ahead of mm_meta */
+ MDB_meta mm_meta; /**< an MDB_meta located PAGEHDRSZ bytes (plus any alignment padding) into the buffer */
+ } mb_metabuf; /**< NOTE(review): presumably mirrors a meta page (page header, then MDB_meta) - confirm */
+} MDB_pagebuf;
+
/** Auxiliary DB info.
* The information here is mostly static/read-only. There is
* only a single copy of this record in the environment.
/** The @ref mt_dbflag for this database */
unsigned char *mc_dbflag;
unsigned short mc_snum; /**< number of pushed pages */
- unsigned short mc_top; /**< index of top page, mc_snum-1 */
+ unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
/** @defgroup mdb_cursor Cursor Flags
* @ingroup internal
* Cursor state flags.
};
/** max number of pages to commit in one writev() call */
#define MDB_COMMIT_PAGES 64
+#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES /* cap the writev() batch at IOV_MAX */
+#undef MDB_COMMIT_PAGES
+#define MDB_COMMIT_PAGES IOV_MAX
+#endif /* when IOV_MAX is undefined or not smaller, the default above is kept */
static MDB_page *mdb_page_alloc(MDB_cursor *mc, int num);
static MDB_page *mdb_page_new(MDB_cursor *mc, uint32_t flags, int num);
* printable characters, print it as-is instead of converting to hex.
*/
#if 1
+ buf[0] = '\0';
for (i=0; i<key->mv_size; i++)
ptr += sprintf(ptr, "%02x", *c++);
#else
#endif
return buf;
}
+
+/** Display all the keys in the page.
+ * Diagnostic helper: prints the number of keys on the page, then one
+ * DPRINTF line per node giving the node's index and its key as
+ * formatted by DKEY().
+ * @param[in] mp The page whose node keys are printed.
+ */
+static void
+mdb_page_keys(MDB_page *mp)
+{
+	MDB_node *node;
+	unsigned int i, nkeys;
+	MDB_val key;
+	DKBUF;
+
+	nkeys = NUMKEYS(mp);
+	/* nkeys and i are unsigned int, so they are printed with %u, not %d */
+	DPRINTF("numkeys %u", nkeys);
+	for (i=0; i<nkeys; i++) {
+		node = NODEPTR(mp, i);
+		/* Wrap the node's key bytes in an MDB_val so DKEY() can format them */
+		key.mv_size = node->mn_ksize;
+		key.mv_data = node->mn_data;
+		DPRINTF("key %u: %s", i, DKEY(&key));
+	}
+}
#endif
int
mdb_page_malloc(MDB_cursor *mc) {
MDB_page *ret;
size_t sz = mc->mc_txn->mt_env->me_psize;
- if (mc->mc_txn->mt_env->me_dpages) {
- ret = mc->mc_txn->mt_env->me_dpages;
+ if ((ret = mc->mc_txn->mt_env->me_dpages) != NULL) {
VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
mc->mc_txn->mt_env->me_dpages = ret->mp_next;
- } else {
- ret = malloc(sz);
+ } else if ((ret = malloc(sz)) != NULL) {
VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
}
return ret;
static int
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
{
- char page[MDB_PAGESIZE];
+ MDB_pagebuf pbuf;
MDB_page *p;
MDB_meta *m;
int rc, err;
*/
#ifdef _WIN32
- if (!ReadFile(env->me_fd, page, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
+ if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
#else
- if ((rc = read(env->me_fd, page, MDB_PAGESIZE)) == 0)
+ if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0)
#endif
{
return ENOENT;
return err;
}
- p = (MDB_page *)page;
+ p = (MDB_page *)&pbuf;
if (!F_ISSET(p->mp_flags, P_META)) {
DPRINTF("page %zu not a meta page", p->mp_pgno);
i |= MAP_FIXED;
env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i,
env->me_fd, 0);
- if (env->me_map == MAP_FAILED)
+ if (env->me_map == MAP_FAILED) {
+ env->me_map = NULL;
return ErrCode();
+ }
#endif
if (newenv) {
size = rsize - sizeof(MDB_txninfo);
env->me_maxreaders = size/sizeof(MDB_reader) + 1;
}
-#ifdef _WIN32
{
+#ifdef _WIN32
HANDLE mh;
mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
0, 0, NULL);
rc = ErrCode();
goto fail;
}
- }
#else
- env->me_txns = (MDB_txninfo *)mmap(0, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
- env->me_lfd, 0);
- if (env->me_txns == MAP_FAILED) {
- rc = ErrCode();
- goto fail;
- }
+ void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
+ env->me_lfd, 0);
+ if (m == MAP_FAILED) {
+ env->me_txns = NULL;
+ rc = ErrCode();
+ goto fail;
+ }
+ env->me_txns = m;
#endif
+ }
if (*excl) {
#ifdef _WIN32
char hexbuf[17];
unsigned int mcount = 0;
size_t nsize;
int rc, rc2;
- char pbuf[MDB_PAGESIZE];
+ MDB_pagebuf pbuf;
char dbuf[MAXKEYSIZE+1];
unsigned int nflags;
DKBUF;
/* create a fake page for the dup items */
memcpy(dbuf, dkey.mv_data, dkey.mv_size);
dkey.mv_data = dbuf;
- fp = (MDB_page *)pbuf;
+ fp = (MDB_page *)&pbuf;
fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
fp->mp_lower = PAGEHDRSZ;
do_sub = 1;
rdata = &xdata;
xdata.mv_size = fp->mp_upper;
- xdata.mv_data = pbuf;
+ xdata.mv_data = fp;
flags |= F_DUPDATA;
goto new_sub;
}
/* no, just grow it */
rdata = &xdata;
xdata.mv_size = NODEDSZ(leaf) + offset;
- xdata.mv_data = pbuf;
- mp = (MDB_page *)pbuf;
+ xdata.mv_data = &pbuf;
+ mp = (MDB_page *)&pbuf;
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
flags |= F_DUPDATA;
}
* DB are all zero size.
*/
if (do_sub) {
- MDB_db *db;
int xflags;
put_sub:
xdata.mv_size = 0;
xflags |= (flags & MDB_APPEND);
rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
if (flags & F_SUBDATA) {
- db = NODEDATA(leaf);
+ void *db = NODEDATA(leaf);
memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
}
}
if (mc->mc_xcursor->mx_db.md_entries) {
if (leaf->mn_flags & F_SUBDATA) {
/* update subDB info */
- MDB_db *db = NODEDATA(leaf);
+ void *db = NODEDATA(leaf);
memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
} else {
/* shrink fake page */
MDB_xcursor *mx = mc->mc_xcursor;
if (node->mn_flags & F_SUBDATA) {
- MDB_db *db = NODEDATA(node);
- mx->mx_db = *db;
+ memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
mx->mx_cursor.mc_snum = 0;
mx->mx_cursor.mc_flags = C_SUB;
} else {
node = NODEPTR(mp, indx);
ptr = mp->mp_ptrs[indx];
- DPRINTF("update key %u (ofs %u) [%.*s] to [%s] on page %zu",
- indx, ptr,
- (int)node->mn_ksize, (char *)NODEKEY(node),
- DKEY(key),
- mp->mp_pgno);
+#if MDB_DEBUG
+ {
+ MDB_val k2;
+ char kbuf2[(MAXKEYSIZE*2+1)];
+ k2.mv_data = NODEKEY(node);
+ k2.mv_size = node->mn_ksize;
+ DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu",
+ indx, ptr,
+ mdb_dkey(&k2, kbuf2),
+ DKEY(key),
+ mp->mp_pgno);
+ }
+#endif
delta = key->mv_size - node->mn_ksize;
if (delta) {
node->mn_ksize = key->mv_size;
}
- memcpy(NODEKEY(node), key->mv_data, key->mv_size);
+ if (key->mv_size)
+ memcpy(NODEKEY(node), key->mv_data, key->mv_size);
return MDB_SUCCESS;
}
int rc;
MDB_node *srcnode;
MDB_val key, data;
+ pgno_t srcpg;
+ unsigned short flags;
+
DKBUF;
/* Mark src and dst as dirty. */
key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
data.mv_size = 0;
data.mv_data = NULL;
+ srcpg = 0;
+ flags = 0;
} else {
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
+ assert(!((long)srcnode&1));
+ srcpg = NODEPGNO(srcnode);
+ flags = srcnode->mn_flags;
if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
unsigned int snum = csrc->mc_snum;
MDB_node *s2;
data.mv_size = NODEDSZ(srcnode);
data.mv_data = NODEDATA(srcnode);
}
+ if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
+ unsigned int snum = cdst->mc_snum;
+ MDB_node *s2;
+ MDB_val bkey;
+ /* must find the lowest key below dst */
+ mdb_page_search_root(cdst, NULL, 0);
+ s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
+ bkey.mv_size = NODEKSZ(s2);
+ bkey.mv_data = NODEKEY(s2);
+ cdst->mc_snum = snum--;
+ cdst->mc_top = snum;
+ rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &bkey);
+ }
+
DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu",
IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
csrc->mc_ki[csrc->mc_top],
/* Add the node to the destination page.
*/
- rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, NODEPGNO(srcnode),
- srcnode->mn_flags);
+ rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
if (rc != MDB_SUCCESS)
return rc;
} else {
for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
+ if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
+ unsigned int snum = csrc->mc_snum;
+ MDB_node *s2;
+ /* must find the lowest key below src */
+ mdb_page_search_root(csrc, NULL, 0);
+ s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
+ key.mv_size = NODEKSZ(s2);
+ key.mv_data = NODEKEY(s2);
+ csrc->mc_snum = snum--;
+ csrc->mc_top = snum;
+ } else {
+ key.mv_size = srcnode->mn_ksize;
+ key.mv_data = NODEKEY(srcnode);
+ }
- key.mv_size = srcnode->mn_ksize;
- key.mv_data = NODEKEY(srcnode);
data.mv_size = NODEDSZ(srcnode);
data.mv_data = NODEDATA(srcnode);
rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
dbi--;
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
- if (m2 == csrc) continue;
if (csrc->mc_flags & C_SUB)
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
+ if (m3 == csrc) continue;
if (m3->mc_snum < csrc->mc_snum) continue;
if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
m3->mc_pg[csrc->mc_top] = mp;