X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblmdb%2Fmdb.c;h=99e68727e07471e74673873e1011186ecc56ed77;hb=eebbd22c2ff9d6088e307c960be56baf35e1ab0d;hp=0ee9f50b361c2ec3c52818c199ecd1ec34d79aa8;hpb=3a1d73dafff3a4da3a1ec3841831edadcb7e76be;p=openldap diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0ee9f50b36..99e68727e0 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2013 Howard Chu, Symas Corp. + * Copyright 2011-2014 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,10 +37,26 @@ #endif #include #include -#include #ifdef _WIN32 #include +/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it + * as int64 which is wrong. MSVC doesn't define it at all, so just + * don't use it. + */ +#define MDB_PID_T int +#ifdef __GNUC__ +# include +#else +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# define BYTE_ORDER LITTLE_ENDIAN +# ifndef SSIZE_MAX +# define SSIZE_MAX INT_MAX +# endif +#endif #else +#define MDB_PID_T pid_t +#include #include #include #ifdef HAVE_SYS_FILE_H @@ -49,7 +65,6 @@ #include #endif -#include #include #include #include @@ -75,6 +90,7 @@ #ifndef _WIN32 #include #ifdef MDB_USE_POSIX_SEM +# define MDB_USE_HASH 1 #include #endif #endif @@ -132,14 +148,26 @@ /** @defgroup internal MDB Internals * @{ */ -/** @defgroup compat Windows Compatibility Macros +/** @defgroup compat Compatibility Macros * A bunch of macros to minimize the amount of platform-specific ifdefs * needed throughout the rest of the code. When the features this library * needs are similar enough to POSIX to be hidden in a one-or-two line * replacement, this macro approach is used. * @{ */ + + /** Wrapper around __func__, which is a C99 feature */ +#if __STDC_VERSION__ >= 199901L +# define mdb_func_ __func__ +#elif __GNUC__ >= 2 || _MSC_VER >= 1300 +# define mdb_func_ __FUNCTION__ +#else +/* If a debug message says (), update the #if statements above */ +# define mdb_func_ "" +#endif + #ifdef _WIN32 +#define MDB_USE_HASH 1 #define MDB_PIDLOCK 0 #define pthread_t DWORD #define pthread_mutex_t HANDLE @@ -309,7 +337,7 @@ static txnid_t mdb_debug_start; */ # define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) # define DPRINTF0(fmt, ...) \ - fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__) + fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) #else # define DPRINTF(args) ((void) 0) #endif @@ -322,10 +350,13 @@ static txnid_t mdb_debug_start; (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) /** @} */ - /** A default memory page size. - * The actual size is platform-dependent, but we use this for - * boot-strapping. We probably should not be using this any more. - * The #GET_PAGESIZE() macro is used to get the actual size. + /** @brief The maximum size of a database page. + * + * This is 32k, since it must fit in #MDB_page.%mp_upper. + * + * LMDB will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. * * Note that we don't currently support Huge pages. On Linux, * regular data files cannot use Huge pages, and in general @@ -334,7 +365,7 @@ static txnid_t mdb_debug_start; * pressure from other processes is high. So until OSs have * actual paging support for Huge pages, they're not viable. */ -#define MDB_PAGESIZE 4096 +#define MAX_PAGESIZE 0x8000 /** The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the @@ -361,20 +392,25 @@ static txnid_t mdb_debug_start; /** The version number for a database's lockfile format. */ #define MDB_LOCK_VERSION 1 - /** @brief The maximum size of a key in the database. - * - * The library rejects bigger keys, and cannot deal with records - * with bigger keys stored by a library with bigger max keysize. + /** @brief The max size of a key we can write, or 0 for dynamic max. * - * We require that keys all fit onto a regular page. This limit - * could be raised a bit further if needed; to something just - * under #MDB_PAGESIZE / #MDB_MINKEYS. + * Define this as 0 to compute the max from the page size. 511 + * is default for backwards compat: liblmdb <= 0.9.10 can break + * when modifying a DB with keys/dupsort data bigger than its max. * - * Note that data items in an #MDB_DUPSORT database are actually keys - * of a subDB, so they're also limited to this size. + * Data items in an #MDB_DUPSORT database are also limited to + * this size, since they're actually keys of a sub-DB. Keys and + * #MDB_DUPSORT data items must fit on a node in a regular page. */ #ifndef MDB_MAXKEYSIZE #define MDB_MAXKEYSIZE 511 +#endif + + /** The maximum size of a key we can write to the environment. */ +#if MDB_MAXKEYSIZE +#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) +#else +#define ENV_MAXKEY(env) ((env)->me_maxkey) #endif /** @brief The maximum size of a data item. @@ -384,11 +420,15 @@ static txnid_t mdb_debug_start; #define MAXDATASIZE 0xffffffffUL #if MDB_DEBUG + /** Key size which fits in a #DKBUF. + * @ingroup debug + */ +#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) /** A key buffer. * @ingroup debug * This is used for printing a hex dump of a key's contents. */ -#define DKBUF char kbuf[(MDB_MAXKEYSIZE*2+1)] +#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] /** Display a key in hex. * @ingroup debug * Invoke a function to display a key in hex. @@ -407,6 +447,9 @@ static txnid_t mdb_debug_start; /** Test if the flags \b f are set in a flag word \b w. */ #define F_ISSET(w, f) (((w) & (f)) == (f)) + /** Round \b n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + /** Used for offsets within a single page. * Since memory pages are typically 4 or 8KB in size, 12-13 bits, * this is plenty. @@ -492,7 +535,7 @@ typedef struct MDB_rxbody { */ txnid_t mrb_txnid; /** The process ID of the process owning this reader txn. */ - pid_t mrb_pid; + MDB_PID_T mrb_pid; /** The thread ID of the thread owning this txn. */ pthread_t mrb_tid; } MDB_rxbody; @@ -658,7 +701,8 @@ typedef struct MDB_page { #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) /** Header for a single key/data pair within a page. - * We guarantee 2-byte alignment for nodes. + * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. + * We guarantee 2-byte alignment for 'MDB_node's. */ typedef struct MDB_node { /** lo and hi are used for data size on leaf nodes and for @@ -667,9 +711,11 @@ typedef struct MDB_node { * They are in host byte order in case that lets some * accesses be optimized into a 32-bit word access. */ -#define mn_lo mn_offset[BYTE_ORDER!=LITTLE_ENDIAN] -#define mn_hi mn_offset[BYTE_ORDER==LITTLE_ENDIAN] /**< part of dsize or pgno */ - unsigned short mn_offset[2]; /**< storage for #mn_lo and #mn_hi */ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ +#else + unsigned short mn_hi, mn_lo; +#endif /** @defgroup mdb_node Node Flags * @ingroup internal * Flags for node headers. @@ -811,19 +857,18 @@ typedef struct MDB_meta { txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; - /** Buffer for a stack-allocated dirty page. + /** Buffer for a stack-allocated meta page. * The members define size and alignment, and silence type * aliasing warnings. They are not used directly; that could * mean incorrectly using several union members in parallel. */ -typedef union MDB_pagebuf { - char mb_raw[MDB_PAGESIZE]; +typedef union MDB_metabuf { MDB_page mb_page; struct { char mm_pad[PAGEHDRSZ]; MDB_meta mm_meta; } mb_metabuf; -} MDB_pagebuf; +} MDB_metabuf; /** Auxiliary DB info. * The information here is mostly static/read-only. There is @@ -891,12 +936,12 @@ struct MDB_txn { * @{ */ #define MDB_TXN_RDONLY 0x01 /**< read-only transaction */ -#define MDB_TXN_ERROR 0x02 /**< an error has occurred */ +#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ /** @} */ unsigned int mt_flags; /**< @ref mdb_txn */ - /** dirty_list room: Array size - #dirty pages visible to this txn. + /** #dirty_list room: Array size - \#dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirty_list into mt_parent after freeing hidden mt_parent pages. @@ -989,19 +1034,19 @@ struct MDB_env { #define MDB_ENV_ACTIVE 0x20000000U /** me_txkey is set */ #define MDB_ENV_TXKEY 0x10000000U - /** Have liveness lock in reader table */ -#define MDB_LIVE_READER 0x08000000U uint32_t me_flags; /**< @ref mdb_env */ - unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */ + unsigned int me_psize; /**< DB page size, inited from me_os_psize */ + unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ unsigned int me_maxreaders; /**< size of the reader table */ unsigned int me_numreaders; /**< max numreaders set by this env */ MDB_dbi me_numdbs; /**< number of DBs opened */ MDB_dbi me_maxdbs; /**< size of the DB table */ - pid_t me_pid; /**< process ID of this env */ + MDB_PID_T me_pid; /**< process ID of this env */ char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ size_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ @@ -1021,6 +1066,10 @@ struct MDB_env { int me_maxfree_1pg; /** Max size of a node on a page */ unsigned int me_nodemax; +#if !(MDB_MAXKEYSIZE) + unsigned int me_maxkey; /**< max size of a key */ +#endif + int me_live_reader; /**< have liveness lock in reader table */ #ifdef _WIN32 int me_pidquery; /**< Used in OpenProcess */ HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ @@ -1029,6 +1078,8 @@ struct MDB_env { sem_t *me_rmutex; /* Shared mutexes are not supported */ sem_t *me_wmutex; #endif + void *me_userctx; /**< User-settable context */ + MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ }; /** Nested transaction */ @@ -1044,9 +1095,13 @@ typedef struct MDB_ntxn { #define MDB_COMMIT_PAGES IOV_MAX #endif - /* max bytes to write in one call */ + /** max bytes to write in one call */ #define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) + /** Check \b txn and \b dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi) \ + ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID)) + static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); static int mdb_page_touch(MDB_cursor *mc); @@ -1077,7 +1132,7 @@ static void mdb_env_close0(MDB_env *env, int excl); static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); static int mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); -static void mdb_node_del(MDB_page *mp, indx_t indx, int ksize); +static void mdb_node_del(MDB_cursor *mc, int ksize); static void mdb_node_shrink(MDB_page *mp, indx_t indx); static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst); static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); @@ -1090,7 +1145,8 @@ static int mdb_update_key(MDB_cursor *mc, MDB_val *key); static void mdb_cursor_pop(MDB_cursor *mc); static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); -static int mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf); +static int mdb_cursor_del0(MDB_cursor *mc); +static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); @@ -1146,7 +1202,7 @@ static char *const mdb_errstr[] = { "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", "MDB_BAD_TXN: Transaction cannot recover - it must be aborted", - "MDB_BAD_VALSIZE: Too big key/data, key is empty, or wrong DUPFIXED size", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", }; char * @@ -1164,7 +1220,43 @@ mdb_strerror(int err) return strerror(err); } +/** assert(3) variant in cursor context */ +#define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) +/** assert(3) variant in transaction context */ +#define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr) +/** assert(3) variant in environment context */ +#define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) + +#ifndef NDEBUG +# define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ + mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) + +static void +mdb_assert_fail(MDB_env *env, const char *expr_txt, + const char *func, const char *file, int line) +{ + char buf[400]; + sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", + file, line, expr_txt, func); + if (env->me_assert_func) + env->me_assert_func(env, buf); + fprintf(stderr, "%s\n", buf); + abort(); +} +#else +# define mdb_assert0(env, expr, expr_txt) ((void) 0) +#endif /* NDEBUG */ + #if MDB_DEBUG +/** Return the page number of \b mp which may be sub-page, for debug output */ +static pgno_t +mdb_dbg_pgno(MDB_page *mp) +{ + pgno_t ret; + COPY_PGNO(ret, mp->mp_pgno); + return ret; +} + /** Display a key in hexadecimal and return the address of the result. * @param[in] key the key to display * @param[in] buf the buffer to write into. Should always be #DKBUF. @@ -1180,7 +1272,7 @@ mdb_dkey(MDB_val *key, char *buf) if (!key) return ""; - if (key->mv_size > MDB_MAXKEYSIZE) + if (key->mv_size > DKBUF_MAXKEYSIZE) return "MDB_MAXKEYSIZE"; /* may want to make this a dynamic check: if the key is mostly * printable characters, print it as-is instead of converting to hex. @@ -1195,33 +1287,77 @@ mdb_dkey(MDB_val *key, char *buf) return buf; } +static const char * +mdb_leafnode_type(MDB_node *n) +{ + static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; + return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : + tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; +} + /** Display all the keys in the page. */ void mdb_page_list(MDB_page *mp) { + pgno_t pgno = mdb_dbg_pgno(mp); + const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; MDB_node *node; - unsigned int i, nkeys, nsize; + unsigned int i, nkeys, nsize, total = 0; MDB_val key; DKBUF; + switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { + case P_BRANCH: type = "Branch page"; break; + case P_LEAF: type = "Leaf page"; break; + case P_LEAF|P_SUBP: type = "Sub-page"; break; + case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; + case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; + case P_OVERFLOW: + fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", + pgno, mp->mp_pages, state); + return; + case P_META: + fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", + pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); + return; + default: + fprintf(stderr, "Bad page %"Z"u flags 0x%u\n", pgno, mp->mp_flags); + return; + } + nkeys = NUMKEYS(mp); - fprintf(stderr, "Page %"Z"u numkeys %d\n", mp->mp_pgno, nkeys); + fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); + for (i=0; imp_pad; + key.mv_data = LEAF2KEY(mp, i, nsize); + total += nsize; + fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); + continue; + } node = NODEPTR(mp, i); key.mv_size = node->mn_ksize; key.mv_data = node->mn_data; - nsize = NODESIZE + NODEKSZ(node) + sizeof(indx_t); + nsize = NODESIZE + key.mv_size; if (IS_BRANCH(mp)) { fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), DKEY(&key)); + total += nsize; } else { if (F_ISSET(node->mn_flags, F_BIGDATA)) nsize += sizeof(pgno_t); else nsize += NODEDSZ(node); - fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); + total += nsize; + nsize += sizeof(indx_t); + fprintf(stderr, "key %d: nsize %d, %s%s\n", + i, nsize, DKEY(&key), mdb_leafnode_type(node)); } + total = EVEN(total); } + fprintf(stderr, "Total: header %d + contents %d + unused %d\n", + IS_LEAF2(mp) ? PAGEHDRSZ : mp->mp_lower, total, SIZELEFT(mp)); } void @@ -1247,6 +1383,7 @@ mdb_cursor_chk(MDB_cursor *mc) /** Count all the pages in each DB and in the freelist * and make sure it matches the actual number of pages * being used. + * All named DBs must be open for a correct count. */ static void mdb_audit(MDB_txn *txn) { @@ -1260,10 +1397,13 @@ static void mdb_audit(MDB_txn *txn) mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; + mdb_tassert(txn, rc == MDB_NOTFOUND); count = 0; for (i = 0; imt_numdbs; i++) { MDB_xcursor mx; + if (!(txn->mt_dbflags[i] & DB_VALID)) + continue; mdb_cursor_init(&mc, txn, i, &mx); if (txn->mt_dbs[i].md_root == P_INVALID) continue; @@ -1271,8 +1411,8 @@ static void mdb_audit(MDB_txn *txn) txn->mt_dbs[i].md_leaf_pages + txn->mt_dbs[i].md_overflow_pages; if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { - mdb_page_search(&mc, NULL, MDB_PS_FIRST); - do { + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); + for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { unsigned j; MDB_page *mp; mp = mc.mc_pg[mc.mc_top]; @@ -1286,7 +1426,7 @@ static void mdb_audit(MDB_txn *txn) } } } - while (mdb_cursor_sibling(&mc, 1) == 0); + mdb_tassert(txn, rc == MDB_NOTFOUND); } } if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) { @@ -1316,7 +1456,12 @@ mdb_page_malloc(MDB_txn *txn, unsigned num) { MDB_env *env = txn->mt_env; MDB_page *ret = env->me_dpages; - size_t sz = env->me_psize; + size_t psize = env->me_psize, sz = psize, off; + /* For ! #MDB_NOMEMINIT, psize counts how much to init. + * For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. + */ if (num == 1) { if (ret) { VGMEMP_ALLOC(env, ret, sz); @@ -1324,11 +1469,19 @@ mdb_page_malloc(MDB_txn *txn, unsigned num) env->me_dpages = ret->mp_next; return ret; } + psize -= off = PAGEHDRSZ; } else { sz *= num; + off = sz - psize; } if ((ret = malloc(sz)) != NULL) { VGMEMP_ALLOC(env, ret, sz); + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; } return ret; } @@ -1594,7 +1747,7 @@ static void mdb_page_dirty(MDB_txn *txn, MDB_page *mp) { MDB_ID2 mid; - int (*insert)(MDB_ID2L, MDB_ID2 *); + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); if (txn->mt_env->me_flags & MDB_WRITEMAP) { insert = mdb_mid2l_append; @@ -1603,7 +1756,8 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) } mid.mid = mp->mp_pgno; mid.mptr = mp; - insert(txn->mt_u.dirty_list, &mid); + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); txn->mt_dirty_room--; } @@ -1636,11 +1790,11 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) #else enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; #endif - int rc, n2 = num-1, retry = Max_retries; + int rc, retry = num * 20; MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; pgno_t pgno, *mop = env->me_pghead; - unsigned i, j, k, mop_len = mop ? mop[0] : 0; + unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1; MDB_page *np; txnid_t oldest = 0, last; MDB_cursor_op op; @@ -1649,8 +1803,10 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) *mp = NULL; /* If our dirty list is already full, we can't do anything */ - if (txn->mt_dirty_room == 0) - return MDB_TXN_FULL; + if (txn->mt_dirty_room == 0) { + rc = MDB_TXN_FULL; + goto fail; + } for (op = MDB_FIRST;; op = MDB_NEXT) { MDB_val key, data; @@ -1660,14 +1816,14 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) /* Seek a big enough contiguous page range. Prefer * pages at the tail, just truncating the list. */ - if (mop_len >= (unsigned)num) { + if (mop_len > n2) { i = mop_len; do { pgno = mop[i]; if (mop[i-n2] == pgno+n2) goto search_done; - } while (--i >= (unsigned)num); - if (Max_retries < INT_MAX && --retry < 0) + } while (--i > n2); + if (--retry < 0) break; } @@ -1695,7 +1851,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (rc) { if (rc == MDB_NOTFOUND) break; - return rc; + goto fail; } last = *(txnid_t*)key.mv_data; if (oldest <= last) @@ -1708,11 +1864,13 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) idl = (MDB_ID *) data.mv_data; i = idl[0]; if (!mop) { - if (!(env->me_pghead = mop = mdb_midl_alloc(i))) - return ENOMEM; + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { + rc = ENOMEM; + goto fail; + } } else { if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) - return rc; + goto fail; mop = env->me_pghead; } env->me_pglast = last; @@ -1741,15 +1899,18 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) pgno = txn->mt_next_pgno; if (pgno + num >= env->me_maxpg) { DPUTS("DB size maxed out"); - return MDB_MAP_FULL; + rc = MDB_MAP_FULL; + goto fail; } search_done: if (env->me_flags & MDB_WRITEMAP) { np = (MDB_page *)(env->me_map + env->me_psize * pgno); } else { - if (!(np = mdb_page_malloc(txn, num))) - return ENOMEM; + if (!(np = mdb_page_malloc(txn, num))) { + rc = ENOMEM; + goto fail; + } } if (i) { mop[0] = mop_len -= num; @@ -1764,6 +1925,10 @@ search_done: *mp = np; return MDB_SUCCESS; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; } /** Copy the used portions of a non-overflow page. @@ -1794,7 +1959,7 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. * @param[in] txn the transaction handle. - * @param[in] mp the page being referenced. + * @param[in] mp the page being referenced. It must not be dirty. * @param[out] ret the writable page, if any. ret is unchanged if * mp wasn't spilled. */ @@ -1870,17 +2035,17 @@ mdb_page_touch(MDB_cursor *mc) np = NULL; rc = mdb_page_unspill(txn, mp, &np); if (rc) - return rc; + goto fail; if (np) goto done; } if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || (rc = mdb_page_alloc(mc, 1, &np))) - return rc; + goto fail; pgno = np->mp_pgno; DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), mp->mp_pgno, pgno)); - assert(mp->mp_pgno != pgno); + mdb_cassert(mc, mp->mp_pgno != pgno); mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { @@ -1901,19 +2066,21 @@ mdb_page_touch(MDB_cursor *mc) if (x <= dl[0].mid && dl[x].mid == pgno) { if (mp != dl[x].mptr) { /* bad cursor? */ mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; return MDB_CORRUPTED; } return 0; } } - assert(dl[0].mid < MDB_IDL_UM_MAX); + mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); /* No - copy it */ np = mdb_page_malloc(txn, 1); if (!np) return ENOMEM; mid.mid = pgno; mid.mptr = np; - mdb_mid2l_insert(dl, &mid); + rc = mdb_mid2l_insert(dl, &mid); + mdb_cassert(mc, rc == 0); } else { return 0; } @@ -1939,6 +2106,7 @@ done: if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; if ((mc->mc_db->md_flags & MDB_DUPSORT) && + IS_LEAF(np) && m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); @@ -1949,6 +2117,10 @@ done: } } return 0; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; } int @@ -2076,7 +2248,7 @@ enum Pidlock_op { * lock on the lockfile, set at an offset equal to the pid. */ static int -mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid) +mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) { #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ int ret = 0; @@ -2141,16 +2313,14 @@ mdb_txn_renew0(MDB_txn *txn) if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) return MDB_BAD_RSLOT; } else { - pid_t pid = env->me_pid; + MDB_PID_T pid = env->me_pid; pthread_t tid = pthread_self(); - if (!(env->me_flags & MDB_LIVE_READER)) { + if (!env->me_live_reader) { rc = mdb_reader_pid(env, Pidset, pid); - if (rc) { - UNLOCK_MUTEX_R(env); + if (rc) return rc; - } - env->me_flags |= MDB_LIVE_READER; + env->me_live_reader = 1; } LOCK_MUTEX_R(env); @@ -2480,7 +2650,7 @@ mdb_freelist_save(MDB_txn *txn) int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; txnid_t pglast = 0, head_id = 0; pgno_t freecnt = 0, *free_pgs, *mop; - ssize_t head_room = 0, total_room = 0, mop_len; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; mdb_cursor_init(&mc, txn, FREE_DBI, NULL); @@ -2491,9 +2661,15 @@ mdb_freelist_save(MDB_txn *txn) return rc; } + /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ + clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) + ? SSIZE_MAX : maxfree_1pg; + for (;;) { /* Come back here after each Put() in case freelist changed */ MDB_val key, data; + pgno_t *pgs; + ssize_t j; /* If using records from freeDB which we have not yet * deleted, delete them and any we reserved for me_pghead. @@ -2504,7 +2680,7 @@ mdb_freelist_save(MDB_txn *txn) return rc; pglast = head_id = *(txnid_t *)key.mv_data; total_room = head_room = 0; - assert(pglast <= env->me_pglast); + mdb_tassert(txn, pglast <= env->me_pglast); rc = mdb_cursor_del(&mc, 0); if (rc) return rc; @@ -2577,7 +2753,12 @@ mdb_freelist_save(MDB_txn *txn) rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) return rc; - *(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */ + /* IDL is initially empty, zero out at least the length */ + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); total_room += head_room; } @@ -2589,22 +2770,20 @@ mdb_freelist_save(MDB_txn *txn) mop += mop_len; rc = mdb_cursor_first(&mc, &key, &data); for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { - unsigned flags = MDB_CURRENT; txnid_t id = *(txnid_t *)key.mv_data; ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; MDB_ID save; - assert(len >= 0 && id <= env->me_pglast); + mdb_tassert(txn, len >= 0 && id <= env->me_pglast); key.mv_data = &id; if (len > mop_len) { len = mop_len; data.mv_size = (len + 1) * sizeof(MDB_ID); - flags = 0; } data.mv_data = mop -= len; save = mop[0]; mop[0] = len; - rc = mdb_cursor_put(&mc, &key, &data, flags); + rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); mop[0] = save; if (rc || !(mop_len -= len)) break; @@ -2760,8 +2939,8 @@ mdb_txn_commit(MDB_txn *txn) unsigned int i; MDB_env *env; - assert(txn != NULL); - assert(txn->mt_env != NULL); + if (txn == NULL || txn->mt_env == NULL) + return EINVAL; if (txn->mt_child) { rc = mdb_txn_commit(txn->mt_child); @@ -2869,7 +3048,7 @@ mdb_txn_commit(MDB_txn *txn) if (yp == dst[x].mid) free(dst[x--].mptr); } - assert(i == x); + mdb_tassert(txn, i == x); dst[0].mid = len; free(txn->mt_u.dirty_list); parent->mt_dirty_room = txn->mt_dirty_room; @@ -2968,10 +3147,11 @@ fail: static int mdb_env_read_header(MDB_env *env, MDB_meta *meta) { - MDB_pagebuf pbuf; + MDB_metabuf pbuf; MDB_page *p; MDB_meta *m; int i, rc, off; + enum { Size = sizeof(pbuf) }; /* We don't know the page size yet, so use a minimum value. * Read both meta pages so we can use the latest one. @@ -2983,13 +3163,13 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); ov.Offset = off; - rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1; + rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) rc = 0; #else - rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off); + rc = pread(env->me_fd, &pbuf, Size, off); #endif - if (rc != MDB_PAGESIZE) { + if (rc != Size) { if (rc == 0 && off == 0) return ENOENT; rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; @@ -3101,9 +3281,6 @@ mdb_env_write_meta(MDB_txn *txn) int r2; #endif - assert(txn != NULL); - assert(txn->mt_env != NULL); - toggle = txn->mt_txnid & 1; DPRINTF(("writing meta page %d for root page %"Z"u", toggle, txn->mt_dbs[MAIN_DBI].md_root)); @@ -3120,11 +3297,18 @@ mdb_env_write_meta(MDB_txn *txn) mp->mm_last_pg = txn->mt_next_pgno - 1; mp->mm_txnid = txn->mt_txnid; if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; ptr = env->me_map; - if (toggle) - ptr += env->me_psize; - if (MDB_MSYNC(ptr, env->me_psize, rc)) { + if (toggle) { +#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ + if (meta_size < env->me_os_psize) + meta_size += meta_size; + else +#endif + ptr += meta_size; + } + if (MDB_MSYNC(ptr, meta_size, rc)) { rc = ErrCode(); goto fail; } @@ -3230,6 +3414,7 @@ mdb_env_create(MDB_env **env) e->me_wmutex = SEM_FAILED; #endif e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); VGMEMP_CREATE(e,0,0); *env = e; return MDB_SUCCESS; @@ -3272,7 +3457,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize) int prot = PROT_READ; if (flags & MDB_WRITEMAP) { prot |= PROT_WRITE; - if (newsize && ftruncate(env->me_fd, env->me_mapsize) < 0) + if (ftruncate(env->me_fd, env->me_mapsize) < 0) return ErrCode(); } env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, @@ -3281,14 +3466,17 @@ mdb_env_map(MDB_env *env, void *addr, int newsize) env->me_map = NULL; return ErrCode(); } - /* Turn off readahead. It's harmful when the DB is larger than RAM. */ + + if (flags & MDB_NORDAHEAD) { + /* Turn off readahead. It's harmful when the DB is larger than RAM. */ #ifdef MADV_RANDOM - madvise(env->me_map, env->me_mapsize, MADV_RANDOM); + madvise(env->me_map, env->me_mapsize, MADV_RANDOM); #else #ifdef POSIX_MADV_RANDOM - posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); + posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); #endif /* POSIX_MADV_RANDOM */ #endif /* MADV_RANDOM */ + } #endif /* _WIN32 */ /* Can happen because the address argument to mmap() is just a @@ -3319,6 +3507,14 @@ mdb_env_set_mapsize(MDB_env *env, size_t size) return EINVAL; if (!size) size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize; + else if (size < env->me_mapsize) { + /* If the configured size is smaller, make sure it's + * still big enough. Silently round up to minimum if not. + */ + size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize; + if (size < minsize) + size = minsize; + } munmap(env->me_map, env->me_mapsize); env->me_mapsize = size; old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; @@ -3384,7 +3580,9 @@ mdb_env_open2(MDB_env *env) return i; DPUTS("new mdbenv"); newenv = 1; - GET_PAGESIZE(env->me_psize); + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; } else { env->me_psize = meta.mm_psize; } @@ -3404,7 +3602,7 @@ mdb_env_open2(MDB_env *env) env->me_mapsize = minsize; } - rc = mdb_env_map(env, meta.mm_address, newenv); + rc = mdb_env_map(env, meta.mm_address, newenv || env->me_mapsize != meta.mm_mapsize); if (rc) return rc; @@ -3416,10 +3614,15 @@ mdb_env_open2(MDB_env *env) return i; } } - env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = (env->me_psize - PAGEHDRSZ) / MDB_MINKEYS; + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) + - sizeof(indx_t); +#if !(MDB_MAXKEYSIZE) + env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); +#endif env->me_maxpg = env->me_mapsize / env->me_psize; + #if MDB_DEBUG { int toggle = mdb_env_pick_meta(env); @@ -3474,7 +3677,9 @@ static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) case DLL_THREAD_DETACH: for (i=0; ime_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); if (size < rsize && *excl > 0) { #ifdef _WIN32 - if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize + if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize || !SetEndOfFile(env->me_lfd)) goto fail_errno; #else @@ -3915,8 +4120,13 @@ fail: * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. */ -#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) -#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK) +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \ + MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) + +#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) +# error "Persistent DB flags & env flags overlap, but both go in mm_flags" +#endif int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) @@ -4029,7 +4239,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode DPRINTF(("opened dbenv %p", (void *) env)); if (excl > 0) { rc = mdb_env_share_locks(env, &excl); + if (rc) + goto leave; } + if (!((flags & MDB_RDONLY) || + (env->me_pbuf = calloc(1, env->me_psize)))) + rc = ENOMEM; } leave: @@ -4053,6 +4268,7 @@ mdb_env_close0(MDB_env *env, int excl) for (i = env->me_maxdbs; --i > MAIN_DBI; ) free(env->me_dbxs[i].md_name.mv_data); + free(env->me_pbuf); free(env->me_dbflags); free(env->me_dbxs); free(env->me_path); @@ -4080,7 +4296,7 @@ mdb_env_close0(MDB_env *env, int excl) if (env->me_fd != INVALID_HANDLE_VALUE) (void) close(env->me_fd); if (env->me_txns) { - pid_t pid = env->me_pid; + MDB_PID_T pid = env->me_pid; /* Clearing readers is done in this function because * me_txkey with its destructor must be disabled first. */ @@ -4330,6 +4546,13 @@ mdb_cmp_cint(const MDB_val *a, const MDB_val *b) #endif } +/** Compare two items pointing at size_t's of unknown alignment. */ +#ifdef MISALIGNED_OK +# define mdb_cmp_clong mdb_cmp_long +#else +# define mdb_cmp_clong mdb_cmp_cint +#endif + /** Compare two items lexically */ static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) @@ -4396,17 +4619,9 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) nkeys = NUMKEYS(mp); -#if MDB_DEBUG - { - pgno_t pgno; - COPY_PGNO(pgno, mp->mp_pgno); DPRINTF(("searching %u keys in %s %spage %"Z"u", nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - pgno)); - } -#endif - - assert(nkeys > 0); + mdb_dbg_pgno(mp))); low = IS_LEAF(mp) ? 0 : 1; high = nkeys - 1; @@ -4470,7 +4685,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) node = NODEPTR(mp, i); } if (exactp) - *exactp = (rc == 0); + *exactp = (rc == 0 && nkeys > 0); /* store the key index */ mc->mc_ki[mc->mc_top] = i; if (i >= nkeys) @@ -4520,7 +4735,7 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) DDBI(mc), (void *) mc)); if (mc->mc_snum >= CURSOR_STACK) { - assert(mc->mc_snum < CURSOR_STACK); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return MDB_CURSOR_FULL; } @@ -4580,7 +4795,7 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) p = (MDB_page *)(env->me_map + env->me_psize * pgno); } else { DPRINTF(("page %"Z"u not found", pgno)); - assert(p != NULL); + txn->mt_flags |= MDB_TXN_ERROR; return MDB_PAGE_NOTFOUND; } @@ -4606,7 +4821,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) indx_t i; DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); - assert(NUMKEYS(mp) > 1); + mdb_cassert(mc, NUMKEYS(mp) > 1); DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { @@ -4621,14 +4836,14 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) else { i = mc->mc_ki[mc->mc_top]; if (!exact) { - assert(i > 0); + mdb_cassert(mc, i > 0); i--; } } DPRINTF(("following index %u for key [%s]", i, DKEY(key))); } - assert(i < NUMKEYS(mp)); + mdb_cassert(mc, i < NUMKEYS(mp)); node = NODEPTR(mp, i); if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) @@ -4648,6 +4863,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) if (!IS_LEAF(mp)) { DPRINTF(("internal error, index points to a %02X page!?", mp->mp_flags)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return MDB_CORRUPTED; } @@ -4742,7 +4958,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) } } - assert(root > 1); + mdb_cassert(mc, root > 1); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) return rc; @@ -4812,7 +5028,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) iy = dl[x]; dl[x] = ix; } else { - assert(x > 1); + mdb_cassert(mc, x > 1); j = ++(dl[0].mid); dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ txn->mt_flags |= MDB_TXN_ERROR; @@ -4880,20 +5096,14 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, int exact = 0; DKBUF; - assert(key); - assert(data); DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; if (txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; - if (key->mv_size > MDB_MAXKEYSIZE) { - return MDB_BAD_VALSIZE; - } - mdb_cursor_init(&mc, txn, dbi, &mx); return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); } @@ -4939,11 +5149,14 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) DPRINTF(("just moving to %s index key %u", move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); } - assert(IS_BRANCH(mc->mc_pg[mc->mc_top])); + mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL) != 0)) + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { + /* mc will be inconsistent if caller does mc_snum++ as above */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); return rc; + } mdb_cursor_push(mc, mp); if (!move_right) @@ -4964,7 +5177,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return MDB_NOTFOUND; } - assert(mc->mc_flags & C_INITIALIZED); + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); mp = mc->mc_pg[mc->mc_top]; @@ -4986,7 +5199,8 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", mp->mp_pgno, (void *) mc)); + DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); if (mc->mc_flags & C_DEL) goto skip; @@ -5003,7 +5217,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) skip: DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { key->mv_size = mc->mc_db->md_pad; @@ -5011,7 +5225,7 @@ skip: return MDB_SUCCESS; } - assert(IS_LEAF(mp)); + mdb_cassert(mc, IS_LEAF(mp)); leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5040,7 +5254,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - assert(mc->mc_flags & C_INITIALIZED); + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); mp = mc->mc_pg[mc->mc_top]; @@ -5062,7 +5276,8 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", mp->mp_pgno, (void *) mc)); + DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); if (mc->mc_ki[mc->mc_top] == 0) { DPUTS("=====> move to prev sibling page"); @@ -5078,7 +5293,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mc->mc_flags &= ~C_EOF; DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { key->mv_size = mc->mc_db->md_pad; @@ -5086,7 +5301,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return MDB_SUCCESS; } - assert(IS_LEAF(mp)); + mdb_cassert(mc, IS_LEAF(mp)); leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5117,8 +5332,6 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_node *leaf = NULL; DKBUF; - assert(mc); - assert(key); if (key->mv_size == 0) return MDB_BAD_VALSIZE; @@ -5208,7 +5421,7 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE) { + if (op == MDB_SET_RANGE && !exactp) { rc = 0; goto set1; } else @@ -5221,7 +5434,7 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc; mp = mc->mc_pg[mc->mc_top]; - assert(IS_LEAF(mp)); + mdb_cassert(mc, IS_LEAF(mp)); set2: leaf = mdb_node_search(mc, key, exactp); @@ -5235,7 +5448,7 @@ set2: if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) return rc; /* no entries matched */ mp = mc->mc_pg[mc->mc_top]; - assert(IS_LEAF(mp)); + mdb_cassert(mc, IS_LEAF(mp)); leaf = NODEPTR(mp, 0); } @@ -5244,8 +5457,10 @@ set1: mc->mc_flags &= ~C_EOF; if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } return MDB_SUCCESS; } @@ -5277,6 +5492,7 @@ set1: if (op == MDB_GET_BOTH || rc > 0) return MDB_NOTFOUND; rc = 0; + *data = d2; } } else { @@ -5310,7 +5526,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (rc != MDB_SUCCESS) return rc; } - assert(IS_LEAF(mc->mc_pg[mc->mc_top])); + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); mc->mc_flags |= C_INITIALIZED; @@ -5356,7 +5572,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (rc != MDB_SUCCESS) return rc; } - assert(IS_LEAF(mc->mc_pg[mc->mc_top])); + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); } mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; @@ -5393,7 +5609,8 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, int exact = 0; int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); - assert(mc); + if (mc == NULL) + return EINVAL; if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; @@ -5404,8 +5621,9 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = EINVAL; } else { MDB_page *mp = mc->mc_pg[mc->mc_top]; - if (!NUMKEYS(mp)) { - mc->mc_ki[mc->mc_top] = 0; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; rc = MDB_NOTFOUND; break; } @@ -5444,12 +5662,10 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_SET_RANGE: if (key == NULL) { rc = EINVAL; - } else if (key->mv_size > MDB_MAXKEYSIZE) { - rc = MDB_BAD_VALSIZE; - } else if (op == MDB_SET_RANGE) - rc = mdb_cursor_set(mc, key, data, op, NULL); - else - rc = mdb_cursor_set(mc, key, data, op, &exact); + } else { + rc = mdb_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } break; case MDB_GET_MULTIPLE: if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { @@ -5550,14 +5766,14 @@ fetchm: return rc; } -/** Touch all the pages in the cursor stack. +/** Touch all the pages in the cursor stack. Set mc_top. * Makes sure all the pages are writable, before attempting a write operation. * @param[in] mc The cursor to operate on. */ static int mdb_cursor_touch(MDB_cursor *mc) { - int rc; + int rc = MDB_SUCCESS; if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; @@ -5568,13 +5784,14 @@ mdb_cursor_touch(MDB_cursor *mc) return rc; *mc->mc_dbflag |= DB_DIRTY; } - for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) { - rc = mdb_page_touch(mc); - if (rc) - return rc; + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; } - mc->mc_top = mc->mc_snum-1; - return MDB_SUCCESS; + return rc; } /** Do not spill pages to disk if txn is getting full, may fail instead */ @@ -5585,19 +5802,24 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int flags) { enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */ + MDB_env *env; MDB_node *leaf = NULL; - MDB_val xdata, *rdata, dkey; - MDB_page *fp; + MDB_page *fp, *mp; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; MDB_db dummy; - int do_sub = 0, insert = 0; + int do_sub = 0, insert_key, insert_data; unsigned int mcount = 0, dcount = 0, nospill; size_t nsize; int rc, rc2; - MDB_pagebuf pbuf; - char dbuf[MDB_MAXKEYSIZE+1]; unsigned int nflags; DKBUF; + if (mc == NULL || key == NULL) + return EINVAL; + + env = mc->mc_txn->mt_env; + /* Check this first so counter will always be zero on any * early failures. */ @@ -5614,14 +5836,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - if (flags != MDB_CURRENT && (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE)) - return MDB_BAD_VALSIZE; - - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT) && data->mv_size > MDB_MAXKEYSIZE) + if (key->mv_size-1 >= ENV_MAXKEY(env)) return MDB_BAD_VALSIZE; #if SIZE_MAX > MAXDATASIZE - if (data->mv_size > MAXDATASIZE) + if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) + return MDB_BAD_VALSIZE; +#else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) return MDB_BAD_VALSIZE; #endif @@ -5637,6 +5859,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } else if (mc->mc_db->md_root == P_INVALID) { /* new database, cursor has nothing to point to */ mc->mc_snum = 0; + mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; rc = MDB_NO_ROOT; } else { @@ -5704,164 +5927,172 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc2; } - /* The key already exists */ - if (rc == MDB_SUCCESS) { + insert_key = insert_data = rc; + if (insert_key) { + /* The key does not exist */ + DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) + { + /* Too big for a node, insert in sub-DB */ + fp_flags = P_LEAF|P_DIRTY; + fp = env->me_pbuf; + fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { /* there's only a key anyway, so this is a no-op */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; unsigned int ksize = mc->mc_db->md_pad; if (key->mv_size != ksize) return MDB_BAD_VALSIZE; - if (flags == MDB_CURRENT) { - char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->mv_data, ksize); - } + ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); return MDB_SUCCESS; } +more: leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); /* DB has dups? */ if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + /* Prepare (sub-)page/sub-DB to accept the new item, + * if needed. fp: old sub-page or a header faking + * it. mp: new (sub-)page. offset: growth in page + * size. xdata: node data with new page or DB. + */ + unsigned i, offset = 0; + mp = fp = xdata.mv_data = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + /* Was a single item before, must convert now */ -more: if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { /* Just overwrite the current item */ if (flags == MDB_CURRENT) goto current; - dkey.mv_size = NODEDSZ(leaf); - dkey.mv_data = NODEDATA(leaf); #if UINT_MAX < SIZE_MAX - if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t)) -#ifdef MISALIGNED_OK - mc->mc_dbx->md_dcmp = mdb_cmp_long; -#else - mc->mc_dbx->md_dcmp = mdb_cmp_cint; + if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + mc->mc_dbx->md_dcmp = mdb_cmp_clong; #endif -#endif - /* if data matches, skip it */ - if (!mc->mc_dbx->md_dcmp(data, &dkey)) { + /* does data match? */ + if (!mc->mc_dbx->md_dcmp(data, &olddata)) { if (flags & MDB_NODUPDATA) - rc = MDB_KEYEXIST; - else if (flags & MDB_MULTIPLE) - goto next_mult; - else - rc = MDB_SUCCESS; - return rc; + return MDB_KEYEXIST; + /* overwrite it */ + goto current; } - /* create a fake page for the dup items */ - memcpy(dbuf, dkey.mv_data, dkey.mv_size); - dkey.mv_data = dbuf; - fp = (MDB_page *)&pbuf; - fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + /* Back up original data item */ + dkey.mv_size = olddata.mv_size; + dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); + + /* Make sub-page header for the dup items, with dummy body */ fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; fp->mp_lower = PAGEHDRSZ; - fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size; + xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; if (mc->mc_db->md_flags & MDB_DUPFIXED) { fp->mp_flags |= P_LEAF2; fp->mp_pad = data->mv_size; - fp->mp_upper += 2 * data->mv_size; /* leave space for 2 more */ + xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ } else { - fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE + + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + (dkey.mv_size & 1) + (data->mv_size & 1); } - mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); - do_sub = 1; - rdata = &xdata; - xdata.mv_size = fp->mp_upper; - xdata.mv_data = fp; - flags |= F_DUPDATA; - goto new_sub; - } - if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { - /* See if we need to convert from fake page to subDB */ - MDB_page *mp; - unsigned int offset; - unsigned int i; - uint16_t fp_flags; - - fp = NODEDATA(leaf); - if (flags == MDB_CURRENT) { -reuse: + fp->mp_upper = xdata.mv_size; + olddata.mv_size = fp->mp_upper; /* pretend olddata is fp */ + } else if (leaf->mn_flags & F_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= F_DUPDATA|F_SUBDATA; + goto put_sub; + } else { + /* Data is on sub-page */ + fp = olddata.mv_data; + switch (flags) { + default: + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + offset = EVEN(NODESIZE + sizeof(indx_t) + + data->mv_size); + break; + } + offset = fp->mp_pad; + if (SIZELEFT(fp) < offset) { + offset *= 4; /* space for 4 more */ + break; + } + /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ + case MDB_CURRENT: fp->mp_flags |= P_DIRTY; - COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno); + COPY_PGNO(fp->mp_pgno, mp->mp_pgno); mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; flags |= F_DUPDATA; goto put_sub; } - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - offset = fp->mp_pad; - if (SIZELEFT(fp) >= offset) - goto reuse; - offset *= 4; /* space for 4 more */ - } else { - offset = NODESIZE + sizeof(indx_t) + data->mv_size; - } - offset += offset & 1; - fp_flags = fp->mp_flags; - if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) + - offset >= mc->mc_txn->mt_env->me_nodemax) { - /* yes, convert it */ - dummy.md_flags = 0; + xdata.mv_size = olddata.mv_size + offset; + } + + fp_flags = fp->mp_flags; + if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + /* Too big for a sub-page, convert to sub-DB */ + fp_flags &= ~P_SUBP; +prep_subDB: if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp_flags |= P_LEAF2; dummy.md_pad = fp->mp_pad; dummy.md_flags = MDB_DUPFIXED; if (mc->mc_db->md_flags & MDB_INTEGERDUP) dummy.md_flags |= MDB_INTEGERKEY; + } else { + dummy.md_pad = 0; + dummy.md_flags = 0; } dummy.md_depth = 1; dummy.md_branch_pages = 0; dummy.md_leaf_pages = 1; dummy.md_overflow_pages = 0; dummy.md_entries = NUMKEYS(fp); - rdata = &xdata; xdata.mv_size = sizeof(MDB_db); xdata.mv_data = &dummy; if ((rc = mdb_page_alloc(mc, 1, &mp))) return rc; - offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf); + offset = env->me_psize - olddata.mv_size; flags |= F_DUPDATA|F_SUBDATA; dummy.md_root = mp->mp_pgno; - fp_flags &= ~P_SUBP; - } else { - /* no, just grow it */ - rdata = &xdata; - xdata.mv_size = NODEDSZ(leaf) + offset; - xdata.mv_data = &pbuf; - mp = (MDB_page *)&pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; - flags |= F_DUPDATA; - } + } + if (mp != fp) { mp->mp_flags = fp_flags | P_DIRTY; mp->mp_pad = fp->mp_pad; mp->mp_lower = fp->mp_lower; mp->mp_upper = fp->mp_upper + offset; - if (IS_LEAF2(fp)) { + if (fp_flags & P_LEAF2) { memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); } else { - nsize = NODEDSZ(leaf) - fp->mp_upper; - memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize); + memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, + olddata.mv_size - fp->mp_upper); for (i=0; imp_ptrs[i] = fp->mp_ptrs[i] + offset; } - mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); - do_sub = 1; - goto new_sub; } - /* data is on sub-DB, just store it */ - flags |= F_DUPDATA|F_SUBDATA; - goto put_sub; + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = 1; + if (!insert_key) + mdb_node_del(mc, 0); + goto new_sub; } current: /* overflow page overwrites need special handling */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDB_page *omp; pgno_t pg; - unsigned psize = mc->mc_txn->mt_env->me_psize; - int level, ovpages, dpages = OVPAGES(data->mv_size, psize); + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + memcpy(&pg, olddata.mv_data, sizeof(pg)); if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) return rc2; ovpages = omp->mp_pages; @@ -5869,7 +6100,7 @@ current: /* Is the ov page large enough? */ if (ovpages >= dpages) { if (!(omp->mp_flags & P_DIRTY) && - (level || (mc->mc_txn->mt_env->me_flags & MDB_WRITEMAP))) + (level || (env->me_flags & MDB_WRITEMAP))) { rc = mdb_page_unspill(mc->mc_txn, omp, &omp); if (rc) @@ -5884,14 +6115,15 @@ current: */ if (level > 1) { /* It is writable only in a parent txn */ - size_t sz = (size_t) psize * ovpages, off; + size_t sz = (size_t) env->me_psize * ovpages, off; MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); MDB_ID2 id2; if (!np) return ENOMEM; id2.mid = pg; id2.mptr = np; - mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdb_cassert(mc, rc2 == 0); if (!(flags & MDB_RESERVE)) { /* Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. @@ -5909,46 +6141,42 @@ current: data->mv_data = METADATA(omp); else memcpy(METADATA(omp), data->mv_data, data->mv_size); - goto done; + return MDB_SUCCESS; } } if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) return rc2; - } else if (NODEDSZ(leaf) == data->mv_size) { + } else if (data->mv_size == olddata.mv_size) { /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. */ if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = NODEDATA(leaf); - else if (data->mv_size) - memcpy(NODEDATA(leaf), data->mv_data, data->mv_size); + data->mv_data = olddata.mv_data; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); else memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); - goto done; + return MDB_SUCCESS; } - mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); - mc->mc_db->md_entries--; - } else { - DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); - insert = 1; + mdb_node_del(mc, 0); } rdata = data; new_sub: nflags = flags & NODE_ADD_FLAGS; - nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata); + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) - nflags &= ~MDB_APPEND; - if (!insert) + nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ + if (!insert_key) nflags |= MDB_SPLIT_REPLACE; rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); } else { /* There is room already in this leaf page. */ rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (rc == 0 && !do_sub && insert) { + if (rc == 0 && insert_key) { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; @@ -5968,9 +6196,7 @@ new_sub: } } - if (rc != MDB_SUCCESS) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - else { + if (rc == MDB_SUCCESS) { /* Now store the actual data in the child DB. Note that we're * storing the user data in the keys field, so there are strict * size limits on dupdata. The actual data fields of the child @@ -5978,6 +6204,7 @@ new_sub: */ if (do_sub) { int xflags; + size_t ecount; put_sub: xdata.mv_size = 0; xdata.mv_data = ""; @@ -5993,7 +6220,7 @@ put_sub: if (dkey.mv_size) { rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); if (rc) - return rc; + goto bad_sub; { /* Adjust other cursors pointing to mp */ MDB_cursor *m2; @@ -6011,6 +6238,7 @@ put_sub: /* we've done our job */ dkey.mv_size = 0; } + ecount = mc->mc_xcursor->mx_db.md_entries; if (flags & MDB_APPENDDUP) xflags |= MDB_APPEND; rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); @@ -6018,32 +6246,38 @@ put_sub: void *db = NODEDATA(leaf); memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); } + insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; } - /* sub-writes might have failed so check rc again. - * Don't increment count if we just replaced an existing item. - */ - if (!rc && !(flags & MDB_CURRENT)) + /* Increment count unless we just replaced an existing item. */ + if (insert_data) mc->mc_db->md_entries++; + if (insert_key) { + /* Invalidate txn if we created an empty sub-DB */ + if (rc) + goto bad_sub; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. + */ + mc->mc_flags |= C_INITIALIZED; + } if (flags & MDB_MULTIPLE) { if (!rc) { -next_mult: mcount++; /* let caller know how many succeeded, if any */ data[1].mv_size = mcount; if (mcount < dcount) { data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + insert_key = insert_data = 0; goto more; } } } + return rc; +bad_sub: + if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ + rc = MDB_CORRUPTED; } -done: - /* If we succeeded and the key didn't exist before, make sure - * the cursor is marked valid. - */ - if (!rc && insert) - mc->mc_flags |= C_INITIALIZED; + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return rc; } @@ -6051,6 +6285,7 @@ int mdb_cursor_del(MDB_cursor *mc, unsigned int flags) { MDB_node *leaf; + MDB_page *mp; int rc; if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) @@ -6059,6 +6294,9 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (!(mc->mc_flags & C_INITIALIZED)) return EINVAL; + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) return rc; @@ -6066,14 +6304,22 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (rc) return rc; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + mp = mc->mc_pg[mc->mc_top]; + if (IS_LEAF2(mp)) + goto del_key; + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (!(flags & MDB_NODUPDATA)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (flags & MDB_NODUPDATA) { + /* mdb_cursor_del0() will subtract the final entry */ + mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + } else { if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); } rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + if (rc) + return rc; /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { if (leaf->mn_flags & F_SUBDATA) { @@ -6083,13 +6329,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) } else { MDB_cursor *m2; /* shrink fake page */ - mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); /* fix other sub-DB cursors pointed at this fake page */ for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top] && + if (m2->mc_pg[mc->mc_top] == mp && m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); } @@ -6104,14 +6350,28 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (leaf->mn_flags & F_SUBDATA) { /* add all the child DB's pages to the free list */ rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (rc == MDB_SUCCESS) { - mc->mc_db->md_entries -= - mc->mc_xcursor->mx_db.md_entries; - } + if (rc) + goto fail; } } - return mdb_cursor_del0(mc, leaf); + /* add overflow pages to free list */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) + goto fail; + } + +del_key: + return mdb_cursor_del0(mc); + +fail: + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; } /** Allocate and initialize new pages for a database. @@ -6166,13 +6426,12 @@ mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) size_t sz; sz = LEAFSIZE(key, data); - if (sz >= env->me_nodemax) { + if (sz > env->me_nodemax) { /* put on overflow page */ sz -= data->mv_size - sizeof(pgno_t); } - sz += sz & 1; - return sz + sizeof(indx_t); + return EVEN(sz + sizeof(indx_t)); } /** Calculate the size of a branch node. @@ -6191,7 +6450,7 @@ mdb_branch_size(MDB_env *env, MDB_val *key) size_t sz; sz = INDXSIZE(key); - if (sz >= env->me_nodemax) { + if (sz > env->me_nodemax) { /* put on overflow page */ /* not implemented */ /* sz -= key->size - sizeof(pgno_t); */ @@ -6221,18 +6480,19 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, { unsigned int i; size_t node_size = NODESIZE; + ssize_t room; indx_t ofs; MDB_node *node; MDB_page *mp = mc->mc_pg[mc->mc_top]; MDB_page *ofp = NULL; /* overflow page */ DKBUF; - assert(mp->mp_upper >= mp->mp_lower); + mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mp->mp_pgno, indx, data ? data->mv_size : 0, + mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, key ? key->mv_size : 0, key ? DKEY(key) : "null")); if (IS_LEAF2(mp)) { @@ -6251,47 +6511,44 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, return MDB_SUCCESS; } + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); if (key != NULL) node_size += key->mv_size; - if (IS_LEAF(mp)) { - assert(data); + mdb_cassert(mc, data); if (F_ISSET(flags, F_BIGDATA)) { /* Data already on overflow page. */ node_size += sizeof(pgno_t); - } else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_nodemax) { + } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); int rc; /* Put data on overflow page. */ DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", data->mv_size, node_size+data->mv_size)); - node_size += sizeof(pgno_t); + node_size = EVEN(node_size + sizeof(pgno_t)); + if ((ssize_t)node_size > room) + goto full; if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); flags |= F_BIGDATA; + goto update; } else { node_size += data->mv_size; } } - node_size += node_size & 1; - - if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { - DPRINTF(("not enough room in page %"Z"u, got %u ptrs", - mp->mp_pgno, NUMKEYS(mp))); - DPRINTF(("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower, - mp->mp_upper - mp->mp_lower)); - DPRINTF(("node size = %"Z"u", node_size)); - return MDB_PAGE_FULL; - } + node_size = EVEN(node_size); + if ((ssize_t)node_size > room) + goto full; +update: /* Move higher pointers up one slot. */ for (i = NUMKEYS(mp); i > indx; i--) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. */ ofs = mp->mp_upper - node_size; - assert(ofs >= mp->mp_lower + sizeof(indx_t)); + mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); mp->mp_ptrs[indx] = ofs; mp->mp_upper = ofs; mp->mp_lower += sizeof(indx_t); @@ -6309,7 +6566,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, memcpy(NODEKEY(node), key->mv_data, key->mv_size); if (IS_LEAF(mp)) { - assert(key); + mdb_cassert(mc, key); if (ofp == NULL) { if (F_ISSET(flags, F_BIGDATA)) memcpy(node->mn_data + key->mv_size, data->mv_data, @@ -6330,34 +6587,38 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, } return MDB_SUCCESS; + +full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mdb_dbg_pgno(mp), NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); + DPRINTF(("node size = %"Z"u", node_size)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_FULL; } /** Delete the specified node from a page. - * @param[in] mp The page to operate on. - * @param[in] indx The index of the node to delete. + * @param[in] mc Cursor pointing to the node to delete. * @param[in] ksize The size of a node. Only used if the page is * part of a #MDB_DUPFIXED database. */ static void -mdb_node_del(MDB_page *mp, indx_t indx, int ksize) +mdb_node_del(MDB_cursor *mc, int ksize) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + indx_t indx = mc->mc_ki[mc->mc_top]; unsigned int sz; indx_t i, j, numkeys, ptr; MDB_node *node; char *base; -#if MDB_DEBUG - { - pgno_t pgno; - COPY_PGNO(pgno, mp->mp_pgno); DPRINTF(("delete node %u on %s page %"Z"u", indx, - IS_LEAF(mp) ? "leaf" : "branch", pgno)); - } -#endif - assert(indx < NUMKEYS(mp)); + IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); + numkeys = NUMKEYS(mp); + mdb_cassert(mc, indx < numkeys); if (IS_LEAF2(mp)) { - int x = NUMKEYS(mp) - 1 - indx; + int x = numkeys - 1 - indx; base = LEAF2KEY(mp, indx, ksize); if (x) memmove(base, base + ksize, x * ksize); @@ -6374,10 +6635,9 @@ mdb_node_del(MDB_page *mp, indx_t indx, int ksize) else sz += NODEDSZ(node); } - sz += sz & 1; + sz = EVEN(sz); ptr = mp->mp_ptrs[indx]; - numkeys = NUMKEYS(mp); for (i = j = 0; i < numkeys; i++) { if (i != indx) { mp->mp_ptrs[j] = mp->mp_ptrs[i]; @@ -6404,25 +6664,22 @@ mdb_node_shrink(MDB_page *mp, indx_t indx) MDB_node *node; MDB_page *sp, *xp; char *base; - int osize, nsize; - int delta; + int nsize, delta; indx_t i, numkeys, ptr; node = NODEPTR(mp, indx); sp = (MDB_page *)NODEDATA(node); - osize = NODEDSZ(node); - - delta = sp->mp_upper - sp->mp_lower; - SETDSZ(node, osize - delta); + delta = SIZELEFT(sp); xp = (MDB_page *)((char *)sp + delta); /* shift subpage upward */ if (IS_LEAF2(sp)) { nsize = NUMKEYS(sp) * sp->mp_pad; + if (nsize & 1) + return; /* do not make the node uneven-sized */ memmove(METADATA(xp), METADATA(sp), nsize); } else { int i; - nsize = osize - sp->mp_upper; numkeys = NUMKEYS(sp); for (i=numkeys-1; i>=0; i--) xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; @@ -6433,6 +6690,9 @@ mdb_node_shrink(MDB_page *mp, indx_t indx) xp->mp_pad = sp->mp_pad; COPY_PGNO(xp->mp_pgno, mp->mp_pgno); + nsize = NODEDSZ(node) - delta; + SETDSZ(node, nsize); + /* shift lower nodes upward */ ptr = mp->mp_ptrs[indx]; numkeys = NUMKEYS(mp); @@ -6491,6 +6751,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; } else { MDB_page *fp = NODEDATA(node); @@ -6503,8 +6764,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_db.md_entries = NUMKEYS(fp); COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; if (mc->mc_db->md_flags & MDB_DUPFIXED) { @@ -6519,11 +6780,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) -#ifdef MISALIGNED_OK - mx->mx_dbx.md_cmp = mdb_cmp_long; -#else - mx->mx_dbx.md_cmp = mdb_cmp_cint; -#endif + mx->mx_dbx.md_cmp = mdb_cmp_clong; #endif } @@ -6543,7 +6800,7 @@ mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) mc->mc_pg[0] = 0; mc->mc_flags = 0; if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - assert(mx != NULL); + mdb_tassert(txn, mx != NULL); mc->mc_xcursor = mx; mdb_xcursor_init0(mc); } else { @@ -6560,7 +6817,7 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) MDB_cursor *mc; size_t size = sizeof(MDB_cursor); - if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + if (!ret || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; if (txn->mt_flags & MDB_TXN_ERROR) @@ -6592,12 +6849,15 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { - if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) + if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi)) return EINVAL; if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) return EINVAL; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); return MDB_SUCCESS; } @@ -6614,6 +6874,9 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) if (mc->mc_xcursor == NULL) return MDB_INCOMPATIBLE; + if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { *countp = 1; @@ -6651,11 +6914,10 @@ mdb_cursor_txn(MDB_cursor *mc) MDB_dbi mdb_cursor_dbi(MDB_cursor *mc) { - assert(mc != NULL); return mc->mc_dbi; } -/** Replace the key for a node with a new key. +/** Replace the key for a branch node with a new key. * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. @@ -6667,7 +6929,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) MDB_node *node; char *base; size_t len; - int delta, delta0; + int delta, ksize, oksize; indx_t ptr, i, numkeys, indx; DKBUF; @@ -6678,7 +6940,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) #if MDB_DEBUG { MDB_val k2; - char kbuf2[(MDB_MAXKEYSIZE*2+1)]; + char kbuf2[DKBUF_MAXKEYSIZE*2+1]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", @@ -6689,19 +6951,19 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) } #endif - delta0 = delta = key->mv_size - node->mn_ksize; + /* Sizes must be 2-byte aligned. */ + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; - /* Must be 2-byte aligned. If new key is - * shorter by 1, the shift will be skipped. - */ - delta += (delta & 1); + /* Shift node contents if EVEN(key length) changed. */ if (delta) { if (delta > 0 && SIZELEFT(mp) < delta) { pgno_t pgno; /* not enough space left, do a delete and split */ DPRINTF(("Not enough room, delta = %d, splitting...", delta)); pgno = NODEPGNO(node); - mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); + mdb_node_del(mc, 0); return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); } @@ -6720,7 +6982,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) } /* But even if no shift was needed, update ksize */ - if (delta0) + if (node->mn_ksize != key->mv_size) node->mn_ksize = key->mv_size; if (key->mv_size) @@ -6752,7 +7014,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) return rc; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); /* fake */ key.mv_size = csrc->mc_db->md_pad; key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); data.mv_size = 0; @@ -6761,14 +7022,16 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) flags = 0; } else { srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); - assert(!((long)srcnode&1)); + mdb_cassert(csrc, !((size_t)srcnode & 1)); srcpg = NODEPGNO(srcnode); flags = srcnode->mn_flags; if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { unsigned int snum = csrc->mc_snum; MDB_node *s2; /* must find the lowest key below src */ - mdb_page_search_lowest(csrc); + rc = mdb_page_search_lowest(csrc); + if (rc) + return rc; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_size = csrc->mc_db->md_pad; key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); @@ -6791,18 +7054,20 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) MDB_node *s2; MDB_val bkey; /* must find the lowest key below dst */ - mdb_page_search_lowest(cdst); - if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { - bkey.mv_size = cdst->mc_db->md_pad; - bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); + mdb_cursor_copy(cdst, &mn); + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_pad; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); } else { - s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); bkey.mv_size = NODEKSZ(s2); bkey.mv_data = NODEKEY(s2); } - cdst->mc_snum = snum--; - cdst->mc_top = snum; - mdb_cursor_copy(cdst, &mn); + mn.mc_snum = snum--; + mn.mc_top = snum; mn.mc_ki[snum] = 0; rc = mdb_update_key(&mn, &bkey); if (rc) @@ -6824,7 +7089,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) /* Delete the node from the source page. */ - mdb_node_del(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); + mdb_node_del(csrc, key.mv_size); { /* Adjust other cursors pointing to mp */ @@ -6872,7 +7137,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) csrc->mc_ki[csrc->mc_top] = 0; rc = mdb_update_key(csrc, &nullkey); csrc->mc_ki[csrc->mc_top] = ix; - assert(rc == MDB_SUCCESS); + mdb_cassert(csrc, rc == MDB_SUCCESS); } } @@ -6900,7 +7165,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) cdst->mc_ki[cdst->mc_top] = 0; rc = mdb_update_key(cdst, &nullkey); cdst->mc_ki[cdst->mc_top] = ix; - assert(rc == MDB_SUCCESS); + mdb_cassert(csrc, rc == MDB_SUCCESS); } } @@ -6913,6 +7178,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) * the \b csrc page will be freed. * @param[in] csrc Cursor pointing to the source page. * @param[in] cdst Cursor pointing to the destination page. + * @return 0 on success, non-zero on failure. */ static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) @@ -6926,8 +7192,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno, cdst->mc_pg[cdst->mc_top]->mp_pgno)); - assert(csrc->mc_snum > 1); /* can't merge root page */ - assert(cdst->mc_snum > 1); + mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdb_cassert(csrc, cdst->mc_snum > 1); /* Mark dst as dirty. */ if ((rc = mdb_page_touch(cdst))) @@ -6949,20 +7215,21 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i); if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned int snum = csrc->mc_snum; + MDB_cursor mn; MDB_node *s2; + mdb_cursor_copy(csrc, &mn); /* must find the lowest key below src */ - mdb_page_search_lowest(csrc); - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_pad; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); key.mv_size = NODEKSZ(s2); key.mv_data = NODEKEY(s2); } - csrc->mc_snum = snum--; - csrc->mc_top = snum; } else { key.mv_size = srcnode->mn_ksize; key.mv_data = NODEKEY(srcnode); @@ -6982,15 +7249,17 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) /* Unlink the src page from parent and add to free list. */ - mdb_node_del(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1], 0); - if (csrc->mc_ki[csrc->mc_top-1] == 0) { + csrc->mc_top--; + mdb_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { key.mv_size = 0; - csrc->mc_top--; rc = mdb_update_key(csrc, &key); - csrc->mc_top++; - if (rc) + if (rc) { + csrc->mc_top++; return rc; + } } + csrc->mc_top++; rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs, csrc->mc_pg[csrc->mc_top]->mp_pgno); @@ -7019,9 +7288,18 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) } } } - mdb_cursor_pop(csrc); - - return mdb_rebalance(csrc); + { + unsigned int snum = cdst->mc_snum; + uint16_t depth = cdst->mc_db->md_depth; + mdb_cursor_pop(cdst); + rc = mdb_rebalance(cdst); + /* Did the tree shrink? */ + if (depth > cdst->mc_db->md_depth) + snum--; + cdst->mc_snum = snum; + cdst->mc_top = snum-1; + } + return rc; } /** Copy the contents of a cursor. @@ -7059,27 +7337,18 @@ mdb_rebalance(MDB_cursor *mc) int rc; unsigned int ptop, minkeys; MDB_cursor mn; + indx_t oldki; minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); -#if MDB_DEBUG - { - pgno_t pgno; - COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno); DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", - pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); - } -#endif if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { -#if MDB_DEBUG - pgno_t pgno; - COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno); DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", - pgno)); -#endif + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); return MDB_SUCCESS; } @@ -7100,6 +7369,7 @@ mdb_rebalance(MDB_cursor *mc) /* Adjust cursors pointing to mp */ mc->mc_snum = 0; mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; { MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; @@ -7113,10 +7383,12 @@ mdb_rebalance(MDB_cursor *mc) if (m3->mc_pg[0] == mp) { m3->mc_snum = 0; m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; } } } } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + int i; DPUTS("collapsing root page!"); rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); if (rc) @@ -7128,6 +7400,10 @@ mdb_rebalance(MDB_cursor *mc) mc->mc_db->md_depth--; mc->mc_db->md_branch_pages--; mc->mc_ki[0] = mc->mc_ki[1]; + for (i = 1; imc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i+1]; + mc->mc_ki[i] = mc->mc_ki[i+1]; + } { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; @@ -7140,10 +7416,12 @@ mdb_rebalance(MDB_cursor *mc) m3 = m2; if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[0] == mp) { - m3->mc_pg[0] = mc->mc_pg[0]; - m3->mc_snum = 1; - m3->mc_top = 0; - m3->mc_ki[0] = m3->mc_ki[1]; + m3->mc_snum--; + m3->mc_top--; + for (i=0; imc_snum; i++) { + m3->mc_pg[i] = m3->mc_pg[i+1]; + m3->mc_ki[i] = m3->mc_ki[i+1]; + } } } } @@ -7156,7 +7434,7 @@ mdb_rebalance(MDB_cursor *mc) * otherwise the tree is invalid. */ ptop = mc->mc_top-1; - assert(NUMKEYS(mc->mc_pg[ptop]) > 1); + mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); /* Leaf page fill factor is below the threshold. * Try to move keys from left or right neighbor, or @@ -7168,6 +7446,7 @@ mdb_rebalance(MDB_cursor *mc) mdb_cursor_copy(mc, &mn); mn.mc_xcursor = NULL; + oldki = mc->mc_ki[mc->mc_top]; if (mc->mc_ki[ptop] == 0) { /* We're the leftmost leaf in our parent. */ @@ -7201,113 +7480,115 @@ mdb_rebalance(MDB_cursor *mc) * (A branch page must never have less than 2 keys.) */ minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) - return mdb_node_move(&mn, mc); - else { - if (mc->mc_ki[ptop] == 0) + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdb_node_move(&mn, mc); + if (mc->mc_ki[ptop]) { + oldki++; + } + } else { + if (mc->mc_ki[ptop] == 0) { rc = mdb_page_merge(&mn, mc); - else { + } else { + oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; rc = mdb_page_merge(mc, &mn); mdb_cursor_copy(&mn, mc); } - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + mc->mc_flags &= ~C_EOF; } + mc->mc_ki[mc->mc_top] = oldki; return rc; } /** Complete a delete operation started by #mdb_cursor_del(). */ static int -mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) +mdb_cursor_del0(MDB_cursor *mc) { int rc; MDB_page *mp; indx_t ki; unsigned int nkeys; - mp = mc->mc_pg[mc->mc_top]; ki = mc->mc_ki[mc->mc_top]; - - /* add overflow pages to free list */ - if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || - (rc = mdb_ovpage_free(mc, omp))) - return rc; - } - mdb_node_del(mp, ki, mc->mc_db->md_pad); + mdb_node_del(mc, mc->mc_db->md_pad); mc->mc_db->md_entries--; rc = mdb_rebalance(mc); - if (rc != MDB_SUCCESS) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - else { - MDB_cursor *m2; + + if (rc == MDB_SUCCESS) { + MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; mp = mc->mc_pg[mc->mc_top]; nkeys = NUMKEYS(mp); /* if mc points past last node in page, find next sibling */ - if (mc->mc_ki[mc->mc_top] >= nkeys) - mdb_cursor_sibling(mc, 1); + if (mc->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(mc, 1); + if (rc == MDB_NOTFOUND) + rc = MDB_SUCCESS; + } /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (m2 == mc) + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; - if (!(m2->mc_flags & C_INITIALIZED)) + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - if (m2->mc_ki[mc->mc_top] >= ki) { - m2->mc_flags |= C_DEL; - if (m2->mc_ki[mc->mc_top] > ki) - m2->mc_ki[mc->mc_top]--; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= ki) { + m3->mc_flags |= C_DEL; + if (m3->mc_ki[mc->mc_top] > ki) + m3->mc_ki[mc->mc_top]--; + } + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) + rc = MDB_SUCCESS; } - if (m2->mc_ki[mc->mc_top] >= nkeys) - mdb_cursor_sibling(m2, 1); } } mc->mc_flags |= C_DEL; } + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return rc; } int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) +{ + if (!key || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + /* must ignore any data */ + data = NULL; + } + + return mdb_del0(txn, dbi, key, data, 0); +} + +static int +mdb_del0(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned flags) { MDB_cursor mc; MDB_xcursor mx; MDB_cursor_op op; MDB_val rdata, *xdata; - int rc, exact; + int rc, exact = 0; DKBUF; - assert(key != NULL); - DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; - - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - if (key->mv_size > MDB_MAXKEYSIZE) { - return MDB_BAD_VALSIZE; - } - mdb_cursor_init(&mc, txn, dbi, &mx); - exact = 0; - if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { - /* must ignore any data */ - data = NULL; - } if (data) { op = MDB_GET_BOTH; rdata = *data; @@ -7315,6 +7596,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, } else { op = MDB_SET; xdata = NULL; + flags |= MDB_NODUPDATA; } rc = mdb_cursor_set(&mc, key, xdata, op, &exact); if (rc == 0) { @@ -7329,7 +7611,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, mc.mc_flags |= C_UNTRACK; mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; - rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); + rc = mdb_cursor_del(&mc, flags); txn->mt_cursors[dbi] = mc.mc_next; } return rc; @@ -7350,24 +7632,26 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno unsigned int nflags) { unsigned int flags; - int rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1, did_split = 0; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; indx_t newindx; pgno_t pgno = 0; - unsigned int i, j, split_indx, nkeys, pmax; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; MDB_node *node; MDB_val sepkey, rkey, xdata, *rdata = &xdata; - MDB_page *copy; + MDB_page *copy = NULL; MDB_page *mp, *rp, *pp; - unsigned int ptop; + int ptop; MDB_cursor mn; DKBUF; mp = mc->mc_pg[mc->mc_top]; newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); - DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i", + DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, - DKEY(newkey), mc->mc_ki[mc->mc_top])); + DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); /* Create a right sibling. */ if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) @@ -7376,7 +7660,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno if (mc->mc_snum < 2) { if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) - return rc; + goto done; /* shift current top to make room for new parent */ mc->mc_pg[1] = mc->mc_pg[0]; mc->mc_ki[1] = mc->mc_ki[0]; @@ -7394,7 +7678,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_ki[0] = mc->mc_ki[1]; mc->mc_db->md_root = mp->mp_pgno; mc->mc_db->md_depth--; - return rc; + goto done; } mc->mc_snum = 2; mc->mc_top = 1; @@ -7414,145 +7698,146 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno sepkey = *newkey; split_indx = newindx; nkeys = 0; - goto newsep; - } + } else { - nkeys = NUMKEYS(mp); - split_indx = nkeys / 2; - if (newindx < split_indx) - newpos = 0; - - if (IS_LEAF2(rp)) { - char *split, *ins; - int x; - unsigned int lsize, rsize, ksize; - /* Move half of the keys to the right sibling */ - copy = NULL; - x = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_pad; - split = LEAF2KEY(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); - mp->mp_lower -= lsize; - rp->mp_lower += lsize; - mp->mp_upper += rsize - lsize; - rp->mp_upper -= rsize - lsize; - sepkey.mv_size = ksize; - if (newindx == split_indx) { - sepkey.mv_data = newkey->mv_data; - } else { - sepkey.mv_data = split; - } - if (x<0) { - ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); - memcpy(rp->mp_ptrs, split, rsize); - sepkey.mv_data = rp->mp_ptrs; - memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->mv_data, ksize); - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); + split_indx = (nkeys+1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned int lsize, rsize, ksize; + /* Move half of the keys to the right sibling */ + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_pad; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x<0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, ksize); + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + mc->mc_pg[mc->mc_top] = rp; + } } else { - if (x) - memcpy(rp->mp_ptrs, split, x * ksize); - ins = LEAF2KEY(rp, x, ksize); - memcpy(ins, newkey->mv_data, ksize); - memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); - rp->mp_lower += sizeof(indx_t); - rp->mp_upper -= ksize - sizeof(indx_t); - mc->mc_ki[mc->mc_top] = x; - mc->mc_pg[mc->mc_top] = rp; - } - goto newsep; - } + int psize, nsize, k; + /* Maximum free space in an empty page */ + pmax = env->me_psize - PAGEHDRSZ; + if (IS_LEAF(mp)) + nsize = mdb_leaf_size(env, newkey, newdata); + else + nsize = mdb_branch_size(env, newkey); + nsize = EVEN(nsize); - /* For leaf pages, check the split point based on what - * fits where, since otherwise mdb_node_add can fail. - * - * This check is only needed when the data items are - * relatively large, such that being off by one will - * make the difference between success or failure. - * - * It's also relevant if a page happens to be laid out - * such that one half of its nodes are all "small" and - * the other half of its nodes are "large." If the new - * item is also "large" and falls on the half with - * "large" nodes, it also may not fit. - */ - if (IS_LEAF(mp)) { - unsigned int psize, nsize; - /* Maximum free space in an empty page */ - pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ; - nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata); - if ((nkeys < 20) || (nsize > pmax/16)) { - if (newindx <= split_indx) { - psize = nsize; - newpos = 0; - for (i=0; imn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - psize += psize & 1; - if (psize > pmax) { - if (i <= newindx) { - split_indx = newindx; - if (i < newindx) - newpos = 1; + /* grab a page to hold a temporary copy */ + copy = mdb_page_malloc(mc->mc_txn, 1); + if (copy == NULL) { + rc = ENOMEM; + goto done; + } + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = PAGEHDRSZ; + copy->mp_upper = env->me_psize; + + /* prepare to insert */ + for (i=0, j=0; imp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + /* When items are relatively large the split point needs + * to be checked, because being off-by-one will make the + * difference between success or failure in mdb_node_add. + * + * It's also relevant if a page happens to be laid out + * such that one half of its nodes are all "small" and + * the other half of its nodes are "large." If the new + * item is also "large" and falls on the half with + * "large" nodes, it also may not fit. + * + * As a final tweak, if the new item goes on the last + * spot on the page (and thus, onto the new page), bias + * the split so the new page is emptier than the old page. + * This yields better packing during sequential inserts. + */ + if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { + /* Find split point */ + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; j = 1; + k = newindx >= nkeys ? nkeys : split_indx+2; + } else { + i = nkeys; j = -1; + k = split_indx-1; + } + for (; i!=k; i+=j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); } - else - split_indx = i; - break; + psize = EVEN(psize); } - } - } else { - psize = nsize; - for (i=nkeys-1; i>=split_indx; i--) { - node = NODEPTR(mp, i); - psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - psize += psize & 1; - if (psize > pmax) { - if (i >= newindx) { - split_indx = newindx; - newpos = 0; - } else - split_indx = i+1; + if (psize > pmax || i == k-j) { + split_indx = i + (j<0); break; } } } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } } } - /* First find the separating key between the split pages. - * The case where newindx == split_indx is ambiguous; the - * new item could go to the new page or stay on the original - * page. If newpos == 1 it goes to the new page. - */ - if (newindx == split_indx && newpos) { - sepkey.mv_size = newkey->mv_size; - sepkey.mv_data = newkey->mv_data; - } else { - node = NODEPTR(mp, split_indx); - sepkey.mv_size = node->mn_ksize; - sepkey.mv_data = NODEKEY(node); - } - -newsep: - DPRINTF(("separator is [%s]", DKEY(&sepkey))); + DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); /* Copy separator key to the parent. */ - if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) { + if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { mn.mc_snum--; mn.mc_top--; did_split = 1; rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); + if (rc) + goto done; /* root split? */ if (mn.mc_snum == mc->mc_snum) { @@ -7574,7 +7859,13 @@ newsep: mc->mc_ki[i] = mn.mc_ki[i]; } mc->mc_pg[ptop] = mn.mc_pg[ptop]; - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + if (mn.mc_ki[ptop]) { + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } else { + /* find right page's left sibling */ + mc->mc_ki[ptop] = mn.mc_ki[ptop]; + mdb_cursor_sibling(mc, 0); + } } } else { mn.mc_top--; @@ -7583,118 +7874,94 @@ newsep: } mc->mc_flags ^= C_SPLITTING; if (rc != MDB_SUCCESS) { - return rc; + goto done; } if (nflags & MDB_APPEND) { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[mc->mc_top] = 0; rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); if (rc) - return rc; + goto done; for (i=0; imc_top; i++) mc->mc_ki[i] = mn.mc_ki[i]; - goto done; - } - if (IS_LEAF2(rp)) { - goto done; - } - - /* Move half of the keys to the right sibling. */ + } else if (!IS_LEAF2(mp)) { + /* Move nodes */ + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + /* Update index for the new key. */ + mc->mc_ki[mc->mc_top] = j; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } - /* grab a page to hold a temporary copy */ - copy = mdb_page_malloc(mc->mc_txn, 1); - if (copy == NULL) - return ENOMEM; + if (!IS_LEAF(mp) && j == 0) { + /* First branch index doesn't need key data. */ + rkey.mv_size = 0; + } - copy->mp_pgno = mp->mp_pgno; - copy->mp_flags = mp->mp_flags; - copy->mp_lower = PAGEHDRSZ; - copy->mp_upper = mc->mc_txn->mt_env->me_psize; - mc->mc_pg[mc->mc_top] = copy; - for (i = j = 0; i <= nkeys; j++) { - if (i == split_indx) { - /* Insert in right sibling. */ - /* Reset insert index for right sibling. */ - if (i != newindx || (newpos ^ ins_new)) { + rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) + goto done; + if (i == nkeys) { + i = 0; j = 0; - mc->mc_pg[mc->mc_top] = rp; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), + env->me_psize - copy->mp_upper); + + /* reset back to original page */ + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + if (nflags & MDB_RESERVE) { + node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); } - } - - if (i == newindx && !ins_new) { - /* Insert the original entry that caused the split. */ - rkey.mv_data = newkey->mv_data; - rkey.mv_size = newkey->mv_size; - if (IS_LEAF(mp)) { - rdata = newdata; - } else - pgno = newpgno; - flags = nflags; - - ins_new = 1; - - /* Update index for the new key. */ - mc->mc_ki[mc->mc_top] = j; - } else if (i == nkeys) { - break; } else { - node = NODEPTR(mp, i); - rkey.mv_data = NODEKEY(node); - rkey.mv_size = node->mn_ksize; - if (IS_LEAF(mp)) { - xdata.mv_data = NODEDATA(node); - xdata.mv_size = NODEDSZ(node); - rdata = &xdata; - } else - pgno = NODEPGNO(node); - flags = node->mn_flags; - - i++; - } - - if (!IS_LEAF(mp) && j == 0) { - /* First branch index doesn't need key data. */ - rkey.mv_size = 0; - } - - rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); - if (rc) break; - } - - nkeys = NUMKEYS(copy); - for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; - mp->mp_lower = copy->mp_lower; - mp->mp_upper = copy->mp_upper; - memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), - mc->mc_txn->mt_env->me_psize - copy->mp_upper); - - /* reset back to original page */ - if (newindx < split_indx || (!newpos && newindx == split_indx)) { - mc->mc_pg[mc->mc_top] = mp; - if (nflags & MDB_RESERVE) { - node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } - } else { - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. - */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; imc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; } } - /* return tmp page to freelist */ - mdb_page_free(mc->mc_txn->mt_env, copy); -done: { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; @@ -7742,6 +8009,13 @@ done: } } } + DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); + +done: + if (copy) /* tmp page */ + mdb_page_free(env, copy); + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return rc; } @@ -7752,10 +8026,7 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_cursor mc; MDB_xcursor mx; - assert(key != NULL); - assert(data != NULL); - - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) @@ -7787,6 +8058,32 @@ mdb_env_get_flags(MDB_env *env, unsigned int *arg) return MDB_SUCCESS; } +int +mdb_env_set_userctx(MDB_env *env, void *ctx) +{ + if (!env) + return EINVAL; + env->me_userctx = ctx; + return MDB_SUCCESS; +} + +void * +mdb_env_get_userctx(MDB_env *env) +{ + return env ? env->me_userctx : NULL; +} + +int +mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) +{ + if (!env) + return EINVAL; +#ifndef NDEBUG + env->me_assert_func = func; +#endif + return MDB_SUCCESS; +} + int mdb_env_get_path(MDB_env *env, const char **arg) { @@ -7797,6 +8094,16 @@ mdb_env_get_path(MDB_env *env, const char **arg) return MDB_SUCCESS; } +int +mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_fd; + return MDB_SUCCESS; +} + /** Common code for #mdb_stat() and #mdb_env_stat(). * @param[in] env the environment to operate in. * @param[in] db the #MDB_db record containing the stats to return. @@ -7978,9 +8285,12 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) { - if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) + if (!arg || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + if (txn->mt_dbflags[dbi] & DB_STALE) { MDB_cursor mc; MDB_xcursor mx; @@ -8005,7 +8315,7 @@ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) { /* We could return the flags for the FREE_DBI too but what's the point? */ - if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; return MDB_SUCCESS; @@ -8045,22 +8355,22 @@ mdb_drop0(MDB_cursor *mc, int subs) memcpy(&pg, NODEDATA(ni), sizeof(pg)); rc = mdb_page_get(txn, pg, &omp, NULL); if (rc != 0) - return rc; - assert(IS_OVERFLOW(omp)); + goto done; + mdb_cassert(mc, IS_OVERFLOW(omp)); rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, omp->mp_pages); if (rc) - return rc; + goto done; } else if (subs && (ni->mn_flags & F_SUBDATA)) { mdb_xcursor_init1(mc, ni); rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); if (rc) - return rc; + goto done; } } } else { if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) - return rc; + goto done; for (i=0; imc_ki[mc->mc_top] = i; rc = mdb_cursor_sibling(mc, 1); if (rc) { + if (rc != MDB_NOTFOUND) + goto done; /* no more siblings, go back to beginning * of previous level. */ @@ -8087,6 +8399,9 @@ mdb_drop0(MDB_cursor *mc, int subs) } /* free it */ rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); +done: + if (rc) + txn->mt_flags |= MDB_TXN_ERROR; } else if (rc == MDB_NOTFOUND) { rc = MDB_SUCCESS; } @@ -8098,7 +8413,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) MDB_cursor *mc, *m2; int rc; - if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) + if ((unsigned)del > 1 || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) @@ -8117,10 +8432,12 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) /* Can't delete the main DB */ if (del && dbi > MAIN_DBI) { - rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); + rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, 0); if (!rc) { txn->mt_dbflags[dbi] = DB_STALE; mdb_dbi_close(txn->mt_env, dbi); + } else { + txn->mt_flags |= MDB_TXN_ERROR; } } else { /* reset the DB record, mark it dirty */ @@ -8141,7 +8458,7 @@ leave: int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_cmp = cmp; @@ -8150,7 +8467,7 @@ int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_dcmp = cmp; @@ -8159,7 +8476,7 @@ int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_rel = rel; @@ -8168,7 +8485,7 @@ int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_relctx = ctx; @@ -8177,7 +8494,7 @@ int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) int mdb_env_get_maxkeysize(MDB_env *env) { - return MDB_MAXKEYSIZE; + return ENV_MAXKEY(env); } int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) @@ -8185,7 +8502,7 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) unsigned int i, rdrs; MDB_reader *mr; char buf[64]; - int first = 1; + int rc = 0, first = 1; if (!env || !func) return -1; @@ -8196,33 +8513,31 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) mr = env->me_txns->mti_readers; for (i=0; i 0 ) { ++cursor; } @@ -8262,7 +8577,7 @@ int mdb_reader_check(MDB_env *env, int *dead) { unsigned int i, j, rdrs; MDB_reader *mr; - pid_t *pids, pid; + MDB_PID_T *pids, pid; int count = 0; if (!env) @@ -8272,12 +8587,11 @@ int mdb_reader_check(MDB_env *env, int *dead) if (!env->me_txns) return MDB_SUCCESS; rdrs = env->me_txns->mti_numreaders; - pids = malloc((rdrs+1) * sizeof(pid_t)); + pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); if (!pids) return ENOMEM; pids[0] = 0; mr = env->me_txns->mti_readers; - j = 0; for (i=0; ime_pid) { pid = mr[i].mr_pid;