X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblmdb%2Fmdb.c;h=0bff10e75933687912cb62909bf0f3e6bbac2efd;hb=17aab561f28676f1dc74d8963492e282039b83af;hp=7ba1a529f74d49555d91984c6f93915222df76c9;hpb=70788bfe45b32c32fb1bc23fdfe84b325d7de836;p=openldap diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 7ba1a529f7..0bff10e759 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -96,7 +96,13 @@ extern int cacheflush(char *addr, int nbytes, int cache); #include #include #include + +#ifdef _MSC_VER +#include +typedef SSIZE_T ssize_t; +#else #include +#endif #if defined(__sun) || defined(ANDROID) /* Most platforms have posix_memalign, older may only have memalign */ @@ -232,7 +238,25 @@ extern int cacheflush(char *addr, int nbytes, int cache); #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ #endif -#ifdef MDB_OWNERDEAD + +/** Some platforms define the EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDB_USE_ROBUST=0, or use some other + * mechanism like -DMDB_USE_SYSV_SEM instead of + * -DMDB_USE_POSIX_MUTEX. (SysV semaphores are + * also Robust, but some systems don't support them + * either.) + */ +#ifndef MDB_USE_ROBUST +/* Android currently lacks Robust Mutex support */ +#if defined(ANDROID) && defined(MDB_USE_POSIX_MUTEX) && !defined(MDB_USE_ROBUST) +#define MDB_USE_ROBUST 0 +#else +#define MDB_USE_ROBUST 1 +#endif +#endif /* MDB_USE_ROBUST */ + +#if defined(MDB_OWNERDEAD) && MDB_USE_ROBUST #define MDB_ROBUST_SUPPORTED 1 #endif @@ -943,6 +967,11 @@ typedef struct MDB_db { #define FREE_DBI 0 /** Handle for the default DB. */ #define MAIN_DBI 1 + /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 + + /** Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 2 /** Meta page content. * A meta page is the start point for accessing a database snapshot. @@ -956,11 +985,11 @@ typedef struct MDB_meta { uint32_t mm_version; void *mm_address; /**< address for fixed mapping */ size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */ + MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ /** The size of pages used in this DB */ -#define mm_psize mm_dbs[0].md_pad +#define mm_psize mm_dbs[FREE_DBI].md_pad /** Any persistent environment flags. @ref mdb_env */ -#define mm_flags mm_dbs[0].md_flags +#define mm_flags mm_dbs[FREE_DBI].md_flags pgno_t mm_last_pg; /**< last used page in file */ volatile txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; @@ -995,7 +1024,8 @@ typedef struct MDB_dbx { */ struct MDB_txn { MDB_txn *mt_parent; /**< parent of a nested txn */ - MDB_txn *mt_child; /**< nested txn under this txn */ + /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ + MDB_txn *mt_child; pgno_t mt_next_pgno; /**< next unallocated page */ /** The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction @@ -1037,13 +1067,15 @@ struct MDB_txn { #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ +#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; - /** Number of DB records in use. This number only ever increments; - * we don't decrement it when individual DB handles are closed. + /** Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. */ MDB_dbi mt_numdbs; @@ -1056,9 +1088,13 @@ struct MDB_txn { #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ /* internal txn flags */ #define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ +#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ #define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ +#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ + /** most operations on the txn are currently illegal */ +#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) /** @} */ unsigned int mt_flags; /**< @ref mdb_txn */ /** #dirty_list room: Array size - \#dirty pages visible to this txn. @@ -1168,7 +1204,7 @@ struct MDB_env { char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ - MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ + MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn0; /**< prealloc'd write transaction */ @@ -1227,8 +1263,8 @@ typedef struct MDB_ntxn { #define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) /** Check \b txn and \b dbi arguments to a function */ -#define TXN_DBI_EXIST(txn, dbi) \ - ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID)) +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) /** Check for misused \b dbi handles */ #define TXN_DBI_CHANGED(txn, dbi) \ @@ -1238,6 +1274,19 @@ static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); static int mdb_page_touch(MDB_cursor *mc); +#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ + "reset-tmp", "fail-begin", "fail-beginchild"} +enum { + /* mdb_txn_end operation number, for logging */ + MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, + MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD +}; +#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ +#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ +static void mdb_txn_end(MDB_txn *txn, unsigned mode); + static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); @@ -1254,7 +1303,7 @@ static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned int nflags); static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); -static int mdb_env_pick_meta(const MDB_env *env); +static MDB_meta *mdb_env_pick_meta(const MDB_env *env); static int mdb_env_write_meta(MDB_txn *txn); #ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */ # define mdb_env_close0(env, excl) mdb_env_close1(env) @@ -1314,7 +1363,7 @@ static int mdb_sec_inited; #endif /** Return the library version info. */ -char * +char * ESECT mdb_version(int *major, int *minor, int *patch) { if (major) *major = MDB_VERSION_MAJOR; @@ -1342,7 +1391,7 @@ static char *const mdb_errstr[] = { "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", - "MDB_BAD_TXN: Transaction cannot recover - it must be aborted", + "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", }; @@ -1386,7 +1435,7 @@ mdb_strerror(int err) ; } buf[0] = 0; - FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err, 0, ptr, sizeof(buf), (va_list *)pad); return ptr; @@ -1398,7 +1447,7 @@ mdb_strerror(int err) /** assert(3) variant in cursor context */ #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) /** assert(3) variant in transaction context */ -#define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr) +#define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) /** assert(3) variant in environment context */ #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) @@ -1406,7 +1455,7 @@ mdb_strerror(int err) # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) -static void +static void ESECT mdb_assert_fail(MDB_env *env, const char *expr_txt, const char *func, const char *file, int line) { @@ -1604,9 +1653,10 @@ static void mdb_audit(MDB_txn *txn) mdb_tassert(txn, rc == MDB_NOTFOUND); } } - if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) { + if (freecount + count + NUM_METAS != txn->mt_next_pgno) { fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", - txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno); + txn->mt_txnid, freecount, count+NUM_METAS, + freecount+count+NUM_METAS, txn->mt_next_pgno); } } #endif @@ -1875,7 +1925,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) /* Estimate how much space this op will take */ i = m0->mc_db->md_depth; /* Named DBs also dirty the main DB */ - if (m0->mc_dbi > MAIN_DBI) + if (m0->mc_dbi >= CORE_DBS) i += txn->mt_dbs[MAIN_DBI].md_depth; /* For puts, roughly factor in the key+data size */ if (key) @@ -2483,12 +2533,6 @@ mdb_cursors_close(MDB_txn *txn, unsigned merge) } } -#if !(MDB_DEBUG) -#define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn) -#endif -static void -mdb_txn_reset0(MDB_txn *txn, const char *act); - #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ enum Pidlock_op { Pidset, Pidcheck @@ -2559,7 +2603,7 @@ mdb_txn_renew0(MDB_txn *txn) if ((flags &= MDB_TXN_RDONLY) != 0) { if (!ti) { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; + meta = mdb_env_pick_meta(env); txn->mt_txnid = meta->mm_txnid; txn->mt_u.reader = NULL; } else { @@ -2619,7 +2663,7 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_u.reader = r; meta = env->me_metas[txn->mt_txnid & 1]; } - txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + } else { /* Not yet touching txn == env->me_txn0, it may be active */ if (ti) { @@ -2628,7 +2672,7 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_txnid = ti->mti_txnid; meta = env->me_metas[txn->mt_txnid & 1]; } else { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; + meta = mdb_env_pick_meta(env); txn->mt_txnid = meta->mm_txnid; } txn->mt_txnid++; @@ -2650,7 +2694,7 @@ mdb_txn_renew0(MDB_txn *txn) } /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_last_pg+1; @@ -2659,23 +2703,24 @@ mdb_txn_renew0(MDB_txn *txn) /* Setup db info */ txn->mt_numdbs = env->me_numdbs; - for (i=2; imt_numdbs; i++) { + for (i=CORE_DBS; imt_numdbs; i++) { x = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; - txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0; } - txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; + txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; + txn->mt_dbflags[FREE_DBI] = DB_VALID; - if (env->me_maxpg < txn->mt_next_pgno) { - mdb_txn_reset0(txn, "renew0-mapfail"); - if (new_notls) { - txn->mt_u.reader->mr_pid = 0; - txn->mt_u.reader = NULL; - } - return MDB_MAP_RESIZED; + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + rc = MDB_PANIC; + } else if (env->me_maxpg < txn->mt_next_pgno) { + rc = MDB_MAP_RESIZED; + } else { + return MDB_SUCCESS; } - - return MDB_SUCCESS; + mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); + return rc; } int @@ -2683,14 +2728,9 @@ mdb_txn_renew(MDB_txn *txn) { int rc; - if (!txn || txn->mt_dbxs) /* A reset txn has mt_dbxs==NULL */ + if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) return EINVAL; - if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - return MDB_PANIC; - } - rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", @@ -2710,57 +2750,42 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) flags &= MDB_TXN_BEGIN_FLAGS; flags |= env->me_flags & MDB_WRITEMAP; - if (env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - return MDB_PANIC; - } if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ return EACCES; - size = tsize = sizeof(MDB_txn); if (parent) { /* Nested transactions: Max 1 child, write txns only, no writemap */ flags |= parent->mt_flags; - if (parent->mt_child || - (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_ERROR))) - { + if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; } /* Child txns save MDB_pgstate and use own copy of cursors */ - size = tsize = sizeof(MDB_ntxn); - size += env->me_maxdbs * sizeof(MDB_cursor *); - } else if (!(flags & MDB_RDONLY)) { + size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); + size += tsize = sizeof(MDB_ntxn); + } else if (flags & MDB_RDONLY) { + size = env->me_maxdbs * (sizeof(MDB_db)+1); + size += tsize = sizeof(MDB_txn); + } else { /* Reuse preallocated write txn. However, do not touch it until * mdb_txn_renew0() succeeds, since it currently may be active. */ txn = env->me_txn0; goto renew; } - size += env->me_maxdbs * (sizeof(MDB_db)+1); - if ((txn = calloc(1, size)) == NULL) { DPRINTF(("calloc: %s", strerror(errno))); return ENOMEM; } + txn->mt_dbxs = env->me_dbxs; /* static */ txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); - if (flags & MDB_RDONLY) { - txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = env->me_dbiseqs; - } else { - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - if (parent) { - txn->mt_dbiseqs = parent->mt_dbiseqs; - txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs); - } else { - txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); - } - } + txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; txn->mt_flags = flags; txn->mt_env = env; if (parent) { unsigned int i; + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = parent->mt_dbiseqs; txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); if (!txn->mt_u.dirty_list || !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) @@ -2774,10 +2799,10 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_u.dirty_list[0].mid = 0; txn->mt_spill_pgs = NULL; txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_flags |= MDB_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; - txn->mt_dbxs = parent->mt_dbxs; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ for (i=0; imt_numdbs; i++) @@ -2796,8 +2821,9 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) if (!rc) rc = mdb_cursor_shadow(parent, txn); if (rc) - mdb_txn_reset0(txn, "beginchild-fail"); - } else { + mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); + } else { /* MDB_RDONLY */ + txn->mt_dbiseqs = env->me_dbiseqs; renew: rc = mdb_txn_renew0(txn); } @@ -2805,7 +2831,7 @@ renew: if (txn != env->me_txn0) free(txn); } else { - txn->mt_flags |= flags; /* for txn==me_txn0, no effect otherwise */ + txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ *ret = txn; DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', @@ -2838,7 +2864,7 @@ mdb_dbis_update(MDB_txn *txn, int keep) MDB_env *env = txn->mt_env; unsigned char *tdbflags = txn->mt_dbflags; - for (i = n; --i >= 2;) { + for (i = n; --i >= CORE_DBS;) { if (tdbflags[i] & DB_NEW) { if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; @@ -2858,39 +2884,52 @@ mdb_dbis_update(MDB_txn *txn, int keep) env->me_numdbs = n; } -/** Common code for #mdb_txn_reset() and #mdb_txn_abort(). +/** End a transaction, except successful commit of a nested transaction. * May be called twice for readonly txns: First reset it, then abort. - * @param[in] txn the transaction handle to reset - * @param[in] act why the transaction is being reset + * @param[in] txn the transaction handle to end + * @param[in] mode why and how to end the transaction */ static void -mdb_txn_reset0(MDB_txn *txn, const char *act) +mdb_txn_end(MDB_txn *txn, unsigned mode) { MDB_env *env = txn->mt_env; +#if MDB_DEBUG + static const char *const names[] = MDB_END_NAMES; +#endif - /* Close any DBI handles opened in this txn */ - mdb_dbis_update(txn, 0); + /* Export or close DBI handles opened in this txn */ + mdb_dbis_update(txn, mode & MDB_END_UPDATE); DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + names[mode & MDB_END_OPMASK], + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { if (txn->mt_u.reader) { txn->mt_u.reader->mr_txnid = (txnid_t)-1; - if (!(env->me_flags & MDB_NOTLS)) + if (!(env->me_flags & MDB_NOTLS)) { txn->mt_u.reader = NULL; /* txn does not own reader */ + } else if (mode & MDB_END_SLOT) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } /* else txn owns the slot until it does MDB_END_SLOT */ } - txn->mt_numdbs = 0; /* close nothing if called again */ - txn->mt_dbxs = NULL; /* mark txn as reset */ - } else { + txn->mt_numdbs = 0; /* prevent further DBI activity */ + txn->mt_flags |= MDB_TXN_FINISHED; + + } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { pgno_t *pghead = env->me_pghead; - mdb_cursors_close(txn, 0); + if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ + mdb_cursors_close(txn, 0); if (!(env->me_flags & MDB_WRITEMAP)) { mdb_dlist_free(txn); } + txn->mt_numdbs = 0; + txn->mt_flags = MDB_TXN_FINISHED; + if (!txn->mt_parent) { mdb_midl_shrink(&txn->mt_free_pgs); env->me_free_pgs = txn->mt_free_pgs; @@ -2899,11 +2938,14 @@ mdb_txn_reset0(MDB_txn *txn, const char *act) env->me_pglast = 0; env->me_txn = NULL; + mode = 0; /* txn == env->me_txn0, do not free() it */ + /* The writer mutex was locked in mdb_txn_begin. */ if (env->me_txns) UNLOCK_MUTEX(env->me_wmutex); } else { txn->mt_parent->mt_child = NULL; + txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; mdb_midl_free(txn->mt_free_pgs); mdb_midl_free(txn->mt_spill_pgs); @@ -2912,6 +2954,9 @@ mdb_txn_reset0(MDB_txn *txn, const char *act) mdb_midl_free(pghead); } + + if (mode & MDB_END_FREE) + free(txn); } void @@ -2924,7 +2969,7 @@ mdb_txn_reset(MDB_txn *txn) if (!(txn->mt_flags & MDB_TXN_RDONLY)) return; - mdb_txn_reset0(txn, "reset"); + mdb_txn_end(txn, MDB_END_RESET); } void @@ -2936,13 +2981,7 @@ mdb_txn_abort(MDB_txn *txn) if (txn->mt_child) mdb_txn_abort(txn->mt_child); - mdb_txn_reset0(txn, "abort"); - /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */ - if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) - txn->mt_u.reader->mr_pid = 0; - - if (txn != txn->mt_env->me_txn0) - free(txn); + mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); } /** Save the freelist as of this transaction to the freeDB. @@ -3293,15 +3332,17 @@ int mdb_txn_commit(MDB_txn *txn) { int rc; - unsigned int i; + unsigned int i, end_mode; MDB_env *env; - if (txn == NULL || txn->mt_env == NULL) + if (txn == NULL) return EINVAL; + /* mdb_txn_end() mode for a commit which writes nothing */ + end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; + if (txn->mt_child) { rc = mdb_txn_commit(txn->mt_child); - txn->mt_child = NULL; if (rc) goto fail; } @@ -3309,14 +3350,11 @@ mdb_txn_commit(MDB_txn *txn) env = txn->mt_env; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - mdb_dbis_update(txn, 1); - txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */ - mdb_txn_abort(txn); - return MDB_SUCCESS; + goto done; } - if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { - DPUTS("error flag is set, can't commit"); + if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { + DPUTS("txn has failed/finished, can't commit"); if (txn->mt_parent) txn->mt_parent->mt_flags |= MDB_TXN_ERROR; rc = MDB_BAD_TXN; @@ -3348,9 +3386,9 @@ mdb_txn_commit(MDB_txn *txn) /* Update parent's DB table. */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); parent->mt_numdbs = txn->mt_numdbs; - parent->mt_dbflags[0] = txn->mt_dbflags[0]; - parent->mt_dbflags[1] = txn->mt_dbflags[1]; - for (i=2; imt_numdbs; i++) { + parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; + parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; + for (i=CORE_DBS; imt_numdbs; i++) { /* preserve parent's DB_NEW status */ x = parent->mt_dbflags[i] & DB_NEW; parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; @@ -3451,14 +3489,14 @@ mdb_txn_commit(MDB_txn *txn) txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); /* Update DB root pointers */ - if (txn->mt_numdbs > 2) { + if (txn->mt_numdbs > CORE_DBS) { MDB_cursor mc; MDB_dbi i; MDB_val data; data.mv_size = sizeof(MDB_db); mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - for (i = 2; i < txn->mt_numdbs; i++) { + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { if (TXN_DBI_CHANGED(txn, i)) { rc = MDB_BAD_DBI; @@ -3480,7 +3518,6 @@ mdb_txn_commit(MDB_txn *txn) mdb_midl_free(env->me_pghead); env->me_pghead = NULL; mdb_midl_shrink(&txn->mt_free_pgs); - env->me_free_pgs = txn->mt_free_pgs; #if (MDB_DEBUG) > 2 mdb_audit(txn); @@ -3490,21 +3527,10 @@ mdb_txn_commit(MDB_txn *txn) (rc = mdb_env_sync(env, 0)) || (rc = mdb_env_write_meta(txn))) goto fail; - - /* Free P_LOOSE pages left behind in dirty_list */ - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dlist_free(txn); + end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; done: - env->me_pglast = 0; - env->me_txn = NULL; - mdb_dbis_update(txn, 1); - - if (env->me_txns) - UNLOCK_MUTEX(env->me_wmutex); - if (txn != env->me_txn0) - free(txn); - + mdb_txn_end(txn, end_mode); return MDB_SUCCESS; fail: @@ -3531,7 +3557,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) * Read both meta pages so we can use the latest one. */ - for (i=off=0; i<2; i++, off = meta->mm_psize) { + for (i=off=0; imm_psize) { #ifdef _WIN32 DWORD len; OVERLAPPED ov; @@ -3584,11 +3610,11 @@ mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) meta->mm_version = MDB_DATA_VERSION; meta->mm_mapsize = env->me_mapsize; meta->mm_psize = env->me_psize; - meta->mm_last_pg = 1; + meta->mm_last_pg = NUM_METAS-1; meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; - meta->mm_dbs[0].md_root = P_INVALID; - meta->mm_dbs[1].md_root = P_INVALID; + meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; } /** Write the environment parameters of a freshly created DB environment. @@ -3621,7 +3647,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) psize = env->me_psize; - p = calloc(2, psize); + p = calloc(NUM_METAS, psize); if (!p) return ENOMEM; @@ -3634,10 +3660,10 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) q->mp_flags = P_META; *(MDB_meta *)METADATA(q) = *meta; - DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); + DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); if (!rc) rc = ErrCode(); - else if ((unsigned) len == psize * 2) + else if ((unsigned) len == psize * NUM_METAS) rc = MDB_SUCCESS; else rc = ENOSPC; @@ -3680,8 +3706,8 @@ mdb_env_write_meta(MDB_txn *txn) if (flags & MDB_WRITEMAP) { mp->mm_mapsize = mapsize; - mp->mm_dbs[0] = txn->mt_dbs[0]; - mp->mm_dbs[1] = txn->mt_dbs[1]; + mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; mp->mm_last_pg = txn->mt_next_pgno - 1; #if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ !(defined(__i386__) || defined(__x86_64__)) @@ -3692,15 +3718,12 @@ mdb_env_write_meta(MDB_txn *txn) if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { unsigned meta_size = env->me_psize; rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - ptr = env->me_map; - if (toggle) { + ptr = (char *)mp - PAGEHDRSZ; #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ - if (meta_size < env->me_os_psize) - meta_size += meta_size; - else + r2 = (ptr - env->me_map) & (env->me_os_psize - 1); + ptr -= r2; + meta_size += r2; #endif - ptr += meta_size; - } if (MDB_MSYNC(ptr, meta_size, rc)) { rc = ErrCode(); goto fail; @@ -3708,21 +3731,19 @@ mdb_env_write_meta(MDB_txn *txn) } goto done; } - metab.mm_txnid = env->me_metas[toggle]->mm_txnid; - metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + metab.mm_txnid = mp->mm_txnid; + metab.mm_last_pg = mp->mm_last_pg; meta.mm_mapsize = mapsize; - meta.mm_dbs[0] = txn->mt_dbs[0]; - meta.mm_dbs[1] = txn->mt_dbs[1]; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_last_pg = txn->mt_next_pgno - 1; meta.mm_txnid = txn->mt_txnid; off = offsetof(MDB_meta, mm_mapsize); ptr = (char *)&meta + off; len = sizeof(MDB_meta) - off; - if (toggle) - off += env->me_psize; - off += PAGEHDRSZ; + off += (char *)mp - env->me_map; /* Write to the SYNC fd */ mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; @@ -3779,12 +3800,13 @@ done: /** Check both meta pages to see which one is newer. * @param[in] env the environment handle - * @return meta toggle (0 or 1). + * @return newest #MDB_meta. */ -static int +static MDB_meta * mdb_env_pick_meta(const MDB_env *env) { - return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); + MDB_meta *const *metas = env->me_metas; + return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; } int ESECT @@ -3797,7 +3819,7 @@ mdb_env_create(MDB_env **env) return ENOMEM; e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = 2; + e->me_maxdbs = e->me_numdbs = CORE_DBS; e->me_fd = INVALID_HANDLE_VALUE; e->me_lfd = INVALID_HANDLE_VALUE; e->me_mfd = INVALID_HANDLE_VALUE; @@ -3908,7 +3930,7 @@ mdb_env_set_mapsize(MDB_env *env, size_t size) void *old; if (env->me_txn) return EINVAL; - meta = env->me_metas[mdb_env_pick_meta(env)]; + meta = mdb_env_pick_meta(env); if (!size) size = meta->mm_mapsize; { @@ -3935,7 +3957,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { if (env->me_map) return EINVAL; - env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ + env->me_maxdbs = dbs + CORE_DBS; return MDB_SUCCESS; } @@ -4115,12 +4137,12 @@ mdb_env_open2(MDB_env *env) #if MDB_DEBUG { - int toggle = mdb_env_pick_meta(env); - MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI]; + MDB_meta *meta = mdb_env_pick_meta(env); + MDB_db *db = &meta->mm_dbs[MAIN_DBI]; DPRINTF(("opened database version %u, pagesize %u", - env->me_metas[0]->mm_version, env->me_psize)); - DPRINTF(("using meta page %d", toggle)); + meta->mm_version, env->me_psize)); + DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); DPRINTF(("depth: %u", db->md_depth)); DPRINTF(("entries: %"Z"u", db->md_entries)); DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); @@ -4207,9 +4229,10 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; static int ESECT mdb_env_share_locks(MDB_env *env, int *excl) { - int rc = 0, toggle = mdb_env_pick_meta(env); + int rc = 0; + MDB_meta *meta = mdb_env_pick_meta(env); - env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; + env->me_txns->mti_txnid = meta->mm_txnid; #ifdef _WIN32 { @@ -4358,7 +4381,7 @@ mdb_hash_val(MDB_val *val, mdb_hash_t hval) */ static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; -static void +static void ESECT mdb_pack85(unsigned long l, char *out) { int i; @@ -4369,7 +4392,7 @@ mdb_pack85(unsigned long l, char *out) } } -static void +static void ESECT mdb_hash_enc(MDB_val *val, char *encbuf) { mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); @@ -4405,7 +4428,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) off_t size, rsize; #ifdef _WIN32 - env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, + env->me_lfd = CreateFileA(lpath, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); #else @@ -4509,9 +4532,9 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) mdb_hash_enc(&val, encbuf); sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); - env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); if (!env->me_wmutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) struct stat stbuf; @@ -4583,9 +4606,9 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) goto fail; } #ifdef _WIN32 - env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); + env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); if (!env->me_wmutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); @@ -4688,7 +4711,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode len = OPEN_ALWAYS; } mode = FILE_ATTRIBUTE_NORMAL; - env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + env->me_fd = CreateFileA(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, mode, NULL); #else if (F_ISSET(flags, MDB_RDONLY)) @@ -4718,7 +4741,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode */ #ifdef _WIN32 len = OPEN_EXISTING; - env->me_mfd = CreateFile(dpath, oflags, + env->me_mfd = CreateFileA(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, mode | FILE_FLAG_WRITE_THROUGH, NULL); #else @@ -4749,6 +4772,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDB_TXN_FINISHED; env->me_txn0 = txn; } else { rc = ENOMEM; @@ -4775,7 +4799,7 @@ mdb_env_close0(MDB_env *env, int excl) /* Doing this here since me_dbxs may not exist during mdb_env_close */ if (env->me_dbxs) { - for (i = env->me_maxdbs; --i > MAIN_DBI; ) + for (i = env->me_maxdbs; --i >= CORE_DBS; ) free(env->me_dbxs[i].md_name.mv_data); free(env->me_dbxs); } @@ -5092,15 +5116,12 @@ static void mdb_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { -#if MDB_DEBUG - MDB_page *top = mc->mc_pg[mc->mc_top]; -#endif + DPRINTF(("popping page %"Z"u off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); + mc->mc_snum--; if (mc->mc_snum) mc->mc_top--; - - DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, - DDBI(mc), (void *) mc)); } } @@ -5294,8 +5315,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) /* Make sure the txn is still viable, then find the root from * the txn's db table and set it as the root of the cursor's stack. */ - if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { - DPUTS("transaction has failed, must abort"); + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) { + DPUTS("transaction may not be used now"); return MDB_BAD_TXN; } else { /* Make sure we're using an up-to-date root */ @@ -5479,10 +5500,10 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); - if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) + if (txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; mdb_cursor_init(&mc, txn, dbi, &mx); @@ -6003,7 +6024,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (mc == NULL) return EINVAL; - if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; switch (op) { @@ -6174,7 +6195,7 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -6233,7 +6254,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, nospill = flags & MDB_NOSPILL; flags &= ~MDB_NOSPILL; - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; if (key->mv_size-1 >= ENV_MAXKEY(env)) @@ -6726,7 +6747,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) MDB_page *mp; int rc; - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; if (!(mc->mc_flags & C_INITIALIZED)) @@ -6935,6 +6956,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_node *node; MDB_page *mp = mc->mc_pg[mc->mc_top]; MDB_page *ofp = NULL; /* overflow page */ + void *ndata; DKBUF; mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); @@ -6965,7 +6987,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, if (key != NULL) node_size += key->mv_size; if (IS_LEAF(mp)) { - mdb_cassert(mc, data); + mdb_cassert(mc, key && data); if (F_ISSET(flags, F_BIGDATA)) { /* Data already on overflow page. */ node_size += sizeof(pgno_t); @@ -7016,23 +7038,21 @@ update: memcpy(NODEKEY(node), key->mv_data, key->mv_size); if (IS_LEAF(mp)) { - mdb_cassert(mc, key); + ndata = NODEDATA(node); if (ofp == NULL) { if (F_ISSET(flags, F_BIGDATA)) - memcpy(node->mn_data + key->mv_size, data->mv_data, - sizeof(pgno_t)); + memcpy(ndata, data->mv_data, sizeof(pgno_t)); else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = node->mn_data + key->mv_size; + data->mv_data = ndata; else - memcpy(node->mn_data + key->mv_size, data->mv_data, - data->mv_size); + memcpy(ndata, data->mv_data, data->mv_size); } else { - memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, - sizeof(pgno_t)); + memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); + ndata = METADATA(ofp); if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = METADATA(ofp); + data->mv_data = ndata; else - memcpy(METADATA(ofp), data->mv_data, data->mv_size); + memcpy(ndata, data->mv_data, data->mv_size); } } @@ -7114,45 +7134,38 @@ mdb_node_shrink(MDB_page *mp, indx_t indx) MDB_node *node; MDB_page *sp, *xp; char *base; - int nsize, delta; - indx_t i, numkeys, ptr; + indx_t delta, nsize, len, ptr; + int i; node = NODEPTR(mp, indx); sp = (MDB_page *)NODEDATA(node); delta = SIZELEFT(sp); - xp = (MDB_page *)((char *)sp + delta); + nsize = NODEDSZ(node) - delta; - /* shift subpage upward */ + /* Prepare to shift upward, set len = length(subpage part to shift) */ if (IS_LEAF2(sp)) { - nsize = NUMKEYS(sp) * sp->mp_pad; + len = nsize; if (nsize & 1) return; /* do not make the node uneven-sized */ - memmove(METADATA(xp), METADATA(sp), nsize); } else { - int i; - numkeys = NUMKEYS(sp); - for (i=numkeys-1; i>=0; i--) + xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ + for (i = NUMKEYS(sp); --i >= 0; ) xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; + len = PAGEHDRSZ; } - xp->mp_upper = sp->mp_lower; - xp->mp_lower = sp->mp_lower; - xp->mp_flags = sp->mp_flags; - xp->mp_pad = sp->mp_pad; - COPY_PGNO(xp->mp_pgno, mp->mp_pgno); - - nsize = NODEDSZ(node) - delta; + sp->mp_upper = sp->mp_lower; + COPY_PGNO(sp->mp_pgno, mp->mp_pgno); SETDSZ(node, nsize); - /* shift lower nodes upward */ + /* Shift upward */ + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + delta, base, (char *)sp + len - base); + ptr = mp->mp_ptrs[indx]; - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { + for (i = NUMKEYS(mp); --i >= 0; ) { if (mp->mp_ptrs[i] <= ptr) mp->mp_ptrs[i] += delta; } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node)); mp->mp_upper += delta; } @@ -7227,7 +7240,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) } DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) mx->mx_dbx.md_cmp = mdb_cmp_clong; @@ -7300,14 +7313,13 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) MDB_cursor *mc; size_t size = sizeof(MDB_cursor); - if (!ret || !TXN_DBI_EXIST(txn, dbi)) + if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) + if (txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; - /* Allow read access to the freelist */ - if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) return EINVAL; if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) @@ -7332,13 +7344,13 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { - if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi)) + if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) return EINVAL; if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) + if (txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); @@ -7357,7 +7369,7 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) if (mc->mc_xcursor == NULL) return MDB_INCOMPATIBLE; - if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; if (!(mc->mc_flags & C_INITIALIZED)) @@ -7782,6 +7794,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = csrc->mc_dbi; + unsigned int top = csrc->mc_top; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) @@ -7790,9 +7803,10 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3 = m2; if (m3 == csrc) continue; if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[csrc->mc_top] == psrc) { - m3->mc_pg[csrc->mc_top] = pdst; - m3->mc_ki[csrc->mc_top] += nkeys; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + m3->mc_ki[top] += nkeys; + m3->mc_ki[top-1] = cdst->mc_ki[top-1]; } } } @@ -8104,10 +8118,10 @@ int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { - if (!key || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { @@ -8284,6 +8298,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_upper -= ksize - sizeof(indx_t); mc->mc_ki[mc->mc_top] = x; mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; } } else { int psize, nsize, k; @@ -8570,12 +8585,15 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_cursor mc; MDB_xcursor mx; - if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; - if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) + if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) return EINVAL; + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + mdb_cursor_init(&mc, txn, dbi, &mx); return mdb_cursor_put(&mc, key, data, flags); } @@ -8880,7 +8898,7 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) my.mc_wlen[1] = 0; my.mc_olen[0] = 0; my.mc_olen[1] = 0; - my.mc_next_pgno = 2; + my.mc_next_pgno = NUM_METAS; my.mc_status = 0; my.mc_new = 1; my.mc_toggle = 0; @@ -8893,7 +8911,7 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) return rc; mp = (MDB_page *)my.mc_wbuf[0]; - memset(mp, 0, 2*env->me_psize); + memset(mp, 0, NUM_METAS * env->me_psize); mp->mp_pgno = 0; mp->mp_flags = P_META; mm = (MDB_meta *)METADATA(mp); @@ -8916,27 +8934,27 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; - freecount += txn->mt_dbs[0].md_branch_pages + - txn->mt_dbs[0].md_leaf_pages + - txn->mt_dbs[0].md_overflow_pages; + freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + + txn->mt_dbs[FREE_DBI].md_leaf_pages + + txn->mt_dbs[FREE_DBI].md_overflow_pages; /* Set metapage 1 */ mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; - mm->mm_dbs[1] = txn->mt_dbs[1]; - if (mm->mm_last_pg > 1) { - mm->mm_dbs[1].md_root = mm->mm_last_pg; + mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + if (mm->mm_last_pg > NUM_METAS-1) { + mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; mm->mm_txnid = 1; } else { - mm->mm_dbs[1].md_root = P_INVALID; + mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; } } - my.mc_wlen[0] = env->me_psize * 2; + my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; pthread_mutex_lock(&my.mc_mutex); while(my.mc_new) pthread_cond_wait(&my.mc_cond, &my.mc_mutex); pthread_mutex_unlock(&my.mc_mutex); - rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0); + rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) rc = mdb_env_cthr_toggle(&my, 1); mdb_env_cthr_toggle(&my, -1); @@ -8986,7 +9004,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) if (env->me_txns) { /* We must start the actual read txn after blocking writers */ - mdb_txn_reset0(txn, "reset-stage1"); + mdb_txn_end(txn, MDB_END_RESET_TMP); /* Temporarily block writers until we snapshot the meta pages */ wmutex = env->me_wmutex; @@ -9000,7 +9018,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) } } - wsize = env->me_psize * 2; + wsize = env->me_psize * NUM_METAS; ptr = env->me_map; w2 = wsize; while (w2 > 0) { @@ -9097,7 +9115,7 @@ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) * already in the OS cache. */ #ifdef _WIN32 - newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + newfd = CreateFileA(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); #else newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); @@ -9143,7 +9161,7 @@ mdb_env_copy(MDB_env *env, const char *path) int ESECT mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) { - if ((flag & CHANGEABLE) != flag) + if (flag & ~CHANGEABLE) return EINVAL; if (onoff) env->me_flags |= flag; @@ -9230,32 +9248,32 @@ mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) int ESECT mdb_env_stat(MDB_env *env, MDB_stat *arg) { - int toggle; + MDB_meta *meta; if (env == NULL || arg == NULL) return EINVAL; - toggle = mdb_env_pick_meta(env); + meta = mdb_env_pick_meta(env); - return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); + return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); } int ESECT mdb_env_info(MDB_env *env, MDB_envinfo *arg) { - int toggle; + MDB_meta *meta; if (env == NULL || arg == NULL) return EINVAL; - toggle = mdb_env_pick_meta(env); - arg->me_mapaddr = env->me_metas[toggle]->mm_address; + meta = mdb_env_pick_meta(env); + arg->me_mapaddr = meta->mm_address; + arg->me_last_pgno = meta->mm_last_pg; + arg->me_last_txnid = meta->mm_txnid; + arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; - - arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; - arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; return MDB_SUCCESS; } @@ -9292,9 +9310,9 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db unsigned int unused = 0, seq; size_t len; - if ((flags & VALID_FLAGS) != flags) + if (flags & ~VALID_FLAGS) return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) + if (txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; /* main DB? */ @@ -9318,7 +9336,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db /* Is the DB already open? */ len = strlen(name); - for (i=2; imt_numdbs; i++) { + for (i=CORE_DBS; imt_numdbs; i++) { if (!txn->mt_dbxs[i].md_name.mv_size) { /* Remember this free slot */ if (!unused) unused = i; @@ -9340,7 +9358,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; /* Find the DB info */ - dbflag = DB_NEW|DB_VALID; + dbflag = DB_NEW|DB_VALID|DB_USRVALID; exact = 0; key.mv_size = len; key.mv_data = (void *)name; @@ -9386,12 +9404,13 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db return rc; } -int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) +int ESECT +mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) { - if (!arg || !TXN_DBI_EXIST(txn, dbi)) + if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) + if (txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; if (txn->mt_dbflags[dbi] & DB_STALE) { @@ -9406,7 +9425,7 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) { char *ptr; - if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) + if (dbi < CORE_DBS || dbi >= env->me_maxdbs) return; ptr = env->me_dbxs[dbi].md_name.mv_data; /* If there was no name, this was already closed */ @@ -9422,7 +9441,7 @@ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) { /* We could return the flags for the FREE_DBI too but what's the point? */ - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; return MDB_SUCCESS; @@ -9445,8 +9464,10 @@ mdb_drop0(MDB_cursor *mc, int subs) MDB_cursor mx; unsigned int i; - /* LEAF2 pages have no nodes, cannot have sub-DBs */ - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) + /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. + * This also avoids any P_LEAF2 pages, which have no nodes. + */ + if (mc->mc_flags & C_SUB) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); @@ -9520,13 +9541,13 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) MDB_cursor *mc, *m2; int rc; - if ((unsigned)del > 1 || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) return EACCES; - if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi)) + if (TXN_DBI_CHANGED(txn, dbi)) return MDB_BAD_DBI; rc = mdb_cursor_open(txn, dbi, &mc); @@ -9541,7 +9562,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) goto leave; /* Can't delete the main DB */ - if (del && dbi > MAIN_DBI) { + if (del && dbi >= CORE_DBS) { rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (!rc) { txn->mt_dbflags[dbi] = DB_STALE; @@ -9568,7 +9589,7 @@ leave: int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; txn->mt_dbxs[dbi].md_cmp = cmp; @@ -9577,7 +9598,7 @@ int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; txn->mt_dbxs[dbi].md_dcmp = cmp; @@ -9586,7 +9607,7 @@ int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) { - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; txn->mt_dbxs[dbi].md_rel = rel; @@ -9595,7 +9616,7 @@ int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) { - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; txn->mt_dbxs[dbi].md_relctx = ctx; @@ -9697,7 +9718,8 @@ mdb_reader_check(MDB_env *env, int *dead) } /** As #mdb_reader_check(). rlocked = . */ -static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead) +static int ESECT +mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex; unsigned int i, j, rdrs; @@ -9756,9 +9778,11 @@ static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead) * @param[in] rc LOCK_MUTEX0() error (nonzero) * @return 0 on success with the mutex locked, or an error code on failure. */ -static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) +static int ESECT +mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) { - int toggle, rlocked, rc2; + int rlocked, rc2; + MDB_meta *meta; if (rc == MDB_OWNERDEAD) { /* We own the mutex. Clean up after dead previous owner. */ @@ -9768,8 +9792,8 @@ static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) /* Keep mti_txnid updated, otherwise next writer can * overwrite data which latest meta page refers to. */ - toggle = mdb_env_pick_meta(env); - env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; + meta = mdb_env_pick_meta(env); + env->me_txns->mti_txnid = meta->mm_txnid; /* env is hosed if the dead thread was ours */ if (env->me_txn) { env->me_flags |= MDB_FATAL_ERROR;