X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblmdb%2Fmdb.c;h=fc0340c51e2e33fe22eb62adc3b5b779e13e9588;hb=71c07f0d60be6113e2ea4327a15b3646c19825d0;hp=fb12cf4459e86254e9661e65ec82d8b4b72df828;hpb=602c9787614324be6b6f4846d70d0998ef876f3b;p=openldap diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index fb12cf4459..fc0340c51e 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -96,7 +96,13 @@ extern int cacheflush(char *addr, int nbytes, int cache); #include #include #include + +#ifdef _MSC_VER +#include +typedef SSIZE_T ssize_t; +#else #include +#endif #if defined(__sun) || defined(ANDROID) /* Most platforms have posix_memalign, older may only have memalign */ @@ -232,7 +238,34 @@ extern int cacheflush(char *addr, int nbytes, int cache); #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ #endif -#ifdef MDB_OWNERDEAD +#ifdef __GLIBC__ +#define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) +#endif +/** Some platforms define the EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDB_USE_ROBUST=0, or use some other + * mechanism like -DMDB_USE_SYSV_SEM instead of + * -DMDB_USE_POSIX_MUTEX. (SysV semaphores are + * also Robust, but some systems don't support them + * either.) + */ +#ifndef MDB_USE_ROBUST +/* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ +# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ + (defined(__GLIBC__) && GLIBC_VER < 0x020004)) +# define MDB_USE_ROBUST 0 +# else +# define MDB_USE_ROBUST 1 +/* glibc < 2.10 only provided _np API */ +# if defined(__GLIBC__) && GLIBC_VER < 0x02000a +# define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP +# define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) +# define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) +# endif +# endif +#endif /* MDB_USE_ROBUST */ + +#if defined(MDB_OWNERDEAD) && MDB_USE_ROBUST #define MDB_ROBUST_SUPPORTED 1 #endif @@ -370,10 +403,13 @@ static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); * * @note If O_DSYNC is undefined but exists in /usr/include, * preferably set some compiler flag to get the definition. - * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC. */ #ifndef MDB_DSYNC +# ifdef O_DSYNC # define MDB_DSYNC O_DSYNC +# else +# define MDB_DSYNC O_SYNC +# endif #endif #endif @@ -943,6 +979,11 @@ typedef struct MDB_db { #define FREE_DBI 0 /** Handle for the default DB. */ #define MAIN_DBI 1 + /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 + + /** Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 2 /** Meta page content. * A meta page is the start point for accessing a database snapshot. @@ -956,11 +997,11 @@ typedef struct MDB_meta { uint32_t mm_version; void *mm_address; /**< address for fixed mapping */ size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */ + MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ /** The size of pages used in this DB */ -#define mm_psize mm_dbs[0].md_pad +#define mm_psize mm_dbs[FREE_DBI].md_pad /** Any persistent environment flags. @ref mdb_env */ -#define mm_flags mm_dbs[0].md_flags +#define mm_flags mm_dbs[FREE_DBI].md_flags pgno_t mm_last_pg; /**< last used page in file */ volatile txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; @@ -1175,7 +1216,7 @@ struct MDB_env { char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ - MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ + MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn0; /**< prealloc'd write transaction */ @@ -1406,7 +1447,7 @@ mdb_strerror(int err) ; } buf[0] = 0; - FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err, 0, ptr, sizeof(buf), (va_list *)pad); return ptr; @@ -1418,7 +1459,7 @@ mdb_strerror(int err) /** assert(3) variant in cursor context */ #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) /** assert(3) variant in transaction context */ -#define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr) +#define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) /** assert(3) variant in environment context */ #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) @@ -1624,9 +1665,10 @@ static void mdb_audit(MDB_txn *txn) mdb_tassert(txn, rc == MDB_NOTFOUND); } } - if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) { + if (freecount + count + NUM_METAS != txn->mt_next_pgno) { fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", - txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno); + txn->mt_txnid, freecount, count+NUM_METAS, + freecount+count+NUM_METAS, txn->mt_next_pgno); } } #endif @@ -1895,7 +1937,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) /* Estimate how much space this op will take */ i = m0->mc_db->md_depth; /* Named DBs also dirty the main DB */ - if (m0->mc_dbi > MAIN_DBI) + if (m0->mc_dbi >= CORE_DBS) i += txn->mt_dbs[MAIN_DBI].md_depth; /* For puts, roughly factor in the key+data size */ if (key) @@ -2664,7 +2706,7 @@ mdb_txn_renew0(MDB_txn *txn) } /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_last_pg+1; @@ -2673,7 +2715,7 @@ mdb_txn_renew0(MDB_txn *txn) /* Setup db info */ txn->mt_numdbs = env->me_numdbs; - for (i=2; imt_numdbs; i++) { + for (i=CORE_DBS; imt_numdbs; i++) { x = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0; @@ -2681,12 +2723,16 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; txn->mt_dbflags[FREE_DBI] = DB_VALID; - if (env->me_maxpg < txn->mt_next_pgno) { - mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); - return MDB_MAP_RESIZED; + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + rc = MDB_PANIC; + } else if (env->me_maxpg < txn->mt_next_pgno) { + rc = MDB_MAP_RESIZED; + } else { + return MDB_SUCCESS; } - - return MDB_SUCCESS; + mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); + return rc; } int @@ -2697,11 +2743,6 @@ mdb_txn_renew(MDB_txn *txn) if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) return EINVAL; - if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - return MDB_PANIC; - } - rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", @@ -2721,10 +2762,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) flags &= MDB_TXN_BEGIN_FLAGS; flags |= env->me_flags & MDB_WRITEMAP; - if (env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - return MDB_PANIC; - } if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ return EACCES; @@ -2839,7 +2876,7 @@ mdb_dbis_update(MDB_txn *txn, int keep) MDB_env *env = txn->mt_env; unsigned char *tdbflags = txn->mt_dbflags; - for (i = n; --i >= 2;) { + for (i = n; --i >= CORE_DBS;) { if (tdbflags[i] & DB_NEW) { if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; @@ -3361,9 +3398,9 @@ mdb_txn_commit(MDB_txn *txn) /* Update parent's DB table. */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); parent->mt_numdbs = txn->mt_numdbs; - parent->mt_dbflags[0] = txn->mt_dbflags[0]; - parent->mt_dbflags[1] = txn->mt_dbflags[1]; - for (i=2; imt_numdbs; i++) { + parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; + parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; + for (i=CORE_DBS; imt_numdbs; i++) { /* preserve parent's DB_NEW status */ x = parent->mt_dbflags[i] & DB_NEW; parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; @@ -3464,14 +3501,14 @@ mdb_txn_commit(MDB_txn *txn) txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); /* Update DB root pointers */ - if (txn->mt_numdbs > 2) { + if (txn->mt_numdbs > CORE_DBS) { MDB_cursor mc; MDB_dbi i; MDB_val data; data.mv_size = sizeof(MDB_db); mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - for (i = 2; i < txn->mt_numdbs; i++) { + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { if (TXN_DBI_CHANGED(txn, i)) { rc = MDB_BAD_DBI; @@ -3532,7 +3569,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) * Read both meta pages so we can use the latest one. */ - for (i=off=0; i<2; i++, off = meta->mm_psize) { + for (i=off=0; imm_psize) { #ifdef _WIN32 DWORD len; OVERLAPPED ov; @@ -3585,11 +3622,11 @@ mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) meta->mm_version = MDB_DATA_VERSION; meta->mm_mapsize = env->me_mapsize; meta->mm_psize = env->me_psize; - meta->mm_last_pg = 1; + meta->mm_last_pg = NUM_METAS-1; meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; - meta->mm_dbs[0].md_root = P_INVALID; - meta->mm_dbs[1].md_root = P_INVALID; + meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; } /** Write the environment parameters of a freshly created DB environment. @@ -3622,7 +3659,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) psize = env->me_psize; - p = calloc(2, psize); + p = calloc(NUM_METAS, psize); if (!p) return ENOMEM; @@ -3635,10 +3672,10 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) q->mp_flags = P_META; *(MDB_meta *)METADATA(q) = *meta; - DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); + DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); if (!rc) rc = ErrCode(); - else if ((unsigned) len == psize * 2) + else if ((unsigned) len == psize * NUM_METAS) rc = MDB_SUCCESS; else rc = ENOSPC; @@ -3681,8 +3718,8 @@ mdb_env_write_meta(MDB_txn *txn) if (flags & MDB_WRITEMAP) { mp->mm_mapsize = mapsize; - mp->mm_dbs[0] = txn->mt_dbs[0]; - mp->mm_dbs[1] = txn->mt_dbs[1]; + mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; mp->mm_last_pg = txn->mt_next_pgno - 1; #if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ !(defined(__i386__) || defined(__x86_64__)) @@ -3693,15 +3730,12 @@ mdb_env_write_meta(MDB_txn *txn) if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { unsigned meta_size = env->me_psize; rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - ptr = env->me_map; - if (toggle) { + ptr = (char *)mp - PAGEHDRSZ; #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ - if (meta_size < env->me_os_psize) - meta_size += meta_size; - else + r2 = (ptr - env->me_map) & (env->me_os_psize - 1); + ptr -= r2; + meta_size += r2; #endif - ptr += meta_size; - } if (MDB_MSYNC(ptr, meta_size, rc)) { rc = ErrCode(); goto fail; @@ -3709,21 +3743,19 @@ mdb_env_write_meta(MDB_txn *txn) } goto done; } - metab.mm_txnid = env->me_metas[toggle]->mm_txnid; - metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + metab.mm_txnid = mp->mm_txnid; + metab.mm_last_pg = mp->mm_last_pg; meta.mm_mapsize = mapsize; - meta.mm_dbs[0] = txn->mt_dbs[0]; - meta.mm_dbs[1] = txn->mt_dbs[1]; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_last_pg = txn->mt_next_pgno - 1; meta.mm_txnid = txn->mt_txnid; off = offsetof(MDB_meta, mm_mapsize); ptr = (char *)&meta + off; len = sizeof(MDB_meta) - off; - if (toggle) - off += env->me_psize; - off += PAGEHDRSZ; + off += (char *)mp - env->me_map; /* Write to the SYNC fd */ mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; @@ -3799,7 +3831,7 @@ mdb_env_create(MDB_env **env) return ENOMEM; e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = 2; + e->me_maxdbs = e->me_numdbs = CORE_DBS; e->me_fd = INVALID_HANDLE_VALUE; e->me_lfd = INVALID_HANDLE_VALUE; e->me_mfd = INVALID_HANDLE_VALUE; @@ -3937,7 +3969,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { if (env->me_map) return EINVAL; - env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ + env->me_maxdbs = dbs + CORE_DBS; return MDB_SUCCESS; } @@ -4408,7 +4440,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) off_t size, rsize; #ifdef _WIN32 - env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, + env->me_lfd = CreateFileA(lpath, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); #else @@ -4512,9 +4544,9 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) mdb_hash_enc(&val, encbuf); sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); - env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); if (!env->me_wmutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) struct stat stbuf; @@ -4586,9 +4618,9 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) goto fail; } #ifdef _WIN32 - env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); + env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); if (!env->me_wmutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); @@ -4691,7 +4723,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode len = OPEN_ALWAYS; } mode = FILE_ATTRIBUTE_NORMAL; - env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + env->me_fd = CreateFileA(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, mode, NULL); #else if (F_ISSET(flags, MDB_RDONLY)) @@ -4721,7 +4753,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode */ #ifdef _WIN32 len = OPEN_EXISTING; - env->me_mfd = CreateFile(dpath, oflags, + env->me_mfd = CreateFileA(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, mode | FILE_FLAG_WRITE_THROUGH, NULL); #else @@ -4779,7 +4811,7 @@ mdb_env_close0(MDB_env *env, int excl) /* Doing this here since me_dbxs may not exist during mdb_env_close */ if (env->me_dbxs) { - for (i = env->me_maxdbs; --i > MAIN_DBI; ) + for (i = env->me_maxdbs; --i >= CORE_DBS; ) free(env->me_dbxs[i].md_name.mv_data); free(env->me_dbxs); } @@ -6175,7 +6207,7 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -6359,16 +6391,18 @@ fix_parent: * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short top = mc->mc_top; + unsigned short dtop = 1; mc->mc_top--; /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { mc->mc_top--; + dtop++; + } if (mc->mc_ki[mc->mc_top]) rc2 = mdb_update_key(mc, key); else rc2 = MDB_SUCCESS; - mc->mc_top = top; + mc->mc_top += dtop; if (rc2) return rc2; } @@ -7299,8 +7333,7 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) if (txn->mt_flags & MDB_TXN_BLOCKED) return MDB_BAD_TXN; - /* Allow read access to the freelist */ - if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) return EINVAL; if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) @@ -7577,32 +7610,48 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mp; - - mp = cdst->mc_pg[csrc->mc_top]; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == cdst) continue; - if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] >= - cdst->mc_ki[csrc->mc_top]) { - m3->mc_ki[csrc->mc_top]++; + MDB_page *mpd, *mps; + + mps = csrc->mc_pg[csrc->mc_top]; + /* If we're adding on the left, bump others up */ + if (!cdst->mc_ki[csrc->mc_top]) { + mpd = cdst->mc_pg[csrc->mc_top]; + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 != cdst && + m3->mc_pg[csrc->mc_top] == mpd && + m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { + m3->mc_ki[csrc->mc_top]++; + } + if (m3 !=csrc && + m3->mc_pg[csrc->mc_top] == mps && + m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]++; + } } - } - - mp = csrc->mc_pg[csrc->mc_top]; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == - csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + } else + /* Adding on the right, bump others down */ + { + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_pg[csrc->mc_top] == mps) { + if (!m3->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]--; + } else { + m3->mc_ki[csrc->mc_top]--; + } + } } } } @@ -7698,6 +7747,9 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) if ((rc = mdb_page_touch(cdst))) return rc; + /* get dst page again now that we've touched it. */ + pdst = cdst->mc_pg[cdst->mc_top]; + /* Move all nodes from src to dst. */ j = nkeys = NUMKEYS(pdst); @@ -7775,6 +7827,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = csrc->mc_dbi; + unsigned int top = csrc->mc_top; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) @@ -7783,9 +7836,10 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3 = m2; if (m3 == csrc) continue; if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[csrc->mc_top] == psrc) { - m3->mc_pg[csrc->mc_top] = pdst; - m3->mc_ki[csrc->mc_top] += nkeys; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + m3->mc_ki[top] += nkeys; + m3->mc_ki[top-1] = cdst->mc_ki[top-1]; } } } @@ -7921,9 +7975,9 @@ mdb_rebalance(MDB_cursor *mc) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3 == mc) continue; if (m3->mc_pg[0] == mp) { - for (i=0; imc_snum; i++) { + for (i=0; imc_db->md_depth; i++) { m3->mc_pg[i] = m3->mc_pg[i+1]; m3->mc_ki[i] = m3->mc_ki[i+1]; } @@ -7988,7 +8042,8 @@ mdb_rebalance(MDB_cursor *mc) */ if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { rc = mdb_node_move(&mn, mc); - if (mc->mc_ki[mc->mc_top-1]) { + if (!mc->mc_ki[mc->mc_top]) { + /* if we inserted on left, bump position up */ oldki++; } } else { @@ -8196,12 +8251,19 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_pad = mp->mp_pad; DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); - if (mc->mc_snum < 2) { + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdb_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. + */ + if (mc->mc_top < 1) { if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) goto done; /* shift current top to make room for new parent */ - mc->mc_pg[1] = mc->mc_pg[0]; - mc->mc_ki[1] = mc->mc_ki[0]; + for (i=mc->mc_snum; i>0; i--) { + mc->mc_pg[i] = mc->mc_pg[i-1]; + mc->mc_ki[i] = mc->mc_ki[i-1]; + } mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; @@ -8217,8 +8279,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_db->md_depth--; goto done; } - mc->mc_snum = 2; - mc->mc_top = 1; + mc->mc_snum++; + mc->mc_top++; ptop = 0; } else { ptop = mc->mc_top-1; @@ -8277,6 +8339,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_upper -= ksize - sizeof(indx_t); mc->mc_ki[mc->mc_top] = x; mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; } } else { int psize, nsize, k; @@ -8569,6 +8632,9 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) return EINVAL; + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + mdb_cursor_init(&mc, txn, dbi, &mx); return mdb_cursor_put(&mc, key, data, flags); } @@ -8873,7 +8939,7 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) my.mc_wlen[1] = 0; my.mc_olen[0] = 0; my.mc_olen[1] = 0; - my.mc_next_pgno = 2; + my.mc_next_pgno = NUM_METAS; my.mc_status = 0; my.mc_new = 1; my.mc_toggle = 0; @@ -8886,7 +8952,7 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) return rc; mp = (MDB_page *)my.mc_wbuf[0]; - memset(mp, 0, 2*env->me_psize); + memset(mp, 0, NUM_METAS * env->me_psize); mp->mp_pgno = 0; mp->mp_flags = P_META; mm = (MDB_meta *)METADATA(mp); @@ -8909,27 +8975,27 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; - freecount += txn->mt_dbs[0].md_branch_pages + - txn->mt_dbs[0].md_leaf_pages + - txn->mt_dbs[0].md_overflow_pages; + freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + + txn->mt_dbs[FREE_DBI].md_leaf_pages + + txn->mt_dbs[FREE_DBI].md_overflow_pages; /* Set metapage 1 */ mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; - mm->mm_dbs[1] = txn->mt_dbs[1]; - if (mm->mm_last_pg > 1) { - mm->mm_dbs[1].md_root = mm->mm_last_pg; + mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + if (mm->mm_last_pg > NUM_METAS-1) { + mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; mm->mm_txnid = 1; } else { - mm->mm_dbs[1].md_root = P_INVALID; + mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; } } - my.mc_wlen[0] = env->me_psize * 2; + my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; pthread_mutex_lock(&my.mc_mutex); while(my.mc_new) pthread_cond_wait(&my.mc_cond, &my.mc_mutex); pthread_mutex_unlock(&my.mc_mutex); - rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0); + rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) rc = mdb_env_cthr_toggle(&my, 1); mdb_env_cthr_toggle(&my, -1); @@ -8993,7 +9059,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) } } - wsize = env->me_psize * 2; + wsize = env->me_psize * NUM_METAS; ptr = env->me_map; w2 = wsize; while (w2 > 0) { @@ -9090,7 +9156,7 @@ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) * already in the OS cache. */ #ifdef _WIN32 - newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + newfd = CreateFileA(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); #else newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); @@ -9311,7 +9377,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db /* Is the DB already open? */ len = strlen(name); - for (i=2; imt_numdbs; i++) { + for (i=CORE_DBS; imt_numdbs; i++) { if (!txn->mt_dbxs[i].md_name.mv_size) { /* Remember this free slot */ if (!unused) unused = i; @@ -9400,7 +9466,7 @@ mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) { char *ptr; - if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) + if (dbi < CORE_DBS || dbi >= env->me_maxdbs) return; ptr = env->me_dbxs[dbi].md_name.mv_data; /* If there was no name, this was already closed */ @@ -9537,7 +9603,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) goto leave; /* Can't delete the main DB */ - if (del && dbi > MAIN_DBI) { + if (del && dbi >= CORE_DBS) { rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (!rc) { txn->mt_dbflags[dbi] = DB_STALE;