]> git.sur5r.net Git - openldap/blobdiff - libraries/liblmdb/mdb.c
ITS#7992 cleanup
[openldap] / libraries / liblmdb / mdb.c
index 9af3704f78f9e0374a112ad49775c5ead54b8651..fa0c9e5b9ccd1f2ba6df9daba40bcf3258eee3aa 100644 (file)
@@ -256,8 +256,8 @@ typedef SSIZE_T     ssize_t;
 #  define MDB_USE_ROBUST       0
 # else
 #  define MDB_USE_ROBUST       1
-/* glibc < 2.10 only provided _np API */
-#  if defined(__GLIBC__) && GLIBC_VER < 0x02000a
+/* glibc < 2.12 only provided _np API */
+#  if defined(__GLIBC__) && GLIBC_VER < 0x02000c
 #   define PTHREAD_MUTEX_ROBUST        PTHREAD_MUTEX_ROBUST_NP
 #   define pthread_mutexattr_setrobust(attr, flag)     pthread_mutexattr_setrobust_np(attr, flag)
 #   define pthread_mutex_consistent(mutex)     pthread_mutex_consistent_np(mutex)
@@ -403,10 +403,13 @@ static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc);
  *
  *     @note If O_DSYNC is undefined but exists in /usr/include,
  * preferably set some compiler flag to get the definition.
- * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.
  */
 #ifndef MDB_DSYNC
+# ifdef O_DSYNC
 # define MDB_DSYNC     O_DSYNC
+# else
+# define MDB_DSYNC     O_SYNC
+# endif
 #endif
 #endif
 
@@ -1158,7 +1161,6 @@ struct MDB_cursor {
 #define C_EOF  0x02                    /**< No more data */
 #define C_SUB  0x04                    /**< Cursor is a sub-cursor */
 #define C_DEL  0x08                    /**< last op was a cursor_del */
-#define C_SPLITTING    0x20            /**< Cursor is in page_split */
 #define C_UNTRACK      0x40            /**< Un-track cursor when closing */
 /** @} */
        unsigned int    mc_flags;       /**< @ref mdb_cursor */
@@ -1324,7 +1326,7 @@ static int  mdb_node_add(MDB_cursor *mc, indx_t indx,
                            MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
 static void mdb_node_del(MDB_cursor *mc, int ksize);
 static void mdb_node_shrink(MDB_page *mp, indx_t indx);
-static int     mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
+static int     mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft);
 static int  mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
 static size_t  mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
 static size_t  mdb_branch_size(MDB_env *env, MDB_val *key);
@@ -1369,6 +1371,8 @@ static MDB_cmp_func       mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_
 static SECURITY_DESCRIPTOR mdb_null_sd;
 static SECURITY_ATTRIBUTES mdb_all_sa;
 static int mdb_sec_inited;
+
+static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize);
 #endif
 
 /** Return the library version info. */
@@ -1600,7 +1604,7 @@ mdb_cursor_chk(MDB_cursor *mc)
        MDB_node *node;
        MDB_page *mp;
 
-       if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return;
+       if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return;
        for (i=0; i<mc->mc_top; i++) {
                mp = mc->mc_pg[i];
                node = NODEPTR(mp, mc->mc_ki[i]);
@@ -1609,6 +1613,13 @@ mdb_cursor_chk(MDB_cursor *mc)
        }
        if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
                printf("ack!\n");
+       if (mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
+               node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+               if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) &&
+                       mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) {
+                       printf("blah!\n");
+               }
+       }
 }
 #endif
 
@@ -2414,14 +2425,15 @@ done:
        } else {
                for (; m2; m2=m2->mc_next) {
                        if (m2->mc_snum < mc->mc_snum) continue;
+                       if (m2 == mc) continue;
                        if (m2->mc_pg[mc->mc_top] == mp) {
                                m2->mc_pg[mc->mc_top] = np;
                                if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
                                        IS_LEAF(np) &&
-                                       m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
+                                       (m2->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
                                {
-                                       MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]);
-                                       if (!(leaf->mn_flags & F_SUBDATA))
+                                       MDB_node *leaf = NODEPTR(np, m2->mc_ki[mc->mc_top]);
+                                       if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA)
                                                m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
                                }
                        }
@@ -2485,14 +2497,15 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
                                *bk = *mc;
                                mc->mc_backup = bk;
                                mc->mc_db = &dst->mt_dbs[i];
-                               /* Kill pointers into src - and dst to reduce abuse: The
-                                * user may not use mc until dst ends. Otherwise we'd...
+                               /* Kill pointers into src to reduce abuse: The
+                                * user may not use mc until dst ends. But we need a valid
+                                * txn pointer here for cursor fixups to keep working.
                                 */
-                               mc->mc_txn    = NULL;   /* ...set this to dst */
-                               mc->mc_dbflag = NULL;   /* ...and &dst->mt_dbflags[i] */
+                               mc->mc_txn    = dst;
+                               mc->mc_dbflag = &dst->mt_dbflags[i];
                                if ((mx = mc->mc_xcursor) != NULL) {
                                        *(MDB_xcursor *)(bk+1) = *mx;
-                                       mx->mx_cursor.mc_txn = NULL; /* ...and dst. */
+                                       mx->mx_cursor.mc_txn = dst;
                                }
                                mc->mc_next = dst->mt_cursors[i];
                                dst->mt_cursors[i] = mc;
@@ -3426,6 +3439,25 @@ mdb_txn_commit(MDB_txn *txn)
                        pspill[0] = y;
                }
 
+               /* Remove anything in our spill list from parent's dirty list */
+               if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) {
+                       for (i=1; i<=txn->mt_spill_pgs[0]; i++) {
+                               MDB_ID pn = txn->mt_spill_pgs[i];
+                               if (pn & 1)
+                                       continue;       /* deleted spillpg */
+                               pn >>= 1;
+                               y = mdb_mid2l_search(dst, pn);
+                               if (y <= dst[0].mid && dst[y].mid == pn) {
+                                       free(dst[y].mptr);
+                                       while (y < dst[0].mid) {
+                                               dst[y] = dst[y+1];
+                                               y++;
+                                       }
+                                       dst[0].mid--;
+                               }
+                       }
+               }
+
                /* Find len = length of merging our dirty list with parent's */
                x = dst[0].mid;
                dst[0].mid = 0;         /* simplify loops */
@@ -3471,7 +3503,7 @@ mdb_txn_commit(MDB_txn *txn)
                }
 
                /* Append our loose page list to parent's */
-               for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp))
+               for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
                        ;
                *lp = txn->mt_loose_pgs;
                parent->mt_loose_count += txn->mt_loose_count;
@@ -4437,9 +4469,14 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
        off_t size, rsize;
 
 #ifdef _WIN32
-       env->me_lfd = CreateFileA(lpath, GENERIC_READ|GENERIC_WRITE,
+       wchar_t *wlpath;
+       rc = utf8_to_utf16(lpath, -1, &wlpath, NULL);
+       if (rc)
+               return rc;
+       env->me_lfd = CreateFileW(wlpath, GENERIC_READ|GENERIC_WRITE,
                FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
                FILE_ATTRIBUTE_NORMAL, NULL);
+       free(wlpath);
 #else
        env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode);
 #endif
@@ -4657,6 +4694,9 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
 {
        int             oflags, rc, len, excl = -1;
        char *lpath, *dpath;
+#ifdef _WIN32
+       wchar_t *wpath;
+#endif
 
        if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
                return EINVAL;
@@ -4720,8 +4760,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
                len = OPEN_ALWAYS;
        }
        mode = FILE_ATTRIBUTE_NORMAL;
-       env->me_fd = CreateFileA(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
+       rc = utf8_to_utf16(dpath, -1, &wpath, NULL);
+       if (rc)
+               goto leave;
+       env->me_fd = CreateFileW(wpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
                NULL, len, mode, NULL);
+       free(wpath);
 #else
        if (F_ISSET(flags, MDB_RDONLY))
                oflags = O_RDONLY;
@@ -4750,9 +4794,13 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
                         */
 #ifdef _WIN32
                        len = OPEN_EXISTING;
-                       env->me_mfd = CreateFileA(dpath, oflags,
+                       rc = utf8_to_utf16(dpath, -1, &wpath, NULL);
+                       if (rc)
+                               goto leave;
+                       env->me_mfd = CreateFileW(wpath, oflags,
                                FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len,
                                mode | FILE_FLAG_WRITE_THROUGH, NULL);
+                       free(wpath);
 #else
                        oflags &= ~O_CREAT;
                        env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode);
@@ -5129,8 +5177,11 @@ mdb_cursor_pop(MDB_cursor *mc)
                        mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc));
 
                mc->mc_snum--;
-               if (mc->mc_snum)
+               if (mc->mc_snum) {
                        mc->mc_top--;
+               } else {
+                       mc->mc_flags &= ~C_INITIALIZED;
+               }
        }
 }
 
@@ -5446,6 +5497,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
                                return MDB_CORRUPTED;
                        }
                }
+               txn->mt_dirty_room++;
                if (!(env->me_flags & MDB_WRITEMAP))
                        mdb_dpage_free(env, mp);
 release:
@@ -5840,6 +5892,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
                        } else
                                return MDB_NOTFOUND;
                }
+       } else {
+               mc->mc_pg[0] = 0;
        }
 
        rc = mdb_page_search(mc, key, 0);
@@ -6057,8 +6111,6 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
                                MDB_GET_KEY(leaf, key);
                                if (data) {
                                        if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
-                                               if (mc->mc_flags & C_DEL)
-                                                       mdb_xcursor_init1(mc, leaf);
                                                rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT);
                                        } else {
                                                rc = mdb_node_read(mc->mc_txn, leaf, data);
@@ -6388,16 +6440,18 @@ fix_parent:
                         * update branch key if there is a parent page
                         */
                        if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
-                               unsigned short top = mc->mc_top;
+                               unsigned short dtop = 1;
                                mc->mc_top--;
                                /* slot 0 is always an empty key, find real slot */
-                               while (mc->mc_top && !mc->mc_ki[mc->mc_top])
+                               while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
                                        mc->mc_top--;
+                                       dtop++;
+                               }
                                if (mc->mc_ki[mc->mc_top])
                                        rc2 = mdb_update_key(mc, key);
                                else
                                        rc2 = MDB_SUCCESS;
-                               mc->mc_top top;
+                               mc->mc_top += dtop;
                                if (rc2)
                                        return rc2;
                        }
@@ -6578,6 +6632,7 @@ current:
                                                return ENOMEM;
                                        id2.mid = pg;
                                        id2.mptr = np;
+                                       /* Note - this page is already counted in parent's dirty_room */
                                        rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
                                        mdb_cassert(mc, rc2 == 0);
                                        if (!(flags & MDB_RESERVE)) {
@@ -6634,7 +6689,7 @@ new_sub:
        } else {
                /* There is room already in this leaf page. */
                rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
-               if (rc == 0 && insert_key) {
+               if (rc == 0) {
                        /* Adjust other cursors pointing to mp */
                        MDB_cursor *m2, *m3;
                        MDB_dbi dbi = mc->mc_dbi;
@@ -6646,10 +6701,15 @@ new_sub:
                                        m3 = &m2->mc_xcursor->mx_cursor;
                                else
                                        m3 = m2;
-                               if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
-                               if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
+                               if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue;
+                               if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
                                        m3->mc_ki[i]++;
                                }
+                               if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
+                                       MDB_node *n2 = NODEPTR(mp, m3->mc_ki[i]);
+                                       if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA)
+                                               m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2);
+                               }
                        }
                }
        }
@@ -6691,6 +6751,7 @@ put_sub:
                                MDB_xcursor *mx = mc->mc_xcursor;
                                unsigned i = mc->mc_top;
                                MDB_page *mp = mc->mc_pg[i];
+                               int nkeys = NUMKEYS(mp);
 
                                for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
                                        if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
@@ -6698,9 +6759,9 @@ put_sub:
                                        if (m2->mc_pg[i] == mp) {
                                                if (m2->mc_ki[i] == mc->mc_ki[i]) {
                                                        mdb_xcursor_init2(m2, mx, new_dupdata);
-                                               } else if (!insert_key) {
+                                               } else if (!insert_key && m2->mc_ki[i] < nkeys) {
                                                        MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]);
-                                                       if (!(n2->mn_flags & F_SUBDATA))
+                                                       if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA)
                                                                m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2);
                                                }
                                        }
@@ -6781,6 +6842,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
                if (flags & MDB_NODUPDATA) {
                        /* mdb_cursor_del0() will subtract the final entry */
                        mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
+                       mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
                } else {
                        if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
                                mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
@@ -6816,8 +6878,9 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
                                        }
                                }
                                mc->mc_db->md_entries--;
-                               mc->mc_flags |= C_DEL;
                                return rc;
+                       } else {
+                               mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
                        }
                        /* otherwise fall thru and delete the sub-DB */
                }
@@ -7504,10 +7567,26 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key)
 static void
 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
 
+/** Perform \b act while tracking temporary cursor \b mn */
+#define WITH_CURSOR_TRACKING(mn, act) do { \
+       MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \
+       if ((mn).mc_flags & C_SUB) { \
+               dummy.mc_flags =  C_INITIALIZED; \
+               dummy.mc_xcursor = (MDB_xcursor *)&(mn);        \
+               tracked = &dummy; \
+       } else { \
+               tracked = &(mn); \
+       } \
+       tracked->mc_next = *tp; \
+       *tp = tracked; \
+       { act; } \
+       *tp = tracked->mc_next; \
+} while (0)
+
 /** Move a node from csrc to cdst.
  */
 static int
-mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
+mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft)
 {
        MDB_node                *srcnode;
        MDB_val          key, data;
@@ -7559,6 +7638,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
                data.mv_size = NODEDSZ(srcnode);
                data.mv_data = NODEDATA(srcnode);
        }
+       mn.mc_xcursor = NULL;
        if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
                unsigned int snum = cdst->mc_snum;
                MDB_node *s2;
@@ -7609,13 +7689,15 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
 
                mps = csrc->mc_pg[csrc->mc_top];
                /* If we're adding on the left, bump others up */
-               if (!cdst->mc_ki[csrc->mc_top]) {
+               if (fromleft) {
                        mpd = cdst->mc_pg[csrc->mc_top];
                        for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
                                if (csrc->mc_flags & C_SUB)
                                        m3 = &m2->mc_xcursor->mx_cursor;
                                else
                                        m3 = m2;
+                               if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
+                                       continue;
                                if (m3 != cdst &&
                                        m3->mc_pg[csrc->mc_top] == mpd &&
                                        m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
@@ -7628,6 +7710,12 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
                                        m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
                                        m3->mc_ki[csrc->mc_top-1]++;
                                }
+                               if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) &&
+                                       IS_LEAF(mps)) {
+                                       MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
+                                       if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA)
+                                               m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
+                               }
                        }
                } else
                /* Adding on the right, bump others down */
@@ -7638,6 +7726,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
                                else
                                        m3 = m2;
                                if (m3 == csrc) continue;
+                               if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
+                                       continue;
                                if (m3->mc_pg[csrc->mc_top] == mps) {
                                        if (!m3->mc_ki[csrc->mc_top]) {
                                                m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
@@ -7646,6 +7736,12 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
                                        } else {
                                                m3->mc_ki[csrc->mc_top]--;
                                        }
+                                       if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) &&
+                                               IS_LEAF(mps)) {
+                                               MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
+                                               if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA)
+                                                       m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
+                                       }
                                }
                        }
                }
@@ -7667,7 +7763,10 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
                        mdb_cursor_copy(csrc, &mn);
                        mn.mc_snum--;
                        mn.mc_top--;
-                       if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
+                       /* We want mdb_rebalance to find mn when doing fixups */
+                       WITH_CURSOR_TRACKING(mn,
+                               rc = mdb_update_key(&mn, &key));
+                       if (rc)
                                return rc;
                }
                if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
@@ -7695,7 +7794,10 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
                        mdb_cursor_copy(cdst, &mn);
                        mn.mc_snum--;
                        mn.mc_top--;
-                       if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
+                       /* We want mdb_rebalance to find mn when doing fixups */
+                       WITH_CURSOR_TRACKING(mn,
+                               rc = mdb_update_key(&mn, &key));
+                       if (rc)
                                return rc;
                }
                if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
@@ -7764,6 +7866,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
                                MDB_cursor mn;
                                MDB_node *s2;
                                mdb_cursor_copy(csrc, &mn);
+                               mn.mc_xcursor = NULL;
                                /* must find the lowest key below src */
                                rc = mdb_page_search_lowest(&mn);
                                if (rc)
@@ -7835,6 +7938,15 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
                                m3->mc_pg[top] = pdst;
                                m3->mc_ki[top] += nkeys;
                                m3->mc_ki[top-1] = cdst->mc_ki[top-1];
+                       } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] &&
+                               m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
+                               m3->mc_ki[top-1]--;
+                       }
+                       if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) &&
+                               IS_LEAF(psrc)) {
+                               MDB_node *node = NODEPTR(m3->mc_pg[top], m3->mc_ki[top]);
+                               if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA)
+                                       m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
                        }
                }
        }
@@ -7884,7 +7996,7 @@ static int
 mdb_rebalance(MDB_cursor *mc)
 {
        MDB_node        *node;
-       int rc;
+       int rc, fromleft;
        unsigned int ptop, minkeys, thresh;
        MDB_cursor      mn;
        indx_t oldki;
@@ -7935,7 +8047,8 @@ mdb_rebalance(MDB_cursor *mc)
                                                m3 = &m2->mc_xcursor->mx_cursor;
                                        else
                                                m3 = m2;
-                                       if (m3->mc_snum < mc->mc_snum) continue;
+                                       if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum))
+                                               continue;
                                        if (m3->mc_pg[0] == mp) {
                                                m3->mc_snum = 0;
                                                m3->mc_top = 0;
@@ -7970,9 +8083,11 @@ mdb_rebalance(MDB_cursor *mc)
                                                m3 = &m2->mc_xcursor->mx_cursor;
                                        else
                                                m3 = m2;
-                                       if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
+                                       if (m3 == mc) continue;
+                                       if (!(m3->mc_flags & C_INITIALIZED))
+                                               continue;
                                        if (m3->mc_pg[0] == mp) {
-                                               for (i=0; i<m3->mc_snum; i++) {
+                                               for (i=0; i<mc->mc_db->md_depth; i++) {
                                                        m3->mc_pg[i] = m3->mc_pg[i+1];
                                                        m3->mc_ki[i] = m3->mc_ki[i+1];
                                                }
@@ -8014,6 +8129,7 @@ mdb_rebalance(MDB_cursor *mc)
                        return rc;
                mn.mc_ki[mn.mc_top] = 0;
                mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
+               fromleft = 0;
        } else {
                /* There is at least one neighbor to the left.
                 */
@@ -8025,6 +8141,7 @@ mdb_rebalance(MDB_cursor *mc)
                        return rc;
                mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
                mc->mc_ki[mc->mc_top] = 0;
+               fromleft = 1;
        }
 
        DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)",
@@ -8036,32 +8153,20 @@ mdb_rebalance(MDB_cursor *mc)
         * (A branch page must never have less than 2 keys.)
         */
        if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
-               rc = mdb_node_move(&mn, mc);
-               if (!mc->mc_ki[mc->mc_top]) {
+               rc = mdb_node_move(&mn, mc, fromleft);
+               if (fromleft) {
                        /* if we inserted on left, bump position up */
                        oldki++;
                }
        } else {
-               if (mc->mc_ki[ptop] == 0) {
+               if (!fromleft) {
                        rc = mdb_page_merge(&mn, mc);
                } else {
-                       MDB_cursor dummy;
                        oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
                        mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
                        /* We want mdb_rebalance to find mn when doing fixups */
-                       if (mc->mc_flags & C_SUB) {
-                               dummy.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi];
-                               mc->mc_txn->mt_cursors[mc->mc_dbi] = &dummy;
-                               dummy.mc_xcursor = (MDB_xcursor *)&mn;
-                       } else {
-                               mn.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi];
-                               mc->mc_txn->mt_cursors[mc->mc_dbi] = &mn;
-                       }
-                       rc = mdb_page_merge(mc, &mn);
-                       if (mc->mc_flags & C_SUB)
-                               mc->mc_txn->mt_cursors[mc->mc_dbi] = dummy.mc_next;
-                       else
-                               mc->mc_txn->mt_cursors[mc->mc_dbi] = mn.mc_next;
+                       WITH_CURSOR_TRACKING(mn,
+                               rc = mdb_page_merge(mc, &mn));
                        mdb_cursor_copy(&mn, mc);
                }
                mc->mc_flags &= ~C_EOF;
@@ -8094,12 +8199,17 @@ mdb_cursor_del0(MDB_cursor *mc)
                        if (m3 == mc || m3->mc_snum < mc->mc_snum)
                                continue;
                        if (m3->mc_pg[mc->mc_top] == mp) {
-                               if (m3->mc_ki[mc->mc_top] >= ki) {
+                               if (m3->mc_ki[mc->mc_top] == ki) {
                                        m3->mc_flags |= C_DEL;
-                                       if (m3->mc_ki[mc->mc_top] > ki)
-                                               m3->mc_ki[mc->mc_top]--;
-                                       else if (mc->mc_db->md_flags & MDB_DUPSORT)
-                                               m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF;
+                                       if (mc->mc_db->md_flags & MDB_DUPSORT)
+                                               m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
+                               } else if (m3->mc_ki[mc->mc_top] > ki) {
+                                       m3->mc_ki[mc->mc_top]--;
+                               }
+                               if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
+                                       MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
+                                       if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA)
+                                               m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
                                }
                        }
                }
@@ -8246,12 +8356,19 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
        rp->mp_pad = mp->mp_pad;
        DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno));
 
-       if (mc->mc_snum < 2) {
+       /* Usually when splitting the root page, the cursor
+        * height is 1. But when called from mdb_update_key,
+        * the cursor height may be greater because it walks
+        * up the stack while finding the branch slot to update.
+        */
+       if (mc->mc_top < 1) {
                if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
                        goto done;
                /* shift current top to make room for new parent */
-               mc->mc_pg[1] = mc->mc_pg[0];
-               mc->mc_ki[1] = mc->mc_ki[0];
+               for (i=mc->mc_snum; i>0; i--) {
+                       mc->mc_pg[i] = mc->mc_pg[i-1];
+                       mc->mc_ki[i] = mc->mc_ki[i-1];
+               }
                mc->mc_pg[0] = pp;
                mc->mc_ki[0] = 0;
                mc->mc_db->md_root = pp->mp_pgno;
@@ -8267,16 +8384,16 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                        mc->mc_db->md_depth--;
                        goto done;
                }
-               mc->mc_snum = 2;
-               mc->mc_top = 1;
+               mc->mc_snum++;
+               mc->mc_top++;
                ptop = 0;
        } else {
                ptop = mc->mc_top-1;
                DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno));
        }
 
-       mc->mc_flags |= C_SPLITTING;
        mdb_cursor_copy(mc, &mn);
+       mn.mc_xcursor = NULL;
        mn.mc_pg[mn.mc_top] = rp;
        mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
 
@@ -8326,8 +8443,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                                rp->mp_lower += sizeof(indx_t);
                                rp->mp_upper -= ksize - sizeof(indx_t);
                                mc->mc_ki[mc->mc_top] = x;
-                               mc->mc_pg[mc->mc_top] = rp;
-                               mc->mc_ki[ptop]++;
                        }
                } else {
                        int psize, nsize, k;
@@ -8420,21 +8535,18 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
        /* Copy separator key to the parent.
         */
        if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
+               int snum = mc->mc_snum;
                mn.mc_snum--;
                mn.mc_top--;
                did_split = 1;
-               rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);
+               /* We want other splits to find mn when doing fixups */
+               WITH_CURSOR_TRACKING(mn,
+                       rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0));
                if (rc)
                        goto done;
 
                /* root split? */
-               if (mn.mc_snum == mc->mc_snum) {
-                       mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top];
-                       mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top];
-                       mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop];
-                       mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop];
-                       mc->mc_snum++;
-                       mc->mc_top++;
+               if (mc->mc_snum > snum) {
                        ptop++;
                }
                /* Right page might now have changed parent.
@@ -8460,7 +8572,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
                mn.mc_top++;
        }
-       mc->mc_flags ^= C_SPLITTING;
        if (rc != MDB_SUCCESS) {
                goto done;
        }
@@ -8530,11 +8641,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                /* reset back to original page */
                if (newindx < split_indx) {
                        mc->mc_pg[mc->mc_top] = mp;
-                       if (nflags & MDB_RESERVE) {
-                               node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
-                               if (!(node->mn_flags & F_BIGDATA))
-                                       newdata->mv_data = NODEDATA(node);
-                       }
                } else {
                        mc->mc_pg[mc->mc_top] = rp;
                        mc->mc_ki[ptop]++;
@@ -8548,13 +8654,32 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                                }
                        }
                }
+               if (nflags & MDB_RESERVE) {
+                       node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+                       if (!(node->mn_flags & F_BIGDATA))
+                               newdata->mv_data = NODEDATA(node);
+               }
+       } else {
+               if (newindx >= split_indx) {
+                       mc->mc_pg[mc->mc_top] = rp;
+                       mc->mc_ki[ptop]++;
+                       /* Make sure mc_ki is still valid.
+                        */
+                       if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
+                               mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
+                               for (i=0; i<=ptop; i++) {
+                                       mc->mc_pg[i] = mn.mc_pg[i];
+                                       mc->mc_ki[i] = mn.mc_ki[i];
+                               }
+                       }
+               }
        }
 
        {
                /* Adjust other cursors pointing to mp */
                MDB_cursor *m2, *m3;
                MDB_dbi dbi = mc->mc_dbi;
-               int fixup = NUMKEYS(mp);
+               nkeys = NUMKEYS(mp);
 
                for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
                        if (mc->mc_flags & C_SUB)
@@ -8565,16 +8690,17 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                                continue;
                        if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
                                continue;
-                       if (m3->mc_flags & C_SPLITTING)
-                               continue;
                        if (new_root) {
                                int k;
+                               /* sub cursors may be on different DB */
+                               if (m3->mc_pg[0] != mp)
+                                       continue;
                                /* root split */
                                for (k=new_root; k>=0; k--) {
                                        m3->mc_ki[k+1] = m3->mc_ki[k];
                                        m3->mc_pg[k+1] = m3->mc_pg[k];
                                }
-                               if (m3->mc_ki[0] >= split_indx) {
+                               if (m3->mc_ki[0] >= nkeys) {
                                        m3->mc_ki[0] = 1;
                                } else {
                                        m3->mc_ki[0] = 0;
@@ -8586,15 +8712,24 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                        if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
                                if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
                                        m3->mc_ki[mc->mc_top]++;
-                               if (m3->mc_ki[mc->mc_top] >= fixup) {
+                               if (m3->mc_ki[mc->mc_top] >= nkeys) {
                                        m3->mc_pg[mc->mc_top] = rp;
-                                       m3->mc_ki[mc->mc_top] -= fixup;
-                                       m3->mc_ki[ptop] = mn.mc_ki[ptop];
+                                       m3->mc_ki[mc->mc_top] -= nkeys;
+                                       for (i=0; i<mc->mc_top; i++) {
+                                               m3->mc_ki[i] = mn.mc_ki[i];
+                                               m3->mc_pg[i] = mn.mc_pg[i];
+                                       }
                                }
                        } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
                                m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
                                m3->mc_ki[ptop]++;
                        }
+                       if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) &&
+                               IS_LEAF(mp)) {
+                               MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
+                               if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA)
+                                       m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
+                       }
                }
        }
        DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
@@ -8613,6 +8748,7 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
 {
        MDB_cursor mc;
        MDB_xcursor mx;
+       int rc;
 
        if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
                return EINVAL;
@@ -8624,7 +8760,11 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
                return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
 
        mdb_cursor_init(&mc, txn, dbi, &mx);
-       return mdb_cursor_put(&mc, key, data, flags);
+       mc.mc_next = txn->mt_cursors[dbi];
+       txn->mt_cursors[dbi] = &mc;
+       rc = mdb_cursor_put(&mc, key, data, flags);
+       txn->mt_cursors[dbi] = mc.mc_next;
+       return rc;
 }
 
 #ifndef MDB_WBUF
@@ -9127,6 +9267,9 @@ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
        int rc, len;
        char *lpath;
        HANDLE newfd = INVALID_HANDLE_VALUE;
+#ifdef _WIN32
+       wchar_t *wpath;
+#endif
 
        if (env->me_flags & MDB_NOSUBDIR) {
                lpath = (char *)path;
@@ -9144,8 +9287,12 @@ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
         * already in the OS cache.
         */
 #ifdef _WIN32
-       newfd = CreateFileA(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
+       rc = utf8_to_utf16(lpath, -1, &wpath, NULL);
+       if (rc)
+               return rc;
+       newfd = CreateFileW(wpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
                                FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
+       free(wpath);
 #else
        newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
 #endif
@@ -9337,6 +9484,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
        MDB_db dummy;
        int rc, dbflag, exact;
        unsigned int unused = 0, seq;
+       char *namedup;
        size_t len;
 
        if (flags & ~VALID_FLAGS)
@@ -9398,8 +9546,16 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
                MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
                if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
                        return MDB_INCOMPATIBLE;
-       } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
-               /* Create if requested */
+       } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) {
+               return rc;
+       }
+
+       /* Done here so we cannot fail after creating a new DB */
+       if ((namedup = strdup(name)) == NULL)
+               return ENOMEM;
+
+       if (rc) {
+               /* MDB_NOTFOUND and MDB_CREATE: Create new DB */
                data.mv_size = sizeof(MDB_db);
                data.mv_data = &dummy;
                memset(&dummy, 0, sizeof(dummy));
@@ -9409,10 +9565,12 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
                dbflag |= DB_DIRTY;
        }
 
-       /* OK, got info, add to table */
-       if (rc == MDB_SUCCESS) {
+       if (rc) {
+               free(namedup);
+       } else {
+               /* Got info, register DBI in this txn */
                unsigned int slot = unused ? unused : txn->mt_numdbs;
-               txn->mt_dbxs[slot].md_name.mv_data = strdup(name);
+               txn->mt_dbxs[slot].md_name.mv_data = namedup;
                txn->mt_dbxs[slot].md_name.mv_size = len;
                txn->mt_dbxs[slot].md_rel = NULL;
                txn->mt_dbflags[slot] = dbflag;
@@ -9562,6 +9720,7 @@ done:
        } else if (rc == MDB_NOTFOUND) {
                rc = MDB_SUCCESS;
        }
+       mc->mc_flags &= ~C_INITIALIZED;
        return rc;
 }
 
@@ -9850,3 +10009,24 @@ mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc)
 }
 #endif /* MDB_ROBUST_SUPPORTED */
 /** @} */
+
+#if defined(_WIN32)
+static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize)
+{
+       int need;
+       wchar_t *result;
+       need = MultiByteToWideChar(CP_UTF8, 0, src, srcsize, NULL, 0);
+       if (need == 0xFFFD)
+               return EILSEQ;
+       if (need == 0)
+               return EINVAL;
+       result = malloc(sizeof(wchar_t) * need);
+       if (!result)
+               return ENOMEM;
+       MultiByteToWideChar(CP_UTF8, 0, src, srcsize, result, need);
+       if (dstsize)
+               *dstsize = need;
+       *dst = result;
+       return 0;
+}
+#endif /* defined(_WIN32) */