]> git.sur5r.net Git - openldap/blobdiff - libraries/liblmdb/mdb.c
Rationalize mdb_env_copy2 API
[openldap] / libraries / liblmdb / mdb.c
index d64c954990bef66080248a21109d2fa5af27ba98..5acdba4dac75bd50f58d1123edcd3bf6df637cba 100644 (file)
 #define THREAD_RET     DWORD
 #define pthread_t      HANDLE
 #define pthread_mutex_t        HANDLE
+#define pthread_cond_t HANDLE
 #define pthread_key_t  DWORD
 #define pthread_self() GetCurrentThreadId()
 #define pthread_key_create(x,y)        \
 #define pthread_setspecific(x,y)       (TlsSetValue(x,y) ? 0 : ErrCode())
 #define pthread_mutex_unlock(x)        ReleaseMutex(x)
 #define pthread_mutex_lock(x)  WaitForSingleObject(x, INFINITE)
+#define pthread_cond_signal(x) SetEvent(*x)
+#define pthread_cond_wait(cond,mutex)  SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE)
 #define THREAD_CREATE(thr,start,arg)   thr=CreateThread(NULL,0,start,arg,0,NULL)
 #define THREAD_FINISH(thr)     WaitForSingleObject(thr, INFINITE)
 #define LOCK_MUTEX_R(env)      pthread_mutex_lock((env)->me_rmutex)
@@ -8033,8 +8036,10 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
 #define MDB_WBUF       (1024*1024)
 #endif
 
+       /** State needed for a compacting copy. */
 typedef struct mdb_copy {
-       pthread_mutex_t mc_mutex[2];
+       pthread_mutex_t mc_mutex;
+       pthread_cond_t mc_cond;
        char *mc_wbuf[2];
        char *mc_over[2];
        MDB_env *mc_env;
@@ -8044,9 +8049,12 @@ typedef struct mdb_copy {
        pgno_t mc_next_pgno;
        HANDLE mc_fd;
        int mc_status;
+       volatile int mc_new;
        int mc_toggle;
+
 } mdb_copy;
 
+       /** Dedicated writer thread for compacting copy. */
 static THREAD_RET
 mdb_env_copythr(void *arg)
 {
@@ -8061,14 +8069,17 @@ mdb_env_copythr(void *arg)
 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
 #endif
 
-       pthread_mutex_lock(&my->mc_mutex[toggle^1]);
+       pthread_mutex_lock(&my->mc_mutex);
+       my->mc_new = 0;
+       pthread_cond_signal(&my->mc_cond);
        for(;;) {
-               pthread_mutex_lock(&my->mc_mutex[toggle]);
-               pthread_mutex_unlock(&my->mc_mutex[toggle^1]);
-               if (!my->mc_wlen[toggle]) {
-                       pthread_mutex_unlock(&my->mc_mutex[toggle]);
+               while (!my->mc_new)
+                       pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
+               if (my->mc_new < 0) {
+                       my->mc_new = 0;
                        break;
                }
+               my->mc_new = 0;
                wsize = my->mc_wlen[toggle];
                ptr = my->mc_wbuf[toggle];
 again:
@@ -8087,10 +8098,8 @@ again:
                                break;
                        }
                }
-               my->mc_wlen[toggle] = wsize;
                if (rc) {
                        my->mc_status = rc;
-                       pthread_mutex_unlock(&my->mc_mutex[toggle]);
                        break;
                }
                /* If there's an overflow page tail, write it too */
@@ -8100,29 +8109,36 @@ again:
                        my->mc_olen[toggle] = 0;
                        goto again;
                }
+               my->mc_wlen[toggle] = 0;
                toggle ^= 1;
+               pthread_cond_signal(&my->mc_cond);
        }
+       pthread_cond_signal(&my->mc_cond);
+       pthread_mutex_unlock(&my->mc_mutex);
        return (THREAD_RET)0;
 #undef DO_WRITE
 }
 
+       /** Tell the writer thread there's a buffer ready to write */
 static int
-mdb_env_cthr_toggle(mdb_copy *my)
+mdb_env_cthr_toggle(mdb_copy *my, int st)
 {
        int toggle = my->mc_toggle ^ 1;
-
-       pthread_mutex_unlock(&my->mc_mutex[my->mc_toggle]);
-       pthread_mutex_lock(&my->mc_mutex[toggle]);
+       pthread_mutex_lock(&my->mc_mutex);
        if (my->mc_status) {
-               pthread_mutex_unlock(&my->mc_mutex[toggle]);
+               pthread_mutex_unlock(&my->mc_mutex);
                return my->mc_status;
        }
-       my->mc_wlen[toggle] = 0;
-       my->mc_olen[toggle] = 0;
+       while (my->mc_new == 1)
+               pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
+       my->mc_new = st;
        my->mc_toggle = toggle;
+       pthread_cond_signal(&my->mc_cond);
+       pthread_mutex_unlock(&my->mc_mutex);
        return 0;
 }
 
+       /** Depth-first tree traversal for compacting copy. */
 static int
 mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
 {
@@ -8134,6 +8150,10 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
        int rc, toggle;
        unsigned int i;
 
+       /* Empty DB, nothing to do */
+       if (*pg == P_INVALID)
+               return MDB_SUCCESS;
+
        mc.mc_snum = 1;
        mc.mc_top = 0;
        mc.mc_txn = txn;
@@ -8186,10 +8206,10 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
                                                if (rc)
                                                        goto done;
                                                if (my->mc_wlen[toggle] >= MDB_WBUF) {
-                                                       rc = mdb_env_cthr_toggle(my);
+                                                       rc = mdb_env_cthr_toggle(my, 1);
                                                        if (rc)
                                                                goto done;
-                                                       toggle ^= 1;
+                                                       toggle = my->mc_toggle;
                                                }
                                                mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
                                                memcpy(mo, omp, my->mc_env->me_psize);
@@ -8199,10 +8219,10 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
                                                if (omp->mp_pages > 1) {
                                                        my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
                                                        my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
-                                                       rc = mdb_env_cthr_toggle(my);
+                                                       rc = mdb_env_cthr_toggle(my, 1);
                                                        if (rc)
                                                                goto done;
-                                                       toggle ^= 1;
+                                                       toggle = my->mc_toggle;
                                                }
                                                memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t));
                                        } else if (ni->mn_flags & F_SUBDATA) {
@@ -8240,6 +8260,9 @@ again:
                                mc.mc_snum++;
                                mc.mc_ki[mc.mc_top] = 0;
                                if (IS_BRANCH(mp)) {
+                                       /* Whenever we advance to a sibling branch page,
+                                        * we must proceed all the way down to its first leaf.
+                                        */
                                        mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
                                        goto again;
                                } else
@@ -8248,10 +8271,10 @@ again:
                        }
                }
                if (my->mc_wlen[toggle] >= MDB_WBUF) {
-                       rc = mdb_env_cthr_toggle(my);
+                       rc = mdb_env_cthr_toggle(my, 1);
                        if (rc)
                                goto done;
-                       toggle ^= 1;
+                       toggle = my->mc_toggle;
                }
                mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
                mdb_page_copy(mo, mp, my->mc_env->me_psize);
@@ -8273,8 +8296,9 @@ done:
        return rc;
 }
 
-int
-mdb_env_copyfd2(MDB_env *env, HANDLE fd)
+       /** Copy environment with compaction. */
+static int
+mdb_env_copyfd1(MDB_env *env, HANDLE fd)
 {
        MDB_meta *mm;
        MDB_page *mp;
@@ -8284,14 +8308,14 @@ mdb_env_copyfd2(MDB_env *env, HANDLE fd)
        int rc;
 
 #ifdef _WIN32
-       my.mc_mutex[0] = CreateMutex(NULL, FALSE, NULL);
-       my.mc_mutex[1] = CreateMutex(NULL, FALSE, NULL);
+       my.mc_mutex = CreateMutex(NULL, FALSE, NULL);
+       my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL);
        my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_psize);
        if (my.mc_wbuf[0] == NULL)
                return errno;
 #else
-       pthread_mutex_init(&my.mc_mutex[0], NULL);
-       pthread_mutex_init(&my.mc_mutex[1], NULL);
+       pthread_mutex_init(&my.mc_mutex, NULL);
+       pthread_cond_init(&my.mc_cond, NULL);
        rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_psize, MDB_WBUF*2);
        if (rc)
                return rc;
@@ -8303,10 +8327,10 @@ mdb_env_copyfd2(MDB_env *env, HANDLE fd)
        my.mc_olen[1] = 0;
        my.mc_next_pgno = 2;
        my.mc_status = 0;
+       my.mc_new = 1;
        my.mc_toggle = 0;
        my.mc_env = env;
        my.mc_fd = fd;
-       pthread_mutex_lock(&my.mc_mutex[0]);
 
        /* Do the lock/unlock of the reader mutex before starting the
         * write txn.  Otherwise other read txns could block writers.
@@ -8329,6 +8353,7 @@ mdb_env_copyfd2(MDB_env *env, HANDLE fd)
                }
        }
 
+       THREAD_CREATE(thr, mdb_env_copythr, &my);
        mp = (MDB_page *)my.mc_wbuf[0];
        memset(mp, 0, 2*env->me_psize);
        mp->mp_pgno = 0;
@@ -8365,29 +8390,36 @@ mdb_env_copyfd2(MDB_env *env, HANDLE fd)
        }
        my.mc_wlen[0] = env->me_psize * 2;
        my.mc_txn = txn;
-       THREAD_CREATE(thr, mdb_env_copythr, &my);
+       pthread_mutex_lock(&my.mc_mutex);
+       while(my.mc_new)
+               pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
+       pthread_mutex_unlock(&my.mc_mutex);
        rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0);
        if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle])
-               rc = mdb_env_cthr_toggle(&my);
-       my.mc_wlen[my.mc_toggle] = 0;
-       pthread_mutex_unlock(&my.mc_mutex[my.mc_toggle]);
+               rc = mdb_env_cthr_toggle(&my, 1);
+       mdb_env_cthr_toggle(&my, -1);
+       pthread_mutex_lock(&my.mc_mutex);
+       while(my.mc_new)
+               pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
+       pthread_mutex_unlock(&my.mc_mutex);
        THREAD_FINISH(thr);
 leave:
        mdb_txn_abort(txn);
 #ifdef _WIN32
-       CloseHandle(my.mc_mutex[1]);
-       CloseHandle(my.mc_mutex[0]);
+       CloseHandle(my.mc_cond);
+       CloseHandle(my.mc_mutex);
        _aligned_free(my.mc_wbuf[0]);
 #else
-       pthread_mutex_destroy(&my.mc_mutex[1]);
-       pthread_mutex_destroy(&my.mc_mutex[0]);
+       pthread_cond_destroy(&my.mc_cond);
+       pthread_mutex_destroy(&my.mc_mutex);
        free(my.mc_wbuf[0]);
 #endif
        return rc;
 }
 
-int
-mdb_env_copyfd(MDB_env *env, HANDLE fd)
+       /** Copy environment as-is. */
+static int
+mdb_env_copyfd0(MDB_env *env, HANDLE fd)
 {
        MDB_txn *txn = NULL;
        int rc;
@@ -8490,8 +8522,23 @@ leave:
        return rc;
 }
 
-static int
-mdb_env_copy0(MDB_env *env, const char *path, int flag)
+int
+mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
+{
+       if (flags & MDB_CP_COMPACT)
+               return mdb_env_copyfd1(env, fd);
+       else
+               return mdb_env_copyfd0(env, fd);
+}
+
+int
+mdb_env_copyfd(MDB_env *env, HANDLE fd)
+{
+       return mdb_env_copyfd2(env, fd, 0);
+}
+
+int
+mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
 {
        int rc, len;
        char *lpath;
@@ -8536,10 +8583,7 @@ mdb_env_copy0(MDB_env *env, const char *path, int flag)
        }
 #endif
 
-       if (flag)
-               rc = mdb_env_copyfd2(env, newfd);
-       else
-               rc = mdb_env_copyfd(env, newfd);
+       rc = mdb_env_copyfd2(env, newfd, flags);
 
 leave:
        if (!(env->me_flags & MDB_NOSUBDIR))
@@ -8554,13 +8598,7 @@ leave:
 int
 mdb_env_copy(MDB_env *env, const char *path)
 {
-       return mdb_env_copy0(env, path, 0);
-}
-
-int
-mdb_env_copy2(MDB_env *env, const char *path)
-{
-       return mdb_env_copy0(env, path, 1);
+       return mdb_env_copy2(env, path, 0);
 }
 
 int