git.sur5r.net Git - openldap/commitdiff
Drop me_pgfree, add mdb_freelist_save().
author Hallvard Furuseth <hallvard@openldap.org>
Wed, 12 Jun 2013 15:20:42 +0000 (17:20 +0200)
committer Hallvard Furuseth <hallvard@openldap.org>
Wed, 12 Jun 2013 15:20:42 +0000 (17:20 +0200)
Split up saving me_pghead, to make me_pgfree unneeded. Also mf_pghead
is now a midl. Needed after e7f6767ea815fe0ada1f95037dfdec176ec4d5bb
("Return fresh overflow pages to current pghead").
Tweak MDB_DEBUG freelist output, make it ascending.

libraries/liblmdb/mdb.c

index f52dda729141257e2f591bd835cf28a2383ab11a..3f314f4cf2107a1853beb4b4a0e7018314af7f18 100644 (file)
@@ -928,7 +928,6 @@ typedef struct MDB_xcursor {
 typedef struct MDB_pgstate {
        txnid_t         mf_pglast;      /**< ID of last old page record we used */
        pgno_t          *mf_pghead;     /**< old pages reclaimed from freelist */
-       pgno_t          *mf_pgfree;     /**< memory to free when dropping me_pghead */
 } MDB_pgstate;
 
        /** The database environment. */
@@ -963,14 +962,13 @@ struct MDB_env {
        MDB_pgstate     me_pgstate;             /**< state of old pages from freeDB */
 #      define          me_pglast       me_pgstate.mf_pglast
 #      define          me_pghead       me_pgstate.mf_pghead
-#      define          me_pgfree       me_pgstate.mf_pgfree
        MDB_page        *me_dpages;             /**< list of malloc'd blocks for re-use */
        /** IDL of pages that became unused in a write txn */
        MDB_IDL         me_free_pgs;
        /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
        MDB_ID2L        me_dirty_list;
        /** Max number of freelist items that can fit in a single overflow page */
-       unsigned int    me_maxfree_1pg;
+       int                     me_maxfree_1pg;
        /** Max size of a node on a page */
        unsigned int    me_nodemax;
 #ifdef _WIN32
@@ -1408,7 +1406,7 @@ again:
                                if (!mop)
                                        return ENOMEM;
                                txn->mt_env->me_pglast = last;
-                               txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop;
+                               txn->mt_env->me_pghead = mop;
                                memcpy(mop, idl, MDB_IDL_SIZEOF(idl));
 
 #if MDB_DEBUG > 1
@@ -1416,9 +1414,8 @@ again:
                                        unsigned int i;
                                        DPRINTF("IDL read txn %zu root %zu num %zu",
                                                last, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
-                                       for (i=0; i<idl[0]; i++) {
-                                               DPRINTF("IDL %zu", idl[i+1]);
-                                       }
+                                       for (i = idl[0]; i; i--)
+                                               DPRINTF("IDL %zu", idl[i]);
                                }
 #endif
                        }
@@ -1487,8 +1484,8 @@ none:
                                                                mop2[k--] = mop[j--];
                                                }
                                                txn->mt_env->me_pglast = last;
-                                               mdb_midl_free(txn->mt_env->me_pgfree);
-                                               txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop2;
+                                               mdb_midl_free(txn->mt_env->me_pghead);
+                                               txn->mt_env->me_pghead = mop2;
                                                mop = mop2;
                                                /* Keep trying to read until we have enough */
                                                if (mop[0] < (unsigned)num) {
@@ -1521,8 +1518,8 @@ none:
                                mop[0]--;
                        }
                        if (MDB_IDL_IS_ZERO(mop)) {
-                               mdb_midl_free(txn->mt_env->me_pgfree);
-                               txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL;
+                               mdb_midl_free(txn->mt_env->me_pghead);
+                               txn->mt_env->me_pghead = NULL;
                        }
                }
        }
@@ -1996,7 +1993,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
                        else
                                rc = ENOMEM;
                }
-               env->me_pgfree = env->me_pghead;
                if (!rc)
                        rc = mdb_cursor_shadow(parent, txn);
                if (rc)
@@ -2068,7 +2064,7 @@ mdb_txn_reset0(MDB_txn *txn)
                if (!(env->me_flags & MDB_WRITEMAP)) {
                        mdb_dlist_free(txn);
                }
-               mdb_midl_free(env->me_pgfree);
+               mdb_midl_free(env->me_pghead);
 
                if (txn->mt_parent) {
                        txn->mt_parent->mt_child = NULL;
@@ -2081,7 +2077,7 @@ mdb_txn_reset0(MDB_txn *txn)
                                env->me_free_pgs = txn->mt_free_pgs;
                }
 
-               txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL;
+               txn->mt_env->me_pghead = NULL;
                txn->mt_env->me_pglast = 0;
 
                env->me_txn = NULL;
@@ -2128,6 +2124,149 @@ mdb_txn_abort(MDB_txn *txn)
        free(txn);
 }
 
+/** Save the freelist as of this transaction to the freeDB.
+ * Writing the records can itself change the freelist, so retry until it is stable.
+ * @return MDB_SUCCESS, else the non-zero code from the failing cursor/page call. */
+static int
+mdb_freelist_save(MDB_txn *txn)
+{
+       /* env->me_pghead[] can grow and shrink during this call.
+        * env->me_pglast and txn->mt_free_pgs[] can only grow.
+        * Page numbers cannot disappear from txn->mt_free_pgs[].
+        */
+       MDB_cursor mc; /* cursor on FREE_DBI, used for every read/write below */
+       MDB_env *env = txn->mt_env;
+       int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; /* more: one extra pass allowed when total_room > mop_len */
+       txnid_t pglast = 0, head_id = 0; /* pglast: key of last record deleted; head_id: key of record being reserved */
+       pgno_t  freecnt = 0, *free_pgs, *mop; /* freecnt: mt_free_pgs[] length at the last reserve */
+       ssize_t head_room = 0, total_room = 0, mop_len; /* IDL slots reserved in record head_id / in all reserved records */
+
+       mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
+
+       if (env->me_pghead || env->me_pglast) {
+               /* Make sure first page of freeDB is touched and on freelist */
+               rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
+               if (rc && rc != MDB_NOTFOUND)
+                       return rc;
+       }
+
+       for (;;) {
+               /* Come back here after each Put() in case freelist changed */
+               MDB_val key, data;
+
+               /* If using records from freeDB which we have not yet
+                * deleted, delete them and any we reserved for me_pghead.
+                */
+               while (pglast < env->me_pglast) {
+                       rc = mdb_cursor_first(&mc, &key, NULL);
+                       if (rc)
+                               return rc;
+                       pglast = head_id = *(txnid_t *)key.mv_data;
+                       total_room = head_room = 0; /* reservation bookkeeping restarts from zero */
+                       assert(pglast <= env->me_pglast);
+                       rc = mdb_cursor_del(&mc, 0);
+                       if (rc)
+                               return rc;
+               }
+
+               /* Save the IDL of pages freed by this txn, to a single record */
+               if (freecnt < txn->mt_free_pgs[0]) {
+                       if (!freecnt) {
+                               /* Make sure last page of freeDB is touched and on freelist */
+                               key.mv_size = MDB_MAXKEYSIZE+1;
+                               key.mv_data = NULL;
+                               rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
+                               if (rc && rc != MDB_NOTFOUND)
+                                       return rc;
+                       }
+                       free_pgs = txn->mt_free_pgs;
+                       /* Write to last page of freeDB */
+                       key.mv_size = sizeof(txn->mt_txnid);
+                       key.mv_data = &txn->mt_txnid;
+                       do {
+                               freecnt = free_pgs[0];
+                               data.mv_size = MDB_IDL_SIZEOF(free_pgs);
+                               rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
+                               if (rc)
+                                       return rc;
+                               /* Retry if mt_free_pgs[] grew during the Put() */
+                               free_pgs = txn->mt_free_pgs;
+                       } while (freecnt < free_pgs[0]);
+                       mdb_midl_sort(free_pgs);
+                       memcpy(data.mv_data, free_pgs, data.mv_size); /* fill the space reserved by the last Put() */
+#if MDB_DEBUG > 1
+                       {
+                               unsigned int i = free_pgs[0];
+                               DPRINTF("IDL write txn %zu root %zu num %u",
+                                       txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i);
+                               for (; i; i--)
+                                       DPRINTF("IDL %zu", free_pgs[i]);
+                       }
+#endif
+                       continue;
+               }
+
+               mop = env->me_pghead;
+               mop_len = mop ? mop[0] : 0; /* number of page numbers in me_pghead, 0 if it is NULL */
+
+               /* Reserve records for me_pghead[]. Split it if multi-page,
+                * to avoid searching freeDB for a page range. Use keys in
+                * range [1,me_pglast]: Smaller than txnid of oldest reader.
+                */
+               if (total_room >= mop_len) {
+                       if (total_room == mop_len || --more < 0)
+                               break;
+               } else if (head_room >= maxfree_1pg && head_id > 1) {
+                       /* Keep current record (overflow page), add a new one */
+                       head_id--;
+                       head_room = 0;
+               }
+               /* (Re)write {key = head_id, IDL length = head_room} */
+               total_room -= head_room; /* forget this record's old size; re-added after the Put() */
+               head_room = mop_len - total_room; /* slots still needed beyond the other records */
+               if (head_room > maxfree_1pg && head_id > 1) {
+                       /* Overflow multi-page for part of me_pghead */
+                       head_room /= head_id; /* amortize page sizes */
+                       head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); /* round up so head_room+1 is a multiple of maxfree_1pg+1 */
+               } else if (head_room < 0) {
+                       /* Rare case, not bothering to delete this record */
+                       head_room = 0;
+               }
+               key.mv_size = sizeof(head_id);
+               key.mv_data = &head_id;
+               data.mv_size = (head_room + 1) * sizeof(pgno_t); /* +1 for the IDL length slot */
+               rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
+               if (rc)
+                       return rc;
+               *(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */
+               total_room += head_room;
+       }
+
+       /* Fill in the reserved, touched me_pghead records. Avoid write ops
+        * so they cannot rearrange anything, just read the destinations.
+        */
+       rc = MDB_SUCCESS;
+       if (mop_len) {
+               MDB_val key, data;
+
+               mop += mop_len + 1; /* one past the last entry; chunks are copied from the end backwards */
+               rc = mdb_cursor_first(&mc, &key, &data);
+               for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
+                       MDB_IDL dest = data.mv_data;
+                       ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; /* capacity of this record, excluding its length slot */
+
+                       assert(len >= 0 && *(txnid_t*)key.mv_data <= env->me_pglast);
+                       if (len > mop_len)
+                               len = mop_len;
+                       *dest++ = len; /* length slot first, then the page numbers */
+                       memcpy(dest, mop -= len, len * sizeof(MDB_ID));
+                       if (! (mop_len -= len))
+                               break; /* all of me_pghead has been stored */
+               }
+       }
+       return rc;
+}
+
 int
 mdb_txn_commit(MDB_txn *txn)
 {
@@ -2137,9 +2276,7 @@ mdb_txn_commit(MDB_txn *txn)
        off_t            size;
        MDB_page        *dp;
        MDB_env *env;
-       pgno_t  next, freecnt;
-       txnid_t oldpg_txnid, id;
-       MDB_cursor mc;
+       pgno_t  next;
 
        assert(txn != NULL);
        assert(txn->mt_env != NULL);
@@ -2234,7 +2371,7 @@ mdb_txn_commit(MDB_txn *txn)
                parent->mt_dirty_room = txn->mt_dirty_room;
 
                txn->mt_parent->mt_child = NULL;
-               free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pgfree);
+               mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
                free(txn);
                return MDB_SUCCESS;
        }
@@ -2255,6 +2392,7 @@ mdb_txn_commit(MDB_txn *txn)
 
        /* Update DB root pointers */
        if (txn->mt_numdbs > 2) {
+               MDB_cursor mc;
                MDB_dbi i;
                MDB_val data;
                data.mv_size = sizeof(MDB_db);
@@ -2270,142 +2408,12 @@ mdb_txn_commit(MDB_txn *txn)
                }
        }
 
-       /* Save the freelist as of this transaction to the freeDB. This
-        * can change the freelist, so keep trying until it stabilizes.
-        *
-        * env->me_pglast and the length of txn->mt_free_pgs cannot decrease,
-        * except the code below can decrease env->me_pglast to split pghead.
-        * Page numbers cannot disappear from txn->mt_free_pgs.  New pages
-        * can only appear in env->me_pghead when env->me_pglast increases.
-        * Until then, the me_pghead pointer won't move but can become NULL.
-        */
-
-       mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
-       oldpg_txnid = id = 0;
-       freecnt = 0;
-
-       /* should only be one record now */
-       if (env->me_pghead || env->me_pglast) {
-               /* make sure first page of freeDB is touched and on freelist */
-               rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
-               if (rc && rc != MDB_NOTFOUND) {
-fail:
-                       mdb_txn_abort(txn);
-                       return rc;
-               }
-       }
-
-       /* Delete IDLs we used from the free list */
-       if (env->me_pglast) {
-               MDB_val key;
-
-               do {
-free_pgfirst:
-                       rc = mdb_cursor_first(&mc, &key, NULL);
-                       if (rc)
-                               goto fail;
-                       oldpg_txnid = *(txnid_t *)key.mv_data;
-again:
-                       assert(oldpg_txnid <= env->me_pglast);
-                       id = 0;
-                       rc = mdb_cursor_del(&mc, 0);
-                       if (rc)
-                               goto fail;
-               } while (oldpg_txnid < env->me_pglast);
-       }
-
-       /* Save IDL of pages freed by this txn, to freeDB */
-free2:
-       if (freecnt != txn->mt_free_pgs[0]) {
-               MDB_val key, data;
-
-               /* make sure last page of freeDB is touched and on freelist */
-               key.mv_size = MDB_MAXKEYSIZE+1;
-               key.mv_data = NULL;
-               rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
-               if (rc && rc != MDB_NOTFOUND)
-                       goto fail;
-
-#if MDB_DEBUG > 1
-               {
-                       unsigned int i;
-                       MDB_IDL idl = txn->mt_free_pgs;
-                       mdb_midl_sort(txn->mt_free_pgs);
-                       DPRINTF("IDL write txn %zu root %zu num %zu",
-                               txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
-                       for (i=1; i<=idl[0]; i++) {
-                               DPRINTF("IDL %zu", idl[i]);
-                       }
-               }
-#endif
-               /* write to last page of freeDB */
-               key.mv_size = sizeof(pgno_t);
-               key.mv_data = &txn->mt_txnid;
-               /* The free list can still grow during this call,
-                * despite the pre-emptive touches above. So retry
-                * until the reserved space remains big enough.
-                */
-               do {
-                       assert(freecnt < txn->mt_free_pgs[0]);
-                       freecnt = txn->mt_free_pgs[0];
-                       data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
-                       rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
-                       if (rc)
-                               goto fail;
-               } while (freecnt != txn->mt_free_pgs[0]);
-               mdb_midl_sort(txn->mt_free_pgs);
-               memcpy(data.mv_data, txn->mt_free_pgs, data.mv_size);
-               if (oldpg_txnid < env->me_pglast || (!env->me_pghead && id))
-                       goto free_pgfirst;      /* used up freeDB[oldpg_txnid] */
-       }
-
-       /* Put back page numbers we took from freeDB but did not use */
-       if (env->me_pghead) {
-         for (;;) {
-               MDB_val key, data;
-               pgno_t orig, *mop;
-
-               mop = env->me_pghead;
-               id = env->me_pglast;
-               key.mv_size = sizeof(id);
-               key.mv_data = &id;
-               /* These steps may grow the freelist again
-                * due to freed overflow pages...
-                */
-               i = 2;
-               do {
-                       orig = mop[0];
-                       if (orig > env->me_maxfree_1pg && id > 4)
-                               orig = env->me_maxfree_1pg; /* Do not use more than 1 page */
-                       data.mv_size = (orig + 1) * sizeof(pgno_t);
-                       rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
-                       if (rc)
-                               goto fail;
-                       assert(!env->me_pghead || env->me_pglast);
-                       /* mop could have been used again here */
-                       if (id != env->me_pglast || env->me_pghead == NULL)
-                               goto again;             /* was completely used up */
-                       assert(mop == env->me_pghead);
-               } while (mop[0] < orig && --i);
-               memcpy(data.mv_data, mop, data.mv_size);
-               if (mop[0] <= orig)
-                       break;
-               *(pgno_t *)data.mv_data = orig;
-               mop[orig] = mop[0] - orig;
-               env->me_pghead = mop += orig;
-               /* Save more oldpages at the previous txnid. */
-               assert(env->me_pglast == id && id == oldpg_txnid);
-               env->me_pglast = --oldpg_txnid;
-         }
-       }
-
-       /* Check for growth of freelist again */
-       if (freecnt != txn->mt_free_pgs[0])
-               goto free2;
-
-       mdb_midl_free(env->me_pgfree);
-       env->me_pghead = env->me_pgfree = NULL;
+       rc = mdb_freelist_save(txn);
+       if (rc)
+               goto fail;
 
+       mdb_midl_free(env->me_pghead);
+       env->me_pghead = NULL;
        if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
                if (mdb_midl_shrink(&txn->mt_free_pgs))
                        env->me_free_pgs = txn->mt_free_pgs;
@@ -2535,6 +2543,10 @@ done:
        free(txn);
 
        return MDB_SUCCESS;
+
+fail:
+       mdb_txn_abort(txn);
+       return rc;
 }
 
 /** Read the environment parameters of a DB environment before
@@ -4293,7 +4305,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
                        rc = mdb_midl_grow(&mop, ovpages);
                        if (rc)
                                return rc;
-                       mc->mc_txn->mt_env->me_pghead = mc->mc_txn->mt_env->me_pgfree = mop;
+                       mc->mc_txn->mt_env->me_pghead = mop;
                }
                for (i = mop[0]; i>0; i--) {
                        if (mop[i] < pg)