Re-use old pages

author Howard Chu <hyc@symas.com>

Fri, 1 Jul 2011 10:56:09 +0000 (03:56 -0700)

committer Howard Chu <hyc@symas.com>

Thu, 1 Sep 2011 22:53:33 +0000 (15:53 -0700)
author Howard Chu <hyc@symas.com>
Fri, 1 Jul 2011 10:56:09 +0000 (03:56 -0700)
committer Howard Chu <hyc@symas.com>
Thu, 1 Sep 2011 22:53:33 +0000 (15:53 -0700)
diff --git a/libraries/libmdb/Makefile b/libraries/libmdb/Makefile

index 13ce974e1b7314be135ebfc3fc8fed5e22f80692..94154e0d0a98f972a4922b38097e7843ea79d953 100644 (file)
--- a/libraries/libmdb/Makefile
+++ b/libraries/libmdb/Makefile
@@ -1,6 +1,7 @@
  CC     = gcc
  W      = -W -Wall -Wno-unused-parameter -Wcast-qual -Wbad-function-cast
-CFLAGS = -pthread -O2 -g $(W) $(XCFLAGS)
+OPT = -O2 -g
+CFLAGS = -pthread $(OPT) $(W) $(XCFLAGS)
  LDLIBS = -lssl
  
  all:   mtest mdb_stat
@@ -11,8 +12,8 @@ clean:
  test:  all
         ./mtest && ./mdb_stat testdb
  
-mdb_stat: mdb_stat.o mdb.o
-mtest:    mtest.o    mdb.o
+mdb_stat: mdb_stat.o mdb.o idl.o
+mtest:    mtest.o    mdb.o idl.o
  
  %:     %.o mdb.o
         $(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@
diff --git a/libraries/libmdb/idl.c b/libraries/libmdb/idl.c

new file mode 100644 (file)

index 0000000..e4aa1cc
--- /dev/null
+++ b/libraries/libmdb/idl.c
@@ -0,0 +1,95 @@
+/* Lifted from OpenLDAP back-bdb/idl.c */
+
+#include <strings.h>
+#include <sys/types.h>
+#include <assert.h>
+#include "idl.h"
+
+typedef ulong pgno_t;
+
+/* Sort the IDLs from highest to lowest */
+#define IDL_CMP(x,y)    ( x > y ? -1 : ( x < y ? 1 : 0 ) )
+
+unsigned mdb_idl_search( ID *ids, ID id )
+{
+       /*
+        * binary search of id in ids
+        * if found, returns position of id
+        * if not found, returns first position greater than id
+        */
+       unsigned base = 0;
+       unsigned cursor = 0;
+       int val = 0;
+       unsigned n = ids[0];
+
+       while( 0 < n ) {
+               int pivot = n >> 1;
+               cursor = base + pivot;
+               val = IDL_CMP( id, ids[cursor + 1] );
+
+               if( val < 0 ) {
+                       n = pivot;
+
+               } else if ( val > 0 ) {
+                       base = cursor + 1;
+                       n -= pivot + 1;
+
+               } else {
+                       return cursor + 1;
+               }
+       }
+       
+       if( val > 0 ) {
+               return cursor + 2;
+       } else {
+               return cursor + 1;
+       }
+}
+
+int mdb_idl_insert( ID *ids, ID id )
+{
+       unsigned x;
+
+       if (MDB_IDL_IS_RANGE( ids )) {
+               /* if already in range, treat as a dup */
+               if (id >= MDB_IDL_FIRST(ids) && id <= MDB_IDL_LAST(ids))
+                       return -1;
+               if (id < MDB_IDL_FIRST(ids))
+                       ids[1] = id;
+               else if (id > MDB_IDL_LAST(ids))
+                       ids[2] = id;
+               return 0;
+       }
+
+       x = mdb_idl_search( ids, id );
+       assert( x > 0 );
+
+       if( x < 1 ) {
+               /* internal error */
+               return -2;
+       }
+
+       if ( x <= ids[0] && ids[x] == id ) {
+               /* duplicate */
+               return -1;
+       }
+
+       if ( ++ids[0] >= MDB_IDL_DB_MAX ) {
+               if( id < ids[1] ) {
+                       ids[1] = id;
+                       ids[2] = ids[ids[0]-1];
+               } else if ( ids[ids[0]-1] < id ) {
+                       ids[2] = id;
+               } else {
+                       ids[2] = ids[ids[0]-1];
+               }
+               ids[0] = NOID;
+       
+       } else {
+               /* insert id */
+               AC_MEMCPY( &ids[x+1], &ids[x], (ids[0]-x) * sizeof(ID) );
+               ids[x] = id;
+       }
+
+       return 0;
+}
diff --git a/libraries/libmdb/idl.h b/libraries/libmdb/idl.h

new file mode 100644 (file)

index 0000000..288b16f
--- /dev/null
+++ b/libraries/libmdb/idl.h
@@ -0,0 +1,78 @@
+/* idl.h - ldap bdb back-end ID list header file */
+/* $OpenLDAP$ */
+/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
+ *
+ * Copyright 2000-2011 The OpenLDAP Foundation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
+
+#ifndef _MDB_IDL_H_
+#define _MDB_IDL_H_
+
+#define AC_MEMCPY(dst,src,size)        bcopy(src,dst,size)
+
+#define        ID      unsigned long
+#define        NOID    ((ID)~0)
+
+/* IDL sizes - likely should be even bigger
+ *   limiting factors: sizeof(ID), thread stack size
+ */
+#define        MDB_IDL_LOGN    16      /* DB_SIZE is 2^16, UM_SIZE is 2^17 */
+#define MDB_IDL_DB_SIZE                (1<<MDB_IDL_LOGN)
+#define MDB_IDL_UM_SIZE                (1<<(MDB_IDL_LOGN+1))
+#define MDB_IDL_UM_SIZEOF      (MDB_IDL_UM_SIZE * sizeof(ID))
+
+#define MDB_IDL_DB_MAX         (MDB_IDL_DB_SIZE-1)
+
+#define MDB_IDL_UM_MAX         (MDB_IDL_UM_SIZE-1)
+
+#define MDB_IDL_IS_RANGE(ids)  ((ids)[0] == NOID)
+#define MDB_IDL_RANGE_SIZE             (3)
+#define MDB_IDL_RANGE_SIZEOF   (MDB_IDL_RANGE_SIZE * sizeof(ID))
+#define MDB_IDL_SIZEOF(ids)            ((MDB_IDL_IS_RANGE(ids) \
+       ? MDB_IDL_RANGE_SIZE : ((ids)[0]+1)) * sizeof(ID))
+
+#define MDB_IDL_RANGE_FIRST(ids)       ((ids)[1])
+#define MDB_IDL_RANGE_LAST(ids)                ((ids)[2])
+
+#define MDB_IDL_RANGE( ids, f, l ) \
+       do { \
+               (ids)[0] = NOID; \
+               (ids)[1] = (f);  \
+               (ids)[2] = (l);  \
+       } while(0)
+
+#define MDB_IDL_ZERO(ids) \
+       do { \
+               (ids)[0] = 0; \
+               (ids)[1] = 0; \
+               (ids)[2] = 0; \
+       } while(0)
+
+#define MDB_IDL_IS_ZERO(ids) ( (ids)[0] == 0 )
+#define MDB_IDL_IS_ALL( range, ids ) ( (ids)[0] == NOID \
+       && (ids)[1] <= (range)[1] && (range)[2] <= (ids)[2] )
+
+#define MDB_IDL_CPY( dst, src ) (AC_MEMCPY( dst, src, MDB_IDL_SIZEOF( src ) ))
+
+#define MDB_IDL_ID( bdb, ids, id ) MDB_IDL_RANGE( ids, id, ((bdb)->bi_lastid) )
+#define MDB_IDL_ALL( bdb, ids ) MDB_IDL_RANGE( ids, 1, ((bdb)->bi_lastid) )
+
+#define MDB_IDL_FIRST( ids )   ( ids[1] )
+#define MDB_IDL_LAST( ids )            ( MDB_IDL_IS_RANGE(ids) \
+       ? ids[2] : ids[ids[0]] )
+
+#define MDB_IDL_N( ids )               ( MDB_IDL_IS_RANGE(ids) \
+       ? (ids[2]-ids[1])+1 : ids[0] )
+
+int mdb_idl_insert( ID *ids, ID id );
+
+#endif
diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c

index 07ea59de05b159692f4796bf6ce5b39c58588fbc..6127cf85cbadbb338b93cf5464942e8017cc333c 100644 (file)
--- a/libraries/libmdb/mdb.c
+++ b/libraries/libmdb/mdb.c
@@ -24,6 +24,7 @@
  #include <pthread.h>
  
  #include "mdb.h"
+#include "idl.h"
  
  #ifndef DEBUG
  #define DEBUG 1
@@ -124,6 +125,8 @@ typedef struct MDB_page {           /* represents a page of storage */
  #define IS_BRANCH(p)    F_ISSET((p)->mp_flags, P_BRANCH)
  #define IS_OVERFLOW(p)  F_ISSET((p)->mp_flags, P_OVERFLOW)
  
+#define OVPAGES(size, psize)   (PAGEHDRSZ + size + psize - 1) / psize;
+
  typedef struct MDB_meta {                      /* meta (footer) page content */
         uint32_t        mm_magic;
         uint32_t        mm_version;
@@ -149,6 +152,12 @@ typedef struct MDB_dpage {
  
  SIMPLEQ_HEAD(dirty_queue, MDB_dpage);
  
+typedef struct MDB_oldpages {
+       struct MDB_oldpages *mo_next;
+       ulong           mo_txnid;
+       pgno_t          mo_pages[1];    /* dynamic */
+} MDB_oldpages;
+
  typedef struct MDB_pageparent {
         MDB_page *mp_page;
         MDB_page *mp_parent;
@@ -199,7 +208,9 @@ struct MDB_txn {
         pgno_t          mt_next_pgno;   /* next unallocated page */
         pgno_t          mt_first_pgno;
         ulong           mt_txnid;
+       ulong           mt_oldest;
         MDB_env         *mt_env;        
+       pgno_t          *mt_free_pgs;   /* this is an IDL */
         union {
                 struct dirty_queue      *dirty_queue;   /* modified pages */
                 MDB_reader      *reader;
@@ -243,6 +254,8 @@ struct MDB_env {
         size_t          me_mapsize;
         off_t           me_size;                /* current file size */
         pthread_key_t   me_txkey;       /* thread-key for readers */
+       MDB_oldpages *me_pghead;
+       MDB_oldpages *me_pgtail;
  };
  
  #define NODESIZE        offsetof(MDB_node, mn_data)
@@ -375,6 +388,40 @@ static MDB_dpage *
  mdb_newpage(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num)
  {
         MDB_dpage *dp;
+       pgno_t pgno = P_INVALID;
+
+       if (txn->mt_env->me_pghead) {
+               ulong oldest = txn->mt_txnid - 2;
+               int i;
+               for (i=0; i<txn->mt_env->me_txns->mt_numreaders; i++) {
+                       if (txn->mt_env->me_txns->mt_readers[i].mr_txnid < oldest)
+                               oldest = txn->mt_env->me_txns->mt_readers[i].mr_txnid;
+               }
+               if (oldest > txn->mt_env->me_pghead->mo_txnid) {
+                       MDB_oldpages *mop = txn->mt_env->me_pghead;
+                       txn->mt_oldest = oldest;
+                       if (num > 1) {
+                               /* FIXME */
+                               ;
+                       } else {
+                               /* peel pages off tail, so we only have to truncate the list */
+                               pgno = MDB_IDL_LAST(mop->mo_pages);
+                               if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
+                                       mop->mo_pages[2]++;
+                                       if (mop->mo_pages[2] > mop->mo_pages[1])
+                                               mop->mo_pages[0] = 0;
+                               } else {
+                                       mop->mo_pages[0]--;
+                               }
+                               if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
+                                       txn->mt_env->me_pghead = mop->mo_next;
+                                       if (!txn->mt_env->me_pghead)
+                                               txn->mt_env->me_pgtail = NULL;
+                                       free(mop);
+                               }
+                       }
+               }
+       }
  
         if ((dp = malloc(txn->mt_env->me_meta.mm_stat.ms_psize * num + sizeof(MDB_dhead))) == NULL)
                 return NULL;
@@ -382,8 +429,12 @@ mdb_newpage(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num)
         dp->h.md_parent = parent;
         dp->h.md_pi = parent_idx;
         SIMPLEQ_INSERT_TAIL(txn->mt_u.dirty_queue, dp, h.md_next);
-       dp->p.mp_pgno = txn->mt_next_pgno;
-       txn->mt_next_pgno += num;
+       if (pgno == P_INVALID) {
+               dp->p.mp_pgno = txn->mt_next_pgno;
+               txn->mt_next_pgno += num;
+       } else {
+               dp->p.mp_pgno = pgno;
+       }
  
         return dp;
  }
@@ -400,9 +451,10 @@ mdb_touch(MDB_txn *txn, MDB_pageparent *pp)
  
         if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
                 MDB_dpage *dp;
-               DPRINTF("touching page %lu -> %lu", mp->mp_pgno, txn->mt_next_pgno);
                 if ((dp = mdb_newpage(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL)
                         return ENOMEM;
+               DPRINTF("touched page %lu -> %lu", mp->mp_pgno, dp->p.mp_pgno);
+               mdb_idl_insert(txn->mt_free_pgs, mp->mp_pgno);
                 pgno = dp->p.mp_pgno;
                 bcopy(mp, &dp->p, txn->mt_env->me_meta.mm_stat.ms_psize);
                 mp = &dp->p;
@@ -451,6 +503,13 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
  
                 pthread_mutex_lock(&env->me_txns->mt_wmutex);
                 env->me_txns->mt_txnid++;
+               txn->mt_free_pgs = malloc(MDB_IDL_UM_SIZEOF);
+               if (txn->mt_free_pgs == NULL) {
+                       free(txn->mt_u.dirty_queue);
+                       free(txn);
+                       return ENOMEM;
+               }
+               txn->mt_free_pgs[0] = 0;
         }
         txn->mt_txnid = env->me_txns->mt_txnid;
         if (rdonly) {
@@ -512,22 +571,32 @@ mdb_txn_abort(MDB_txn *txn)
         if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
                 txn->mt_u.reader->mr_txnid = 0;
         } else {
-               /* Discard all dirty pages.
+               /* Discard all dirty pages. Return any re-used pages
+                * to the free list.
                  */
+               MDB_IDL_ZERO(txn->mt_free_pgs);
                 while (!SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) {
                         dp = SIMPLEQ_FIRST(txn->mt_u.dirty_queue);
                         SIMPLEQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next);
+                       if (dp->p.mp_pgno <= env->me_meta.mm_last_pg)
+                               mdb_idl_insert(txn->mt_free_pgs, dp->p.mp_pgno);
                         free(dp);
                 }
-
-#if 0
-               DPRINTF("releasing write lock on txn %p", txn);
-               txn->bt->txn = NULL;
-               if (flock(txn->bt->fd, LOCK_UN) != 0) {
-                       DPRINTF("failed to unlock fd %d: %s",
-                           txn->bt->fd, strerror(errno));
+               /* put back to head of free list */
+               if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
+                       MDB_oldpages *mop;
+
+                       mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(txn->mt_free_pgs) - sizeof(pgno_t));
+                       mop->mo_next = env->me_pghead;
+                       mop->mo_txnid = txn->mt_oldest - 1;
+                       if (!env->me_pghead) {
+                               env->me_pgtail = mop;
+                       }
+                       env->me_pghead = mop;
+                       bcopy(txn->mt_free_pgs, mop->mo_pages, MDB_IDL_SIZEOF(txn->mt_free_pgs));
                 }
-#endif
+
+               free(txn->mt_free_pgs);
                 free(txn->mt_u.dirty_queue);
                 env->me_txn = NULL;
                 pthread_mutex_unlock(&env->me_txns->mt_wmutex);
@@ -636,7 +705,25 @@ mdb_txn_commit(MDB_txn *txn)
                 return n;
         }
         env->me_txn = NULL;
+
+       /* add to tail of free list */
+       if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
+               MDB_oldpages *mop;
+
+               mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(txn->mt_free_pgs) - sizeof(pgno_t));
+               mop->mo_next = NULL;
+               if (env->me_pghead) {
+                       env->me_pgtail->mo_next = mop;
+               } else {
+                       env->me_pghead = mop;
+               }
+               env->me_pgtail = mop;
+               bcopy(txn->mt_free_pgs, mop->mo_pages, MDB_IDL_SIZEOF(txn->mt_free_pgs));
+               mop->mo_txnid = txn->mt_txnid;
+       }
+
         pthread_mutex_unlock(&env->me_txns->mt_wmutex);
+       free(txn->mt_free_pgs);
         free(txn->mt_u.dirty_queue);
         free(txn);
         txn = NULL;
@@ -751,7 +838,7 @@ mdbenv_write_meta(MDB_txn *txn)
         meta.mm_txnid = txn->mt_txnid;
  
         off = 0;
-       if (env->me_metatoggle)
+       if (!env->me_metatoggle)
                 off = env->me_meta.mm_stat.ms_psize;
         off += PAGEHDRSZ;
  
@@ -1093,17 +1180,19 @@ mdbenv_get_page(MDB_env *env, pgno_t pgno)
  {
         MDB_page *p = NULL;
         MDB_txn *txn = env->me_txn;
+       int found = 0;
  
-       if (txn && pgno >= txn->mt_first_pgno &&
-               !SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) {
+       if (txn && !SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) {
                 MDB_dpage *dp;
                 SIMPLEQ_FOREACH(dp, txn->mt_u.dirty_queue, h.md_next) {
                         if (dp->p.mp_pgno == pgno) {
                                 p = &dp->p;
+                               found = 1;
                                 break;
                         }
                 }
-       } else {
+       }
+       if (!found) {
                 p = (MDB_page *)(env->me_map + env->me_meta.mm_stat.ms_psize * pgno);
         }
         return p;
@@ -1528,10 +1617,10 @@ mdbenv_new_page(MDB_env *env, uint32_t flags, int num)
         assert(env != NULL);
         assert(env->me_txn != NULL);
  
-       DPRINTF("allocating new mpage %lu, page size %u",
-           env->me_txn->mt_next_pgno, env->me_meta.mm_stat.ms_psize);
         if ((dp = mdb_newpage(env->me_txn, NULL, 0, num)) == NULL)
                 return NULL;
+       DPRINTF("allocated new mpage %lu, page size %u",
+           dp->p.mp_pgno, env->me_meta.mm_stat.ms_psize);
         dp->p.mp_flags = flags | P_DIRTY;
         dp->p.mp_lower = PAGEHDRSZ;
         dp->p.mp_upper = env->me_meta.mm_stat.ms_psize;
@@ -1603,8 +1692,7 @@ mdb_add_node(MDB_db *db, MDB_page *mp, indx_t indx,
                         /* Data already on overflow page. */
                         node_size += sizeof(pgno_t);
                 } else if (data->mv_size >= db->md_env->me_meta.mm_stat.ms_psize / MDB_MINKEYS) {
-                       int ovpages = PAGEHDRSZ + data->mv_size + db->md_env->me_meta.mm_stat.ms_psize - 1;
-                       ovpages /= db->md_env->me_meta.mm_stat.ms_psize;
+                       int ovpages = OVPAGES(data->mv_size, db->md_env->me_meta.mm_stat.ms_psize);
                         /* Put data on overflow page. */
                         DPRINTF("data size is %zu, put on overflow page",
                             data->mv_size);
@@ -2056,6 +2144,18 @@ mdb_del(MDB_db *bt, MDB_txn *txn,
                 return rc;
  
         mdb_del_node(bt, mpp.mp_page, ki);
+       /* add overflow pages to free list */
+       if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
+               int i, ovpages;
+               pgno_t pg;
+
+               bcopy(NODEDATA(leaf), &pg, sizeof(pg));
+               ovpages = OVPAGES(NODEDSZ(leaf), bt->md_env->me_meta.mm_stat.ms_psize);
+               for (i=0; i<ovpages; i++) {
+                       mdb_idl_insert(txn->mt_free_pgs, pg);
+                       pg++;
+               }
+       }
         bt->md_env->me_meta.mm_stat.ms_entries--;
         rc = mdb_rebalance(bt, &mpp);
         if (rc != MDB_SUCCESS)
author	Howard Chu <hyc@symas.com>
	Fri, 1 Jul 2011 10:56:09 +0000 (03:56 -0700)
committer	Howard Chu <hyc@symas.com>
	Thu, 1 Sep 2011 22:53:33 +0000 (15:53 -0700)
libraries/libmdb/Makefile		patch \| blob \| history
libraries/libmdb/idl.c	[new file with mode: 0644]	patch \| blob
libraries/libmdb/idl.h	[new file with mode: 0644]	patch \| blob
libraries/libmdb/mdb.c		patch \| blob \| history