From 27ab75ef36ef7c6d10da824572423d8906e69cc0 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 1 Jul 2011 03:56:09 -0700 Subject: [PATCH] Re-use old pages --- libraries/libmdb/Makefile | 7 +- libraries/libmdb/idl.c | 95 ++++++++++++++++++++++++++ libraries/libmdb/idl.h | 78 +++++++++++++++++++++ libraries/libmdb/mdb.c | 140 ++++++++++++++++++++++++++++++++------ 4 files changed, 297 insertions(+), 23 deletions(-) create mode 100644 libraries/libmdb/idl.c create mode 100644 libraries/libmdb/idl.h diff --git a/libraries/libmdb/Makefile b/libraries/libmdb/Makefile index 13ce974e1b..94154e0d0a 100644 --- a/libraries/libmdb/Makefile +++ b/libraries/libmdb/Makefile @@ -1,6 +1,7 @@ CC = gcc W = -W -Wall -Wno-unused-parameter -Wcast-qual -Wbad-function-cast -CFLAGS = -pthread -O2 -g $(W) $(XCFLAGS) +OPT = -O2 -g +CFLAGS = -pthread $(OPT) $(W) $(XCFLAGS) LDLIBS = -lssl all: mtest mdb_stat @@ -11,8 +12,8 @@ clean: test: all ./mtest && ./mdb_stat testdb -mdb_stat: mdb_stat.o mdb.o -mtest: mtest.o mdb.o +mdb_stat: mdb_stat.o mdb.o idl.o +mtest: mtest.o mdb.o idl.o %: %.o mdb.o $(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ diff --git a/libraries/libmdb/idl.c b/libraries/libmdb/idl.c new file mode 100644 index 0000000000..e4aa1cc3e7 --- /dev/null +++ b/libraries/libmdb/idl.c @@ -0,0 +1,95 @@ +/* Lifted from OpenLDAP back-bdb/idl.c */ + +#include +#include +#include +#include "idl.h" + +typedef ulong pgno_t; + +/* Sort the IDLs from highest to lowest */ +#define IDL_CMP(x,y) ( x > y ? -1 : ( x < y ? 1 : 0 ) ) + +unsigned mdb_idl_search( ID *ids, ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 0; + int val = 0; + unsigned n = ids[0]; + + while( 0 < n ) { + int pivot = n >> 1; + cursor = base + pivot; + val = IDL_CMP( id, ids[cursor + 1] ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor + 1; + n -= pivot + 1; + + } else { + return cursor + 1; + } + } + + if( val > 0 ) { + return cursor + 2; + } else { + return cursor + 1; + } +} + +int mdb_idl_insert( ID *ids, ID id ) +{ + unsigned x; + + if (MDB_IDL_IS_RANGE( ids )) { + /* if already in range, treat as a dup */ + if (id >= MDB_IDL_FIRST(ids) && id <= MDB_IDL_LAST(ids)) + return -1; + if (id < MDB_IDL_FIRST(ids)) + ids[1] = id; + else if (id > MDB_IDL_LAST(ids)) + ids[2] = id; + return 0; + } + + x = mdb_idl_search( ids, id ); + assert( x > 0 ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0] && ids[x] == id ) { + /* duplicate */ + return -1; + } + + if ( ++ids[0] >= MDB_IDL_DB_MAX ) { + if( id < ids[1] ) { + ids[1] = id; + ids[2] = ids[ids[0]-1]; + } else if ( ids[ids[0]-1] < id ) { + ids[2] = id; + } else { + ids[2] = ids[ids[0]-1]; + } + ids[0] = NOID; + + } else { + /* insert id */ + AC_MEMCPY( &ids[x+1], &ids[x], (ids[0]-x) * sizeof(ID) ); + ids[x] = id; + } + + return 0; +} diff --git a/libraries/libmdb/idl.h b/libraries/libmdb/idl.h new file mode 100644 index 0000000000..288b16f92c --- /dev/null +++ b/libraries/libmdb/idl.h @@ -0,0 +1,78 @@ +/* idl.h - ldap bdb back-end ID list header file */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 2000-2011 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _MDB_IDL_H_ +#define _MDB_IDL_H_ + +#define AC_MEMCPY(dst,src,size) bcopy(src,dst,size) + +#define ID unsigned long +#define NOID ((ID)~0) + +/* IDL sizes - likely should be even bigger + * limiting factors: sizeof(ID), thread stack size + */ +#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDB_IDL_DB_SIZE (1<bi_lastid) ) +#define MDB_IDL_ALL( bdb, ids ) MDB_IDL_RANGE( ids, 1, ((bdb)->bi_lastid) ) + +#define MDB_IDL_FIRST( ids ) ( ids[1] ) +#define MDB_IDL_LAST( ids ) ( MDB_IDL_IS_RANGE(ids) \ + ? ids[2] : ids[ids[0]] ) + +#define MDB_IDL_N( ids ) ( MDB_IDL_IS_RANGE(ids) \ + ? (ids[2]-ids[1])+1 : ids[0] ) + +int mdb_idl_insert( ID *ids, ID id ); + +#endif diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 07ea59de05..6127cf85cb 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -24,6 +24,7 @@ #include #include "mdb.h" +#include "idl.h" #ifndef DEBUG #define DEBUG 1 @@ -124,6 +125,8 @@ typedef struct MDB_page { /* represents a page of storage */ #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) #define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) +#define OVPAGES(size, psize) (PAGEHDRSZ + size + psize - 1) / psize; + typedef struct MDB_meta { /* meta (footer) page content */ uint32_t mm_magic; uint32_t mm_version; @@ -149,6 +152,12 @@ typedef struct MDB_dpage { SIMPLEQ_HEAD(dirty_queue, MDB_dpage); +typedef struct MDB_oldpages { + struct MDB_oldpages *mo_next; + ulong mo_txnid; + pgno_t mo_pages[1]; /* dynamic */ +} MDB_oldpages; + typedef struct MDB_pageparent { MDB_page *mp_page; MDB_page *mp_parent; @@ -199,7 +208,9 @@ struct MDB_txn { pgno_t mt_next_pgno; /* next unallocated page */ pgno_t mt_first_pgno; ulong mt_txnid; + ulong mt_oldest; MDB_env *mt_env; + pgno_t *mt_free_pgs; /* this is an IDL */ union { struct dirty_queue *dirty_queue; /* modified pages */ MDB_reader *reader; @@ -243,6 +254,8 @@ struct MDB_env { size_t me_mapsize; off_t me_size; /* current file size */ pthread_key_t me_txkey; /* thread-key for readers */ + MDB_oldpages *me_pghead; + MDB_oldpages *me_pgtail; }; #define NODESIZE offsetof(MDB_node, mn_data) @@ -375,6 +388,40 @@ static MDB_dpage * mdb_newpage(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) { MDB_dpage *dp; + pgno_t pgno = P_INVALID; + + if (txn->mt_env->me_pghead) { + ulong oldest = txn->mt_txnid - 2; + int i; + for (i=0; imt_env->me_txns->mt_numreaders; i++) { + if (txn->mt_env->me_txns->mt_readers[i].mr_txnid < oldest) + oldest = txn->mt_env->me_txns->mt_readers[i].mr_txnid; + } + if (oldest > txn->mt_env->me_pghead->mo_txnid) { + MDB_oldpages *mop = txn->mt_env->me_pghead; + txn->mt_oldest = oldest; + if (num > 1) { + /* FIXME */ + ; + } else { + /* peel pages off tail, so we only have to truncate the list */ + pgno = MDB_IDL_LAST(mop->mo_pages); + if (MDB_IDL_IS_RANGE(mop->mo_pages)) { + mop->mo_pages[2]++; + if (mop->mo_pages[2] > mop->mo_pages[1]) + mop->mo_pages[0] = 0; + } else { + mop->mo_pages[0]--; + } + if (MDB_IDL_IS_ZERO(mop->mo_pages)) { + txn->mt_env->me_pghead = mop->mo_next; + if (!txn->mt_env->me_pghead) + txn->mt_env->me_pgtail = NULL; + free(mop); + } + } + } + } if ((dp = malloc(txn->mt_env->me_meta.mm_stat.ms_psize * num + sizeof(MDB_dhead))) == NULL) return NULL; @@ -382,8 +429,12 @@ mdb_newpage(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) dp->h.md_parent = parent; dp->h.md_pi = parent_idx; SIMPLEQ_INSERT_TAIL(txn->mt_u.dirty_queue, dp, h.md_next); - dp->p.mp_pgno = txn->mt_next_pgno; - txn->mt_next_pgno += num; + if (pgno == P_INVALID) { + dp->p.mp_pgno = txn->mt_next_pgno; + txn->mt_next_pgno += num; + } else { + dp->p.mp_pgno = pgno; + } return dp; } @@ -400,9 +451,10 @@ mdb_touch(MDB_txn *txn, MDB_pageparent *pp) if (!F_ISSET(mp->mp_flags, P_DIRTY)) { MDB_dpage *dp; - DPRINTF("touching page %lu -> %lu", mp->mp_pgno, txn->mt_next_pgno); if ((dp = mdb_newpage(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL) return ENOMEM; + DPRINTF("touched page %lu -> %lu", mp->mp_pgno, dp->p.mp_pgno); + mdb_idl_insert(txn->mt_free_pgs, mp->mp_pgno); pgno = dp->p.mp_pgno; bcopy(mp, &dp->p, txn->mt_env->me_meta.mm_stat.ms_psize); mp = &dp->p; @@ -451,6 +503,13 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) pthread_mutex_lock(&env->me_txns->mt_wmutex); env->me_txns->mt_txnid++; + txn->mt_free_pgs = malloc(MDB_IDL_UM_SIZEOF); + if (txn->mt_free_pgs == NULL) { + free(txn->mt_u.dirty_queue); + free(txn); + return ENOMEM; + } + txn->mt_free_pgs[0] = 0; } txn->mt_txnid = env->me_txns->mt_txnid; if (rdonly) { @@ -512,22 +571,32 @@ mdb_txn_abort(MDB_txn *txn) if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { txn->mt_u.reader->mr_txnid = 0; } else { - /* Discard all dirty pages. + /* Discard all dirty pages. Return any re-used pages + * to the free list. */ + MDB_IDL_ZERO(txn->mt_free_pgs); while (!SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) { dp = SIMPLEQ_FIRST(txn->mt_u.dirty_queue); SIMPLEQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next); + if (dp->p.mp_pgno <= env->me_meta.mm_last_pg) + mdb_idl_insert(txn->mt_free_pgs, dp->p.mp_pgno); free(dp); } - -#if 0 - DPRINTF("releasing write lock on txn %p", txn); - txn->bt->txn = NULL; - if (flock(txn->bt->fd, LOCK_UN) != 0) { - DPRINTF("failed to unlock fd %d: %s", - txn->bt->fd, strerror(errno)); + /* put back to head of free list */ + if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { + MDB_oldpages *mop; + + mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(txn->mt_free_pgs) - sizeof(pgno_t)); + mop->mo_next = env->me_pghead; + mop->mo_txnid = txn->mt_oldest - 1; + if (!env->me_pghead) { + env->me_pgtail = mop; + } + env->me_pghead = mop; + bcopy(txn->mt_free_pgs, mop->mo_pages, MDB_IDL_SIZEOF(txn->mt_free_pgs)); } -#endif + + free(txn->mt_free_pgs); free(txn->mt_u.dirty_queue); env->me_txn = NULL; pthread_mutex_unlock(&env->me_txns->mt_wmutex); @@ -636,7 +705,25 @@ mdb_txn_commit(MDB_txn *txn) return n; } env->me_txn = NULL; + + /* add to tail of free list */ + if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { + MDB_oldpages *mop; + + mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(txn->mt_free_pgs) - sizeof(pgno_t)); + mop->mo_next = NULL; + if (env->me_pghead) { + env->me_pgtail->mo_next = mop; + } else { + env->me_pghead = mop; + } + env->me_pgtail = mop; + bcopy(txn->mt_free_pgs, mop->mo_pages, MDB_IDL_SIZEOF(txn->mt_free_pgs)); + mop->mo_txnid = txn->mt_txnid; + } + pthread_mutex_unlock(&env->me_txns->mt_wmutex); + free(txn->mt_free_pgs); free(txn->mt_u.dirty_queue); free(txn); txn = NULL; @@ -751,7 +838,7 @@ mdbenv_write_meta(MDB_txn *txn) meta.mm_txnid = txn->mt_txnid; off = 0; - if (env->me_metatoggle) + if (!env->me_metatoggle) off = env->me_meta.mm_stat.ms_psize; off += PAGEHDRSZ; @@ -1093,17 +1180,19 @@ mdbenv_get_page(MDB_env *env, pgno_t pgno) { MDB_page *p = NULL; MDB_txn *txn = env->me_txn; + int found = 0; - if (txn && pgno >= txn->mt_first_pgno && - !SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) { + if (txn && !SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) { MDB_dpage *dp; SIMPLEQ_FOREACH(dp, txn->mt_u.dirty_queue, h.md_next) { if (dp->p.mp_pgno == pgno) { p = &dp->p; + found = 1; break; } } - } else { + } + if (!found) { p = (MDB_page *)(env->me_map + env->me_meta.mm_stat.ms_psize * pgno); } return p; @@ -1528,10 +1617,10 @@ mdbenv_new_page(MDB_env *env, uint32_t flags, int num) assert(env != NULL); assert(env->me_txn != NULL); - DPRINTF("allocating new mpage %lu, page size %u", - env->me_txn->mt_next_pgno, env->me_meta.mm_stat.ms_psize); if ((dp = mdb_newpage(env->me_txn, NULL, 0, num)) == NULL) return NULL; + DPRINTF("allocated new mpage %lu, page size %u", + dp->p.mp_pgno, env->me_meta.mm_stat.ms_psize); dp->p.mp_flags = flags | P_DIRTY; dp->p.mp_lower = PAGEHDRSZ; dp->p.mp_upper = env->me_meta.mm_stat.ms_psize; @@ -1603,8 +1692,7 @@ mdb_add_node(MDB_db *db, MDB_page *mp, indx_t indx, /* Data already on overflow page. */ node_size += sizeof(pgno_t); } else if (data->mv_size >= db->md_env->me_meta.mm_stat.ms_psize / MDB_MINKEYS) { - int ovpages = PAGEHDRSZ + data->mv_size + db->md_env->me_meta.mm_stat.ms_psize - 1; - ovpages /= db->md_env->me_meta.mm_stat.ms_psize; + int ovpages = OVPAGES(data->mv_size, db->md_env->me_meta.mm_stat.ms_psize); /* Put data on overflow page. */ DPRINTF("data size is %zu, put on overflow page", data->mv_size); @@ -2056,6 +2144,18 @@ mdb_del(MDB_db *bt, MDB_txn *txn, return rc; mdb_del_node(bt, mpp.mp_page, ki); + /* add overflow pages to free list */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + int i, ovpages; + pgno_t pg; + + bcopy(NODEDATA(leaf), &pg, sizeof(pg)); + ovpages = OVPAGES(NODEDSZ(leaf), bt->md_env->me_meta.mm_stat.ms_psize); + for (i=0; imt_free_pgs, pg); + pg++; + } + } bt->md_env->me_meta.mm_stat.ms_entries--; rc = mdb_rebalance(bt, &mpp); if (rc != MDB_SUCCESS) -- 2.39.5