X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblmdb%2Fmdb.c;h=1e784ae11de1b8ad549037fd2b11aceed37a7cef;hb=29fd241fadc3dd49b3486f0e3556b029b716bcbf;hp=6afd615621cf500a5b5208880c186bda3fa15ca9;hpb=cf4fe3b1fb24d57eec80c2f7e22f73c568eb0241;p=openldap diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 6afd615621..1e784ae11d 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -79,17 +79,23 @@ extern int cacheflush(char *addr, int nbytes, int cache); #define CACHEFLUSH(addr, bytes, cache) #endif + #include #include #include #include #include #include -#include #include #include #include +#if defined(__sun) +/* Most platforms have posix_memalign, older may only have memalign */ +#define HAVE_MEMALIGN 1 +#include +#endif + #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) #include #include /* defines BYTE_ORDER on HPUX and Solaris */ @@ -1077,6 +1083,7 @@ typedef struct MDB_xcursor { typedef struct MDB_pgstate { pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ + txnid_t mf_pgoldest; /**< ID of oldest reader last time we looked */ } MDB_pgstate; /** The database environment. */ @@ -1104,6 +1111,7 @@ struct MDB_env { MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ + MDB_txn *me_txn0; /**< prealloc'd write transaction */ size_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ @@ -1114,6 +1122,7 @@ struct MDB_env { MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ # define me_pglast me_pgstate.mf_pglast # define me_pghead me_pgstate.mf_pghead +# define me_pgoldest me_pgstate.mf_pgoldest MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; @@ -1949,6 +1958,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) txnid_t oldest = 0, last; MDB_cursor_op op; MDB_cursor m2; + int found_old = 0; /* If there are any loose pages, just use them */ if (num == 1 && txn->mt_loose_pgs) { @@ -1990,8 +2000,8 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (op == MDB_FIRST) { /* 1st iteration */ /* Prepare to fetch more and coalesce */ - oldest = mdb_find_oldest(txn); last = env->me_pglast; + oldest = env->me_pgoldest; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); if (last) { op = MDB_SET_RANGE; @@ -2006,8 +2016,15 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) last++; /* Do not fetch more if the record will be too recent */ - if (oldest <= last) - break; + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } rc = mdb_cursor_get(&m2, &key, NULL, op); if (rc) { if (rc == MDB_NOTFOUND) @@ -2015,8 +2032,15 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) goto fail; } last = *(txnid_t*)key.mv_data; - if (oldest <= last) - break; + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) @@ -2601,6 +2625,10 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) } size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1); if (!(flags & MDB_RDONLY)) { + if (!parent) { + txn = env->me_txn0; + goto ok; + } size += env->me_maxdbs * sizeof(MDB_cursor *); /* child txns use parent's dbiseqs */ if (!parent) @@ -2628,6 +2656,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) } txn->mt_env = env; +ok: if (parent) { unsigned int i; txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); @@ -2670,9 +2699,10 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) } else { rc = mdb_txn_renew0(txn); } - if (rc) - free(txn); - else { + if (rc) { + if (txn != env->me_txn0) + free(txn); + } else { *ret = txn; DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', @@ -2799,7 +2829,8 @@ mdb_txn_abort(MDB_txn *txn) if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) txn->mt_u.reader->mr_pid = 0; - free(txn); + if (txn != txn->mt_env->me_txn0) + free(txn); } /** Save the freelist as of this transaction to the freeDB. @@ -3352,7 +3383,8 @@ done: if (env->me_txns) UNLOCK_MUTEX_W(env); - free(txn); + if (txn != env->me_txn0) + free(txn); return MDB_SUCCESS; @@ -3597,11 +3629,9 @@ fail: env->me_flags |= MDB_FATAL_ERROR; return rc; } -done: /* MIPS has cache coherency issues, this is a no-op everywhere else */ - if (!(env->me_flags & MDB_WRITEMAP)) { - CACHEFLUSH(env->me_map + off, len, DCACHE); - } + CACHEFLUSH(env->me_map + off, len, DCACHE); +done: /* Memory ordering issues are irrelevant; since the entire writer * is wrapped by wmutex, all of these changes will become visible * after the wmutex is unlocked. Since the DB is multi-version, @@ -4485,6 +4515,22 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode if (!((flags & MDB_RDONLY) || (env->me_pbuf = calloc(1, env->me_psize)))) rc = ENOMEM; + if (!(flags & MDB_RDONLY)) { + MDB_txn *txn; + int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * + (sizeof(MDB_db)+sizeof(MDB_cursor)+sizeof(unsigned int)+1); + txn = calloc(1, size); + if (txn) { + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + env->me_txn0 = txn; + } else { + rc = ENOMEM; + } + } } leave: @@ -5855,8 +5901,7 @@ fetchm: MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { MDB_GET_KEY(leaf, key); - if (data) - rc = mdb_node_read(mc->mc_txn, leaf, data); + rc = mdb_node_read(mc->mc_txn, leaf, data); break; } } @@ -8480,9 +8525,15 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) #else pthread_mutex_init(&my.mc_mutex, NULL); pthread_cond_init(&my.mc_cond, NULL); +#ifdef HAVE_MEMALIGN my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); if (my.mc_wbuf[0] == NULL) return errno; +#else + rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); + if (rc) + return rc; +#endif #endif memset(my.mc_wbuf[0], 0, MDB_WBUF*2); my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;