X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;ds=sidebyside;f=libraries%2Flibmdb%2Fmdb.c;h=431d1af5aeb41f6ca32fcdf507d1c668115a079c;hb=9d821c26c3023b1efe3270cf8a8ce5eca111189f;hp=8937a8e9fd98f69eb2c34e9cbb58d9df3e6190ab;hpb=01b9fc59fb2b8b4576ed43445b16b531d0b7b2ce;p=openldap diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 8937a8e9fd..431d1af5ae 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -1,4 +1,9 @@ -/* mdb.c - memory-mapped database library */ +/** @file mdb.c + * @brief memory-mapped database library + * + * A Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. + */ /* * Copyright 2011 Howard Chu, Symas Corp. * All rights reserved. @@ -30,12 +35,16 @@ #include #include #include +#ifdef _WIN32 +#include +#else #include #include #ifdef HAVE_SYS_FILE_H #include #endif #include +#endif #include #include @@ -46,15 +55,55 @@ #include #include #include + +#ifndef _WIN32 #include +#endif #include "mdb.h" +#include "midl.h" -#define ULONG unsigned long -typedef ULONG pgno_t; +/** @defgroup internal MDB Internals + * @{ + */ +/** @defgroup compat Windows Compatibility Macros + * @{ + */ +#ifdef _WIN32 +#define pthread_t DWORD +#define pthread_mutex_t HANDLE +#define pthread_key_t DWORD +#define pthread_self() GetCurrentThreadId() +#define pthread_key_create(x,y) *(x) = TlsAlloc() +#define pthread_key_delete(x) TlsFree(x) +#define pthread_getspecific(x) TlsGetValue(x) +#define pthread_setspecific(x,y) TlsSetValue(x,y) +#define pthread_mutex_unlock(x) ReleaseMutex(x) +#define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE) +#define LOCK_MUTEX_R(env) pthread_mutex_lock(env->me_rmutex) +#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(env->me_rmutex) +#define LOCK_MUTEX_W(env) pthread_mutex_lock(env->me_wmutex) +#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(env->me_wmutex) +#define getpid() GetCurrentProcessId() +#define fdatasync(fd) !FlushFileBuffers(fd) +#define 
ErrCode() GetLastError() +#define GetPageSize(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} +#define close(fd) CloseHandle(fd) +#define munmap(ptr,len) UnmapViewOfFile(ptr) +#else +#define LOCK_MUTEX_R(env) pthread_mutex_lock(&env->me_txns->mti_mutex) +#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&env->me_txns->mti_mutex) +#define LOCK_MUTEX_W(env) pthread_mutex_lock(&env->me_txns->mti_wmutex) +#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&env->me_txns->mti_wmutex) +#define ErrCode() errno +#define HANDLE int +#define INVALID_HANDLE_VALUE -1 +#define GetPageSize(x) (x) = sysconf(_SC_PAGE_SIZE) +#endif -#include "midl.h" +/** @} */ +#ifndef _WIN32 /* Note: If O_DSYNC is undefined but exists in /usr/include, * preferably set some compiler flag to get the definition. * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC. @@ -62,9 +111,13 @@ typedef ULONG pgno_t; #ifndef MDB_DSYNC # define MDB_DSYNC O_DSYNC #endif +#endif + +#define ULONG unsigned long +typedef ULONG pgno_t; #ifndef DEBUG -#define DEBUG 1 +#define DEBUG 0 #endif #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__)) @@ -91,6 +144,32 @@ typedef ULONG pgno_t; #define DKEY(x) #endif +/* The DB view is always consistent because all writes are wrapped in + * the wmutex. Finer-grained locks aren't necessary. 
+ */ +#ifndef LAZY_LOCKS +#define LAZY_LOCKS 1 +#endif +#if LAZY_LOCKS +#define LAZY_MUTEX_LOCK(x) +#define LAZY_MUTEX_UNLOCK(x) +#define LAZY_RWLOCK_UNLOCK(x) +#define LAZY_RWLOCK_WRLOCK(x) +#define LAZY_RWLOCK_RDLOCK(x) +#define LAZY_RWLOCK_DEF(x) +#define LAZY_RWLOCK_INIT(x,y) +#define LAZY_RWLOCK_DESTROY(x) +#else +#define LAZY_MUTEX_LOCK(x) pthread_mutex_lock(x) +#define LAZY_MUTEX_UNLOCK(x) pthread_mutex_unlock(x) +#define LAZY_RWLOCK_UNLOCK(x) pthread_rwlock_unlock(x) +#define LAZY_RWLOCK_WRLOCK(x) pthread_rwlock_wrlock(x) +#define LAZY_RWLOCK_RDLOCK(x) pthread_rwlock_rdlock(x) +#define LAZY_RWLOCK_DEF(x) pthread_rwlock_t x +#define LAZY_RWLOCK_INIT(x,y) pthread_rwlock_init(x,y) +#define LAZY_RWLOCK_DESTROY(x) pthread_rwlock_destroy(x) +#endif + #define P_INVALID (~0UL) #define F_ISSET(w, f) (((w) & (f)) == (f)) @@ -125,7 +204,16 @@ typedef struct MDB_reader { typedef struct MDB_txbody { uint32_t mtb_magic; uint32_t mtb_version; +/* For POSIX the actual mutexes reside in shared memory. + * On Windows, mutexes are allocated by the kernel; we store + * the name in shared memory so that other processes can + * grab them. 
+ */ +#ifdef _WIN32 + char mtb_rmname[32]; +#else pthread_mutex_t mtb_mutex; +#endif ULONG mtb_txnid; uint32_t mtb_numreaders; uint32_t mtb_me_toggle; @@ -137,14 +225,20 @@ typedef struct MDB_txninfo { #define mti_magic mt1.mtb.mtb_magic #define mti_version mt1.mtb.mtb_version #define mti_mutex mt1.mtb.mtb_mutex +#define mti_rmname mt1.mtb.mtb_rmname #define mti_txnid mt1.mtb.mtb_txnid #define mti_numreaders mt1.mtb.mtb_numreaders #define mti_me_toggle mt1.mtb.mtb_me_toggle char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; } mt1; union { +#ifdef _WIN32 + char mt2_wmname[32]; +#define mti_wmname mt2.mt2_wmname +#else pthread_mutex_t mt2_wmutex; #define mti_wmutex mt2.mt2_wmutex +#endif char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)]; } mt2; MDB_reader mti_readers[1]; @@ -242,8 +336,8 @@ typedef struct MDB_pageparent { unsigned mp_pi; } MDB_pageparent; -static MDB_dpage *mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num); -static int mdb_touch(MDB_txn *txn, MDB_pageparent *mp); +static MDB_dpage *mdb_alloc_page(MDB_txn *txn, MDB_dbi dbi, MDB_page *parent, unsigned int parent_idx, int num); +static int mdb_touch(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mp); typedef struct MDB_ppage { /* ordered list of pages */ MDB_page *mp_page; @@ -295,11 +389,10 @@ typedef struct MDB_dbx { struct MDB_txn { pgno_t mt_next_pgno; /* next unallocated page */ ULONG mt_txnid; - ULONG mt_oldest; MDB_env *mt_env; pgno_t *mt_free_pgs; /* this is an IDL */ union { - MIDL2 *dirty_list; /* modified pages */ + ID2L dirty_list; /* modified pages */ MDB_reader *reader; } mt_u; MDB_dbx *mt_dbxs; /* array */ @@ -308,8 +401,8 @@ struct MDB_txn { #define MDB_TXN_RDONLY 0x01 /* read-only transaction */ #define MDB_TXN_ERROR 0x02 /* an error has occurred */ -#define MDB_TXN_METOGGLE 0x04 /* used meta page 1 */ unsigned int mt_flags; + unsigned int mt_toggle; }; /* Context for sorted-dup records */ @@ -321,9 +414,9 @@ typedef struct MDB_xcursor 
{ } MDB_xcursor; struct MDB_env { - int me_fd; - int me_lfd; - int me_mfd; /* just for writing the meta pages */ + HANDLE me_fd; + HANDLE me_lfd; + HANDLE me_mfd; /* just for writing the meta pages */ #define MDB_FATAL_ERROR 0x80000000U uint32_t me_flags; uint32_t me_extrapad; /* unused for now */ @@ -334,7 +427,6 @@ struct MDB_env { char *me_map; MDB_txninfo *me_txns; MDB_meta *me_metas[2]; - MDB_meta *me_meta; MDB_txn *me_txn; /* current write transaction */ size_t me_mapsize; off_t me_size; /* current file size */ @@ -347,7 +439,12 @@ struct MDB_env { pthread_key_t me_txkey; /* thread-key for readers */ MDB_dpage *me_dpages; pgno_t me_free_pgs[MDB_IDL_UM_SIZE]; - MIDL2 me_dirty_list[MDB_IDL_DB_SIZE]; + ID2 me_dirty_list[MDB_IDL_DB_SIZE]; + LAZY_RWLOCK_DEF(me_dblock); +#ifdef _WIN32 + HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ + HANDLE me_wmutex; +#endif }; #define NODESIZE offsetof(MDB_node, mn_data) @@ -434,6 +531,12 @@ static int memncmp(const void *s1, size_t n1, static int memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2); +#ifdef _WIN32 +static SECURITY_DESCRIPTOR mdb_null_sd; +static SECURITY_ATTRIBUTES mdb_all_sa; +static int mdb_sec_inited; +#endif + static int memncmp(const void *s1, size_t n1, const void *s2, size_t n2) { @@ -470,13 +573,13 @@ memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2) char * mdb_version(int *maj, int *min, int *pat) { - *maj = MDB_VERSION_MAJOR; - *min = MDB_VERSION_MINOR; - *pat = MDB_VERSION_PATCH; + if (maj) *maj = MDB_VERSION_MAJOR; + if (min) *min = MDB_VERSION_MINOR; + if (pat) *pat = MDB_VERSION_PATCH; return MDB_VERSION_STRING; } -static char *const errstr[] = { +static char *const mdb_errstr[] = { "MDB_KEYEXIST: Key/data pair already exists", "MDB_NOTFOUND: No matching key/data pair found", "MDB_PAGE_NOTFOUND: Requested page not found", @@ -492,11 +595,12 @@ mdb_strerror(int err) return ("Successful return: 0"); if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH) - 
return errstr[err - MDB_KEYEXIST]; + return mdb_errstr[err - MDB_KEYEXIST]; return strerror(err); } +#if DEBUG static char * mdb_dkey(MDB_val *key, char *buf) { @@ -509,6 +613,7 @@ mdb_dkey(MDB_val *key, char *buf) ptr += sprintf(ptr, "%02x", *c++); return buf; } +#endif int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) @@ -544,73 +649,72 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) /* Allocate new page(s) for writing */ static MDB_dpage * -mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) +mdb_alloc_page(MDB_txn *txn, MDB_dbi dbi, MDB_page *parent, unsigned int parent_idx, int num) { MDB_dpage *dp; pgno_t pgno = P_INVALID; - ULONG oldest; - MIDL2 mid; + ID2 mid; if (txn->mt_txnid > 2) { - oldest = txn->mt_txnid - 1; - if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { - /* See if there's anything in the free DB */ - MDB_pageparent mpp; - MDB_node *leaf; - ULONG *kptr; - - mpp.mp_parent = NULL; - mpp.mp_pi = 0; - mdb_search_page(txn, FREE_DBI, NULL, NULL, 0, &mpp); - leaf = NODEPTR(mpp.mp_page, 0); - kptr = (ULONG *)NODEKEY(leaf); + if (!txn->mt_env->me_pghead && dbi != FREE_DBI && + txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { + /* See if there's anything in the free DB */ + MDB_pageparent mpp; + MDB_node *leaf; + ULONG *kptr, oldest; - /* It's potentially usable, unless there are still - * older readers outstanding. Grab it. 
- */ - if (oldest > *kptr) { - MDB_oldpages *mop; - MDB_val data; - pgno_t *idl; - - mdb_read_data(txn, leaf, &data); - idl = (ULONG *)data.mv_data; - mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t)); - mop->mo_next = txn->mt_env->me_pghead; - mop->mo_txnid = *kptr; - txn->mt_env->me_pghead = mop; - memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl)); + mpp.mp_parent = NULL; + mpp.mp_pi = 0; + mdb_search_page(txn, FREE_DBI, NULL, NULL, 0, &mpp); + leaf = NODEPTR(mpp.mp_page, 0); + kptr = (ULONG *)NODEKEY(leaf); -#if DEBUG > 1 { unsigned int i; - DPRINTF("IDL read txn %lu root %lu num %lu", - mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]); - for (i=0; imt_txnid - 1; + for (i=0; imt_env->me_txns->mti_numreaders; i++) { + ULONG mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid; + if (mr && mr < oldest) + oldest = mr; } } + + if (oldest > *kptr) { + /* It's usable, grab it. + */ + MDB_oldpages *mop; + MDB_val data; + pgno_t *idl; + + mdb_read_data(txn, leaf, &data); + idl = (ULONG *)data.mv_data; + mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t)); + mop->mo_next = txn->mt_env->me_pghead; + mop->mo_txnid = *kptr; + txn->mt_env->me_pghead = mop; + memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl)); + +#if DEBUG > 1 + { + unsigned int i; + DPRINTF("IDL read txn %lu root %lu num %lu", + mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]); + for (i=0; imt_env->me_pghead) { - unsigned int i; - for (i=0; imt_env->me_txns->mti_numreaders; i++) { - ULONG mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid; - if (!mr) continue; - if (mr < oldest) - oldest = txn->mt_env->me_txns->mti_readers[i].mr_txnid; + /* drop this IDL from the DB */ + mpp.mp_parent = NULL; + mpp.mp_pi = 0; + mdb_search_page(txn, FREE_DBI, NULL, NULL, 1, &mpp); + leaf = NODEPTR(mpp.mp_page, 0); + mdb_del0(txn, FREE_DBI, 0, &mpp, leaf); + } } - if (oldest > txn->mt_env->me_pghead->mo_txnid) { + if (txn->mt_env->me_pghead) { MDB_oldpages *mop = 
txn->mt_env->me_pghead; - txn->mt_oldest = oldest; if (num > 1) { /* FIXME: For now, always use fresh pages. We * really ought to search the free list for a @@ -634,7 +738,6 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) } } } - } if (pgno == P_INVALID) { /* DB size is maxed out */ @@ -659,7 +762,7 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) } mid.mid = dp->p.mp_pgno; mid.mptr = dp; - mdb_midl2_insert(txn->mt_u.dirty_list, &mid); + mdb_mid2l_insert(txn->mt_u.dirty_list, &mid); return dp; } @@ -667,7 +770,7 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) /* Touch a page: make it dirty and re-insert into tree with updated pgno. */ static int -mdb_touch(MDB_txn *txn, MDB_pageparent *pp) +mdb_touch(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *pp) { MDB_page *mp = pp->mp_page; pgno_t pgno; @@ -676,9 +779,9 @@ mdb_touch(MDB_txn *txn, MDB_pageparent *pp) if (!F_ISSET(mp->mp_flags, P_DIRTY)) { MDB_dpage *dp; - if ((dp = mdb_alloc_page(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL) + if ((dp = mdb_alloc_page(txn, dbi, pp->mp_parent, pp->mp_pi, 1)) == NULL) return ENOMEM; - DPRINTF("touched page %lu -> %lu", mp->mp_pgno, dp->p.mp_pgno); + DPRINTF("touched db %u page %lu -> %lu", dbi, mp->mp_pgno, dp->p.mp_pgno); assert(mp->mp_pgno != dp->p.mp_pgno); mdb_midl_insert(txn->mt_free_pgs, mp->mp_pgno); pgno = dp->p.mp_pgno; @@ -701,7 +804,7 @@ mdb_env_sync(MDB_env *env, int force) int rc = 0; if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (fdatasync(env->me_fd)) - rc = errno; + rc = ErrCode(); } return rc; } @@ -714,70 +817,55 @@ mdb_txn_renew0(MDB_txn *txn) { MDB_env *env = txn->mt_env; - int rc, toggle; - - if (env->me_flags & MDB_FATAL_ERROR) { - DPUTS("mdb_txn_begin: environment had fatal error, must shutdown!"); - return MDB_PANIC; - } - if (txn->mt_flags & MDB_TXN_RDONLY) { MDB_reader *r = pthread_getspecific(env->me_txkey); if (!r) { unsigned int i; - 
pthread_mutex_lock(&env->me_txns->mti_mutex); + pid_t pid = getpid(); + pthread_t tid = pthread_self(); + + LOCK_MUTEX_R(env); for (i=0; ime_txns->mti_numreaders; i++) if (env->me_txns->mti_readers[i].mr_pid == 0) break; if (i == env->me_maxreaders) { - pthread_mutex_unlock(&env->me_txns->mti_mutex); - return ENOSPC; + UNLOCK_MUTEX_R(env); + return ENOMEM; } - env->me_txns->mti_readers[i].mr_pid = getpid(); - env->me_txns->mti_readers[i].mr_tid = pthread_self(); - r = &env->me_txns->mti_readers[i]; - pthread_setspecific(env->me_txkey, r); + env->me_txns->mti_readers[i].mr_pid = pid; + env->me_txns->mti_readers[i].mr_tid = tid; if (i >= env->me_txns->mti_numreaders) env->me_txns->mti_numreaders = i+1; - pthread_mutex_unlock(&env->me_txns->mti_mutex); + UNLOCK_MUTEX_R(env); + r = &env->me_txns->mti_readers[i]; + pthread_setspecific(env->me_txkey, r); } txn->mt_txnid = env->me_txns->mti_txnid; + txn->mt_toggle = env->me_txns->mti_me_toggle; r->mr_txnid = txn->mt_txnid; txn->mt_u.reader = r; } else { - pthread_mutex_lock(&env->me_txns->mti_wmutex); + LOCK_MUTEX_W(env); txn->mt_txnid = env->me_txns->mti_txnid+1; + txn->mt_toggle = env->me_txns->mti_me_toggle; txn->mt_u.dirty_list = env->me_dirty_list; txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs = env->me_free_pgs; txn->mt_free_pgs[0] = 0; + txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1; env->me_txn = txn; } - toggle = env->me_txns->mti_me_toggle; - if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) { - mdb_txn_reset0(txn); - return rc; - } - /* Copy the DB arrays */ + LAZY_RWLOCK_RDLOCK(&env->me_dblock); txn->mt_numdbs = env->me_numdbs; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - memcpy(txn->mt_dbs, env->me_meta->mm_dbs, 2 * sizeof(MDB_db)); + memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db)); if (txn->mt_numdbs > 2) memcpy(txn->mt_dbs+2, env->me_dbs[env->me_db_toggle]+2, (txn->mt_numdbs - 2) * sizeof(MDB_db)); - - if (!(txn->mt_flags & 
MDB_TXN_RDONLY)) { - if (toggle) - txn->mt_flags |= MDB_TXN_METOGGLE; - txn->mt_next_pgno = env->me_meta->mm_last_pg+1; - } - - DPRINTF("begin txn %p %lu%c on mdbenv %p, root page %lu", txn, - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) env, txn->mt_dbs[MAIN_DBI].md_root); + LAZY_RWLOCK_UNLOCK(&env->me_dblock); return MDB_SUCCESS; } @@ -790,31 +878,36 @@ mdb_txn_renew(MDB_txn *txn) if (!txn) return EINVAL; + if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { - DPRINTF("reset txn %p %lu%c on mdbenv %p, root page %lu", txn, - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + DPRINTF("renew txn %lu%c %p on mdbenv %p, root page %lu", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); } return rc; } int -mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) +mdb_txn_begin(MDB_env *env, unsigned int flags, MDB_txn **ret) { MDB_txn *txn; int rc; if (env->me_flags & MDB_FATAL_ERROR) { - DPUTS("mdb_txn_begin: environment had fatal error, must shutdown!"); + DPUTS("environment had fatal error, must shutdown!"); return MDB_PANIC; } if ((txn = calloc(1, sizeof(MDB_txn) + env->me_maxdbs * sizeof(MDB_db))) == NULL) { - DPRINTF("calloc: %s", strerror(errno)); + DPRINTF("calloc: %s", strerror(ErrCode())); return ENOMEM; } txn->mt_dbs = (MDB_db *)(txn+1); - if (rdonly) { + if (flags & MDB_RDONLY) { txn->mt_flags |= MDB_TXN_RDONLY; } txn->mt_env = env; @@ -824,8 +917,8 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) free(txn); else { *ret = txn; - DPRINTF("begin txn %p %lu%c on mdbenv %p, root page %lu", txn, - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + DPRINTF("begin txn %lu%c %p on mdbenv %p, root page %lu", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root); } @@ -864,7 +957,7 @@ mdb_txn_reset0(MDB_txn *txn) env->me_txn = NULL; for (i=2; ime_numdbs; i++) env->me_dbxs[i].md_dirty = 0; - pthread_mutex_unlock(&env->me_txns->mti_wmutex); + UNLOCK_MUTEX_W(env); } } @@ -874,8 +967,8 @@ mdb_txn_reset(MDB_txn *txn) if (txn == NULL) return; - DPRINTF("reset txn %p %lu%c on mdbenv %p, root page %lu", txn, - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + DPRINTF("reset txn %lu%c %p on mdbenv %p, root page %lu", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); mdb_txn_reset0(txn); @@ -887,8 +980,8 @@ mdb_txn_abort(MDB_txn *txn) if (txn == NULL) return; - DPRINTF("abort txn %p %lu%c on mdbenv %p, root page %lu", txn, - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + DPRINTF("abort txn %lu%c %p on mdbenv %p, root page %lu", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); mdb_txn_reset0(txn); @@ -905,7 +998,6 @@ mdb_txn_commit(MDB_txn *txn) MDB_dpage *dp; MDB_env *env; pgno_t next; - struct iovec iov[MDB_COMMIT_PAGES]; assert(txn != NULL); assert(txn->mt_env != NULL); @@ -932,27 +1024,23 @@ mdb_txn_commit(MDB_txn *txn) if (!txn->mt_u.dirty_list[0].mid) goto done; - DPRINTF("committing txn %p %lu on mdbenv %p, root page %lu", txn, - txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root); + DPRINTF("committing txn %lu %p on mdbenv %p, root page %lu", + txn->mt_txnid, txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); /* should only be one record now */ if (env->me_pghead) { - MDB_val key, data; - MDB_oldpages *mop; + MDB_pageparent mpp; - mop = env->me_pghead; - key.mv_size = sizeof(pgno_t); - key.mv_data = (char *)&mop->mo_txnid; - data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); - data.mv_data = mop->mo_pages; - mdb_put0(txn, FREE_DBI, &key, &data, 0); - free(env->me_pghead); - env->me_pghead = 
NULL; + /* make sure first page of freeDB is touched and on freelist */ + mpp.mp_parent = NULL; + mpp.mp_pi = 0; + mdb_search_page(txn, FREE_DBI, NULL, NULL, 1, &mpp); } /* save to free list */ if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { MDB_val key, data; MDB_pageparent mpp; + ULONG i; /* make sure last page of freeDB is touched and on freelist */ key.mv_size = MAXKEYSIZE+1; @@ -975,9 +1063,34 @@ mdb_txn_commit(MDB_txn *txn) /* write to last page of freeDB */ key.mv_size = sizeof(pgno_t); key.mv_data = (char *)&txn->mt_txnid; - data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs); data.mv_data = txn->mt_free_pgs; + /* The free list can still grow during this call, + * despite the pre-emptive touches above. So check + * and make sure the entire thing got written. + */ + do { + i = txn->mt_free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs); + rc = mdb_put0(txn, FREE_DBI, &key, &data, 0); + if (rc) { + mdb_txn_abort(txn); + return rc; + } + } while (i != txn->mt_free_pgs[0]); + } + /* should only be one record now */ + if (env->me_pghead) { + MDB_val key, data; + MDB_oldpages *mop; + + mop = env->me_pghead; + key.mv_size = sizeof(pgno_t); + key.mv_data = (char *)&mop->mo_txnid; + data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); + data.mv_data = mop->mo_pages; mdb_put0(txn, FREE_DBI, &key, &data, 0); + free(env->me_pghead); + env->me_pghead = NULL; } /* Update DB root pointers. Their pages have already been @@ -1000,6 +1113,36 @@ mdb_txn_commit(MDB_txn *txn) next = 0; i = 1; do { +#ifdef _WIN32 + /* Windows actually supports scatter/gather I/O, but only on + * unbuffered file handles. Since we're relying on the OS page + * cache for all our data, that's self-defeating. So we just + * write pages one at a time. We use the ov structure to set + * the write offset, to at least save the overhead of a Seek + * system call. 
+ */ + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + for (; i<=txn->mt_u.dirty_list[0].mid; i++) { + dp = txn->mt_u.dirty_list[i].mptr; + DPRINTF("committing page %lu", dp->p.mp_pgno); + size = dp->p.mp_pgno * env->me_psize; + ov.Offset = size & 0xffffffff; + ov.OffsetHigh = size >> 16; + ov.OffsetHigh >>= 16; + /* clear dirty flag */ + dp->p.mp_flags &= ~P_DIRTY; + rc = WriteFile(env->me_fd, &dp->p, env->me_psize * dp->h.md_num, NULL, &ov); + if (!rc) { + n = ErrCode(); + DPRINTF("WriteFile: %d", n); + mdb_txn_abort(txn); + return n; + } + } + done = 1;; +#else + struct iovec iov[MDB_COMMIT_PAGES]; n = 0; done = 1; size = 0; @@ -1010,11 +1153,11 @@ mdb_txn_commit(MDB_txn *txn) DPRINTF("committing %u dirty pages", n); rc = writev(env->me_fd, iov, n); if (rc != size) { - n = errno; + n = ErrCode(); if (rc > 0) DPUTS("short write, filesystem full?"); else - DPRINTF("writev: %s", strerror(errno)); + DPRINTF("writev: %s", strerror(n)); mdb_txn_abort(txn); return n; } @@ -1044,15 +1187,15 @@ mdb_txn_commit(MDB_txn *txn) DPRINTF("committing %u dirty pages", n); rc = writev(env->me_fd, iov, n); if (rc != size) { - n = errno; + n = ErrCode(); if (rc > 0) DPUTS("short write, filesystem full?"); else - DPRINTF("writev: %s", strerror(errno)); + DPRINTF("writev: %s", strerror(n)); mdb_txn_abort(txn); return n; } - +#endif } while (!done); /* Drop the dirty pages. 
@@ -1075,16 +1218,21 @@ mdb_txn_commit(MDB_txn *txn) return n; } - env->me_txns->mti_txnid = txn->mt_txnid; - done: env->me_txn = NULL; /* update the DB tables */ { int toggle = !env->me_db_toggle; + MDB_db *ip, *jp; - for (i = 2; i < txn->mt_numdbs; i++) - env->me_dbs[toggle][i] = txn->mt_dbs[i]; + ip = &env->me_dbs[toggle][2]; + jp = &txn->mt_dbs[2]; + LAZY_RWLOCK_WRLOCK(&env->me_dblock); + for (i = 2; i < txn->mt_numdbs; i++) { + if (ip->md_root != jp->md_root) + *ip = *jp; + ip++; jp++; + } for (i = 2; i < txn->mt_numdbs; i++) { if (txn->mt_dbxs[i].md_dirty) @@ -1092,9 +1240,10 @@ done: } env->me_db_toggle = toggle; env->me_numdbs = txn->mt_numdbs; + LAZY_RWLOCK_UNLOCK(&env->me_dblock); } - pthread_mutex_unlock(&env->me_txns->mti_wmutex); + UNLOCK_MUTEX_W(env); free(txn); return MDB_SUCCESS; @@ -1106,20 +1255,25 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) char page[PAGESIZE]; MDB_page *p; MDB_meta *m; - int rc; - - assert(env != NULL); + int rc, err; /* We don't know the page size yet, so use a minimum value. 
*/ - if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) { +#ifdef _WIN32 + if (!ReadFile(env->me_fd, page, PAGESIZE, (DWORD *)&rc, NULL) || rc == 0) +#else + if ((rc = read(env->me_fd, page, PAGESIZE)) == 0) +#endif + { return ENOENT; - } else if (rc != PAGESIZE) { + } + else if (rc != PAGESIZE) { + err = ErrCode(); if (rc > 0) - errno = EINVAL; - DPRINTF("read: %s", strerror(errno)); - return errno; + err = EINVAL; + DPRINTF("read: %s", strerror(err)); + return err; } p = (MDB_page *)page; @@ -1154,7 +1308,8 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) unsigned int psize; DPUTS("writing new meta page"); - psize = sysconf(_SC_PAGE_SIZE); + + GetPageSize(psize); meta->mm_magic = MDB_MAGIC; meta->mm_version = MDB_VERSION; @@ -1180,9 +1335,18 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) m = METADATA(q); memcpy(m, meta, sizeof(*meta)); +#ifdef _WIN32 + { + DWORD len; + rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL); + rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode(); + } +#else rc = write(env->me_fd, p, psize * 2); + rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode(); +#endif free(p); - return (rc == (int)psize * 2) ? 
MDB_SUCCESS : errno; + return rc; } static int @@ -1193,11 +1357,14 @@ mdb_env_write_meta(MDB_txn *txn) off_t off; int rc, len, toggle; char *ptr; +#ifdef _WIN32 + OVERLAPPED ov; +#endif assert(txn != NULL); assert(txn->mt_env != NULL); - toggle = !F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE); + toggle = !txn->mt_toggle; DPRINTF("writing meta page %d for root page %lu", toggle, txn->mt_dbs[MAIN_DBI].md_root); @@ -1221,10 +1388,18 @@ mdb_env_write_meta(MDB_txn *txn) off += PAGEHDRSZ; /* Write to the SYNC fd */ +#ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov); + } +#else rc = pwrite(env->me_mfd, ptr, len, off); +#endif if (rc != len) { int r2; - rc = errno; + rc = ErrCode(); DPUTS("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Write some old data back, to prevent it from being used. @@ -1232,11 +1407,24 @@ mdb_env_write_meta(MDB_txn *txn) */ meta.mm_last_pg = metab.mm_last_pg; meta.mm_txnid = metab.mm_txnid; +#ifdef _WIN32 + WriteFile(env->me_fd, ptr, len, NULL, &ov); +#else r2 = pwrite(env->me_fd, ptr, len, off); +#endif env->me_flags |= MDB_FATAL_ERROR; return rc; } + /* Memory ordering issues are irrelevant; since the entire writer + * is wrapped by wmutex, all of these changes will become visible + * after the wmutex is unlocked. Since the DB is multi-version, + * readers will get consistent data regardless of how fresh or + * how stale their view of these values is. 
+ */ + LAZY_MUTEX_LOCK(&env->me_txns->mti_mutex); txn->mt_env->me_txns->mti_me_toggle = toggle; + txn->mt_env->me_txns->mti_txnid = txn->mt_txnid; + LAZY_MUTEX_UNLOCK(&env->me_txns->mti_mutex); return MDB_SUCCESS; } @@ -1248,15 +1436,11 @@ mdb_env_read_meta(MDB_env *env, int *which) assert(env != NULL); - if (which) - toggle = *which; - else if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) + if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) toggle = 1; - if (env->me_meta != env->me_metas[toggle]) - env->me_meta = env->me_metas[toggle]; - DPRINTF("Using meta page %d", toggle); + *which = toggle; return MDB_SUCCESS; } @@ -1271,9 +1455,9 @@ mdb_env_create(MDB_env **env) e->me_maxreaders = DEFAULT_READERS; e->me_maxdbs = 2; - e->me_fd = -1; - e->me_lfd = -1; - e->me_mfd = -1; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_mfd = INVALID_HANDLE_VALUE; *env = e; return MDB_SUCCESS; } @@ -1290,6 +1474,8 @@ mdb_env_set_mapsize(MDB_env *env, size_t size) int mdb_env_set_maxdbs(MDB_env *env, int dbs) { + if (env->me_map) + return EINVAL; env->me_maxdbs = dbs; return MDB_SUCCESS; } @@ -1297,6 +1483,8 @@ mdb_env_set_maxdbs(MDB_env *env, int dbs) int mdb_env_set_maxreaders(MDB_env *env, int readers) { + if (env->me_map) + return EINVAL; env->me_maxreaders = readers; return MDB_SUCCESS; } @@ -1313,7 +1501,7 @@ mdb_env_get_maxreaders(MDB_env *env, int *readers) static int mdb_env_open2(MDB_env *env, unsigned int flags) { - int i, newenv = 0; + int i, newenv = 0, toggle; MDB_meta meta; MDB_page *p; @@ -1332,13 +1520,41 @@ mdb_env_open2(MDB_env *env, unsigned int flags) env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize; } +#ifdef _WIN32 + { + HANDLE mh; + LONG sizelo, sizehi; + sizelo = env->me_mapsize & 0xffffffff; + sizehi = env->me_mapsize >> 16; /* pointless on WIN32, only needed on W64 */ + sizehi >>= 16; + /* Windows won't create mappings for zero length files. + * Just allocate the maxsize right now. 
+ */ + if (newenv) { + SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0); + if (!SetEndOfFile(env->me_fd)) + return ErrCode(); + SetFilePointer(env->me_fd, 0, NULL, 0); + } + mh = CreateFileMapping(env->me_fd, NULL, PAGE_READONLY, + sizehi, sizelo, NULL); + if (!mh) + return ErrCode(); + env->me_map = MapViewOfFileEx(mh, FILE_MAP_READ, 0, 0, env->me_mapsize, + meta.mm_address); + CloseHandle(mh); + if (!env->me_map) + return ErrCode(); + } +#else i = MAP_SHARED; if (meta.mm_address && (flags & MDB_FIXEDMAP)) i |= MAP_FIXED; env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i, env->me_fd, 0); if (env->me_map == MAP_FAILED) - return errno; + return ErrCode(); +#endif if (newenv) { meta.mm_mapsize = env->me_mapsize; @@ -1358,21 +1574,23 @@ mdb_env_open2(MDB_env *env, unsigned int flags) env->me_metas[0] = METADATA(p); env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize); - if ((i = mdb_env_read_meta(env, NULL)) != 0) + if ((i = mdb_env_read_meta(env, &toggle)) != 0) return i; DPRINTF("opened database version %u, pagesize %u", - env->me_meta->mm_version, env->me_psize); - DPRINTF("depth: %u", env->me_meta->mm_dbs[MAIN_DBI].md_depth); - DPRINTF("entries: %lu", env->me_meta->mm_dbs[MAIN_DBI].md_entries); - DPRINTF("branch pages: %lu", env->me_meta->mm_dbs[MAIN_DBI].md_branch_pages); - DPRINTF("leaf pages: %lu", env->me_meta->mm_dbs[MAIN_DBI].md_leaf_pages); - DPRINTF("overflow pages: %lu", env->me_meta->mm_dbs[MAIN_DBI].md_overflow_pages); - DPRINTF("root: %lu", env->me_meta->mm_dbs[MAIN_DBI].md_root); + env->me_metas[toggle]->mm_version, env->me_psize); + DPRINTF("depth: %u", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_depth); + DPRINTF("entries: %lu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_entries); + DPRINTF("branch pages: %lu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_branch_pages); + DPRINTF("leaf pages: %lu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_leaf_pages); + DPRINTF("overflow pages: %lu", 
env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_overflow_pages); + DPRINTF("root: %lu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_root); return MDB_SUCCESS; } +#ifndef _WIN32 +/* Windows doesn't support destructor callbacks for thread-specific storage */ static void mdb_env_reader_dest(void *ptr) { @@ -1382,23 +1600,41 @@ mdb_env_reader_dest(void *ptr) reader->mr_pid = 0; reader->mr_tid = 0; } +#endif /* downgrade the exclusive lock on the region back to shared */ static void mdb_env_share_locks(MDB_env *env) { - struct flock lock_info; + int toggle = 0; - env->me_txns->mti_txnid = env->me_meta->mm_txnid; if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) - env->me_txns->mti_me_toggle = 1; + toggle = 1; + env->me_txns->mti_me_toggle = toggle; + env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_RDLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - fcntl(env->me_lfd, F_SETLK, &lock_info); +#ifdef _WIN32 + { + OVERLAPPED ov; + /* First acquire a shared lock. The Unlock will + * then release the existing exclusive lock. 
+ */ + memset(&ov, 0, sizeof(ov)); + LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov); + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } +#else + { + struct flock lock_info; + /* The shared lock replaces the existing lock */ + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_RDLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + fcntl(env->me_lfd, F_SETLK, &lock_info); + } +#endif } static int @@ -1406,52 +1642,138 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { int rc; off_t size, rsize; - struct flock lock_info; *excl = 0; +#ifdef _WIN32 + if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, + FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + return rc; + } + /* Try to get exclusive lock. If we succeed, then + * nobody is using the lock region and we should initialize it. + */ + { + if (LockFile(env->me_lfd, 0, 0, 1, 0)) { + *excl = 1; + } else { + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + rc = ErrCode(); + goto fail; + } + } + } + size = GetFileSize(env->me_lfd, NULL); +#else if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) { - rc = errno; + rc = ErrCode(); return rc; } /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. 
*/ - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - rc = fcntl(env->me_lfd, F_SETLK, &lock_info); - if (rc == 0) { - *excl = 1; - } else { - lock_info.l_type = F_RDLCK; + { + struct flock lock_info; + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; rc = fcntl(env->me_lfd, F_SETLK, &lock_info); - if (rc) { - rc = errno; - goto fail; + if (rc == 0) { + *excl = 1; + } else { + lock_info.l_type = F_RDLCK; + rc = fcntl(env->me_lfd, F_SETLKW, &lock_info); + if (rc) { + rc = ErrCode(); + goto fail; + } } } size = lseek(env->me_lfd, 0, SEEK_END); +#endif rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); if (size < rsize && *excl) { +#ifdef _WIN32 + SetFilePointer(env->me_lfd, rsize, NULL, 0); + if (!SetEndOfFile(env->me_lfd)) { + rc = ErrCode(); + goto fail; + } +#else if (ftruncate(env->me_lfd, rsize) != 0) { - rc = errno; + rc = ErrCode(); goto fail; } +#endif } else { rsize = size; size = rsize - sizeof(MDB_txninfo); env->me_maxreaders = size/sizeof(MDB_reader) + 1; } +#ifdef _WIN32 + { + HANDLE mh; + mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, + 0, 0, NULL); + if (!mh) { + rc = ErrCode(); + goto fail; + } + env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); + CloseHandle(mh); + if (!env->me_txns) { + rc = ErrCode(); + goto fail; + } + } +#else env->me_txns = mmap(0, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0); if (env->me_txns == MAP_FAILED) { - rc = errno; + rc = ErrCode(); goto fail; } +#endif if (*excl) { +#ifdef _WIN32 + char *ptr; + if (!mdb_sec_inited) { + InitializeSecurityDescriptor(&mdb_null_sd, + SECURITY_DESCRIPTOR_REVISION); + SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); + mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); + mdb_all_sa.bInheritHandle 
= FALSE; + mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; + mdb_sec_inited = 1; + } + /* FIXME: only using up to 20 characters of the env path here, + * probably not enough to assure uniqueness... + */ + sprintf(env->me_txns->mti_rmname, "Global\\MDBr%.20s", lpath); + ptr = env->me_txns->mti_rmname + sizeof("Global\\MDBr"); + while ((ptr = strchr(ptr, '\\'))) + *ptr++ = '/'; + env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + if (!env->me_rmutex) { + rc = ErrCode(); + goto fail; + } + sprintf(env->me_txns->mti_rmname, "Global\\MDBw%.20s", lpath); + ptr = env->me_txns->mti_rmname + sizeof("Global\\MDBw"); + while ((ptr = strchr(ptr, '\\'))) + *ptr++ = '/'; + env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + if (!env->me_wmutex) { + rc = ErrCode(); + goto fail; + } +#else pthread_mutexattr_t mattr; pthread_mutexattr_init(&mattr); @@ -1461,6 +1783,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) } pthread_mutex_init(&env->me_txns->mti_mutex, &mattr); pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); +#endif env->me_txns->mti_version = MDB_VERSION; env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_txnid = 0; @@ -1479,16 +1802,28 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) rc = MDB_VERSION_MISMATCH; goto fail; } - if (errno != EACCES && errno != EAGAIN) { - rc = errno; + rc = ErrCode(); + if (rc != EACCES && rc != EAGAIN) { + goto fail; + } +#ifdef _WIN32 + env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); + if (!env->me_rmutex) { + rc = ErrCode(); goto fail; } + env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); + if (!env->me_wmutex) { + rc = ErrCode(); + goto fail; + } +#endif } return MDB_SUCCESS; fail: close(env->me_lfd); - env->me_lfd = -1; + env->me_lfd = INVALID_HANDLE_VALUE; return rc; } @@ -1513,28 +1848,54 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) if (rc) 
goto leave; +#ifdef _WIN32 + if (F_ISSET(flags, MDB_RDONLY)) { + oflags = GENERIC_READ; + len = OPEN_EXISTING; + } else { + oflags = GENERIC_READ|GENERIC_WRITE; + len = OPEN_ALWAYS; + } + mode = FILE_ATTRIBUTE_NORMAL; + if ((env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } +#else if (F_ISSET(flags, MDB_RDONLY)) oflags = O_RDONLY; else oflags = O_RDWR | O_CREAT; if ((env->me_fd = open(dpath, oflags, mode)) == -1) { - rc = errno; + rc = ErrCode(); goto leave; } +#endif if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) { /* synchronous fd for meta writes */ +#ifdef _WIN32 + if (!(flags & (MDB_RDONLY|MDB_NOSYNC))) + mode |= FILE_FLAG_WRITE_THROUGH; + if ((env->me_mfd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } +#else if (!(flags & (MDB_RDONLY|MDB_NOSYNC))) oflags |= MDB_DSYNC; if ((env->me_mfd = open(dpath, oflags, mode)) == -1) { - rc = errno; + rc = ErrCode(); goto leave; } - +#endif env->me_path = strdup(path); DPRINTF("opened dbenv %p", (void *) env); pthread_key_create(&env->me_txkey, mdb_env_reader_dest); + LAZY_RWLOCK_INIT(&env->me_dblock, NULL); if (excl) mdb_env_share_locks(env); env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); @@ -1545,13 +1906,13 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) leave: if (rc) { - if (env->me_fd >= 0) { + if (env->me_fd != INVALID_HANDLE_VALUE) { close(env->me_fd); - env->me_fd = -1; + env->me_fd = INVALID_HANDLE_VALUE; } - if (env->me_lfd >= 0) { + if (env->me_lfd != INVALID_HANDLE_VALUE) { close(env->me_lfd); - env->me_lfd = -1; + env->me_lfd = INVALID_HANDLE_VALUE; } } free(lpath); @@ -1577,6 +1938,7 @@ mdb_env_close(MDB_env *env) free(env->me_dbxs); free(env->me_path); + LAZY_RWLOCK_DESTROY(&env->me_dblock); pthread_key_delete(env->me_txkey); if (env->me_map) { @@ 
-1586,12 +1948,11 @@ mdb_env_close(MDB_env *env) close(env->me_fd); if (env->me_txns) { pid_t pid = getpid(); - size_t size = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); - int i; + unsigned int i; for (i=0; i<env->me_txns->mti_numreaders; i++) if (env->me_txns->mti_readers[i].mr_pid == pid) env->me_txns->mti_readers[i].mr_pid = 0; - munmap(env->me_txns, size); + munmap(env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); } close(env->me_lfd); free(env); @@ -1708,17 +2069,15 @@ mdb_get_page(MDB_txn *txn, pgno_t pgno, MDB_page **ret) if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY) && txn->mt_u.dirty_list[0].mid) { MDB_dpage *dp; - MIDL2 id; unsigned x; - id.mid = pgno; - x = mdb_midl2_search(txn->mt_u.dirty_list, &id); + x = mdb_mid2l_search(txn->mt_u.dirty_list, pgno); if (x <= txn->mt_u.dirty_list[0].mid && txn->mt_u.dirty_list[x].mid == pgno) { dp = txn->mt_u.dirty_list[x].mptr; p = &dp->p; } } if (!p) { - if (pgno <= txn->mt_env->me_meta->mm_last_pg) + if (pgno <= txn->mt_env->me_metas[txn->mt_toggle]->mm_last_pg) p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); } *ret = p; @@ -1784,7 +2143,7 @@ mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, if (modify) { MDB_dhead *dh = ((MDB_dhead *)mp)-1; - if ((rc = mdb_touch(txn, mpp)) != 0) + if ((rc = mdb_touch(txn, dbi, mpp)) != 0) return rc; dh = ((MDB_dhead *)mpp->mp_page)-1; dh->md_parent = mpp->mp_parent; @@ -1834,7 +2193,7 @@ mdb_search_page(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, return MDB_NOTFOUND; } - if (rc = mdb_get_page(txn, root, &mpp->mp_page)) + if ((rc = mdb_get_page(txn, root, &mpp->mp_page))) return rc; DPRINTF("db %u root page %lu has flags 0x%X", @@ -1853,7 +2212,7 @@ mdb_search_page(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, if (!F_ISSET(mpp->mp_page->mp_flags, P_DIRTY)) { mpp->mp_parent = NULL; mpp->mp_pi = 0; - if ((rc = mdb_touch(txn, mpp))) + if ((rc = mdb_touch(txn, dbi, mpp))) return rc; txn->mt_dbs[dbi].md_root = 
mpp->mp_page->mp_pgno; } @@ -1879,7 +2238,7 @@ mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data) */ data->mv_size = leaf->mn_dsize; memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if (rc = mdb_get_page(txn, pgno, &omp)) { + if ((rc = mdb_get_page(txn, pgno, &omp))) { DPRINTF("read overflow page %lu failed", pgno); return rc; } @@ -1975,7 +2334,7 @@ mdb_sibling(MDB_cursor *cursor, int move_right) assert(IS_BRANCH(parent->mp_page)); indx = NODEPTR(parent->mp_page, parent->mp_ki); - if (rc = mdb_get_page(cursor->mc_txn, NODEPGNO(indx), &mp)) + if ((rc = mdb_get_page(cursor->mc_txn, NODEPGNO(indx), &mp))) return rc;; #if 0 mp->parent = parent->mp_page; @@ -2221,6 +2580,7 @@ set2: int ex2, *ex2p; if (op == MDB_GET_BOTH) { ex2p = &ex2; + ex2 = 0; } else { ex2p = NULL; } @@ -2440,7 +2800,7 @@ mdb_new_page(MDB_txn *txn, MDB_dbi dbi, uint32_t flags, int num) { MDB_dpage *dp; - if ((dp = mdb_alloc_page(txn, NULL, 0, num)) == NULL) + if ((dp = mdb_alloc_page(txn, dbi, NULL, 0, num)) == NULL) return NULL; DPRINTF("allocated new mpage %lu, page size %u", dp->p.mp_pgno, txn->mt_env->me_psize); @@ -2502,10 +2862,10 @@ mdb_add_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp, indx_t indx, assert(mp->mp_upper >= mp->mp_lower); - DPRINTF("add node [%s] to %s page %lu at index %i, key size %zu", - key ? DKEY(key) : NULL, + DPRINTF("add to %s page %lu index %i, data size %zu key size %zu [%s]", IS_LEAF(mp) ? "leaf" : "branch", - mp->mp_pgno, indx, key ? key->mv_size : 0); + mp->mp_pgno, indx, data ? data->mv_size : 0, + key ? key->mv_size : 0, key ? DKEY(key) : NULL); if (IS_LEAF2(mp)) { /* Move higher keys up one slot. 
*/ @@ -2693,7 +3053,6 @@ mdb_xcursor_init1(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx, MDB_page *mp, MDB_ mx->mx_dbxs[dbn].md_name.mv_data = NODEKEY(node); mx->mx_dbxs[dbn].md_name.mv_size = node->mn_ksize; mx->mx_txn.mt_next_pgno = txn->mt_next_pgno; - mx->mx_txn.mt_oldest = txn->mt_oldest; mx->mx_txn.mt_u = txn->mt_u; mx->mx_cursor.mc_initialized = 0; mx->mx_cursor.mc_eof = 0; @@ -2703,7 +3062,6 @@ static void mdb_xcursor_fini(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) { txn->mt_next_pgno = mx->mx_txn.mt_next_pgno; - txn->mt_oldest = mx->mx_txn.mt_oldest; txn->mt_u = mx->mx_txn.mt_u; txn->mt_dbs[0] = mx->mx_dbs[0]; txn->mt_dbs[1] = mx->mx_dbs[1]; @@ -2835,8 +3193,8 @@ mdb_move_node(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *src, indx_t srcindx, DKBUF; /* Mark src and dst as dirty. */ - if ((rc = mdb_touch(txn, src)) || - (rc = mdb_touch(txn, dst))) + if ((rc = mdb_touch(txn, dbi, src)) || + (rc = mdb_touch(txn, dbi, dst))) return rc;; if (IS_LEAF2(src->mp_page)) { @@ -2929,8 +3287,8 @@ mdb_merge(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *src, MDB_pageparent *dst) assert(dst->mp_parent); /* Mark src and dst as dirty. */ - if ((rc = mdb_touch(txn, src)) || - (rc = mdb_touch(txn, dst))) + if ((rc = mdb_touch(txn, dbi, src)) || + (rc = mdb_touch(txn, dbi, dst))) return rc; /* Move all nodes from src to dst. 
@@ -3019,7 +3377,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp) } else if (IS_BRANCH(mpp->mp_page) && NUMKEYS(mpp->mp_page) == 1) { DPUTS("collapsing root page!"); txn->mt_dbs[dbi].md_root = NODEPGNO(NODEPTR(mpp->mp_page, 0)); - if (rc = mdb_get_page(txn, txn->mt_dbs[dbi].md_root, &root)) + if ((rc = mdb_get_page(txn, txn->mt_dbs[dbi].md_root, &root))) return rc; txn->mt_dbs[dbi].md_depth--; txn->mt_dbs[dbi].md_branch_pages--; @@ -3045,7 +3403,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp) */ DPUTS("reading right neighbor"); node = NODEPTR(mpp->mp_parent, mpp->mp_pi + 1); - if (rc = mdb_get_page(txn, NODEPGNO(node), &npp.mp_page)) + if ((rc = mdb_get_page(txn, NODEPGNO(node), &npp.mp_page))) return rc; npp.mp_pi = mpp->mp_pi + 1; si = 0; @@ -3055,7 +3413,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp) */ DPUTS("reading left neighbor"); node = NODEPTR(mpp->mp_parent, mpp->mp_pi - 1); - if (rc = mdb_get_page(txn, NODEPGNO(node), &npp.mp_page)) + if ((rc = mdb_get_page(txn, NODEPGNO(node), &npp.mp_page))) return rc; npp.mp_pi = mpp->mp_pi - 1; si = NUMKEYS(npp.mp_page) - 1; @@ -3110,8 +3468,7 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, unsigned int ki, MDB_pageparent *mpp, MDB_no int mdb_del(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, - unsigned int flags) + MDB_val *key, MDB_val *data) { int rc, exact; unsigned int ki; @@ -3127,7 +3484,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - return EINVAL; + return EACCES; } if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { @@ -3150,8 +3507,8 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, mdb_xcursor_init0(txn, dbi, &mx); mdb_xcursor_init1(txn, dbi, &mx, mpp.mp_page, leaf); - if (flags == MDB_DEL_DUP) { - rc = mdb_del(&mx.mx_txn, mx.mx_cursor.mc_dbi, data, NULL, 0); + if (data) { + rc = mdb_del(&mx.mx_txn, mx.mx_cursor.mc_dbi, data, NULL); mdb_xcursor_fini(txn, dbi, &mx); /* If sub-DB still has entries, we're 
done */ if (mx.mx_txn.mt_dbs[mx.mx_cursor.mc_dbi].md_root != P_INVALID) { @@ -3194,9 +3551,6 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, } } - if (data && (rc = mdb_read_data(txn, leaf, data)) != MDB_SUCCESS) - return rc; - return mdb_del0(txn, dbi, ki, &mpp, leaf); } @@ -3213,10 +3567,10 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp, int rc = MDB_SUCCESS, ins_new = 0; indx_t newindx; pgno_t pgno = 0; - unsigned int i, j, split_indx; + unsigned int i, j, split_indx, nkeys, pmax; MDB_node *node; MDB_val sepkey, rkey, rdata; - MDB_page *copy; + MDB_page *copy, *cptr; MDB_dpage *mdp, *rdp, *pdp; MDB_dhead *dh; DKBUF; @@ -3255,12 +3609,13 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp, rdp->h.md_pi = mdp->h.md_pi + 1; DPRINTF("new right sibling: page %lu", rdp->p.mp_pgno); - split_indx = NUMKEYS(&mdp->p) / 2 + 1; + nkeys = NUMKEYS(&mdp->p); + split_indx = nkeys / 2 + 1; if (IS_LEAF2(&rdp->p)) { char *split, *ins; int x; - unsigned int nkeys = NUMKEYS(&mdp->p), lsize, rsize, ksize; + unsigned int lsize, rsize, ksize; /* Move half of the keys to the right sibling */ copy = NULL; x = *newindxp - split_indx; @@ -3300,13 +3655,44 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp, goto newsep; } - /* Move half of the keys to the right sibling. */ - if ((copy = malloc(txn->mt_env->me_psize)) == NULL) - return ENOMEM; - memcpy(copy, &mdp->p, txn->mt_env->me_psize); - memset(&mdp->p.mp_ptrs, 0, txn->mt_env->me_psize - PAGEHDRSZ); - mdp->p.mp_lower = PAGEHDRSZ; - mdp->p.mp_upper = txn->mt_env->me_psize; + /* For leaf pages, check the split point based on what + * fits where, since otherwise add_node can fail. 
+ */ + if (IS_LEAF(&mdp->p)) { + unsigned int psize, nsize; + /* Maximum free space in an empty page */ + pmax = txn->mt_env->me_psize - PAGEHDRSZ; + nsize = mdb_leaf_size(txn->mt_env, newkey, newdata); + if (newindx < split_indx) { + psize = nsize; + for (i=0; i<split_indx; i++) { + node = NODEPTR(&mdp->p, i); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + if (psize > pmax) { + split_indx = i; + break; + } + } + } else { + psize = nsize; + for (i=nkeys-1; i>=split_indx; i--) { + node = NODEPTR(&mdp->p, i); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + if (psize > pmax) { + split_indx = i+1; + break; + } + } + } + } /* First find the separating key between the split pages. */ @@ -3315,7 +3701,7 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp, sepkey.mv_size = newkey->mv_size; sepkey.mv_data = newkey->mv_data; } else { - node = NODEPTR(copy, split_indx); + node = NODEPTR(&mdp->p, split_indx); sepkey.mv_size = node->mn_ksize; sepkey.mv_data = NODEKEY(node); } @@ -3345,20 +3731,24 @@ newsep: return rc; } if (rc != MDB_SUCCESS) { - free(copy); return rc; } - for (i = j = 0; i <= NUMKEYS(copy); j++) { - if (i < split_indx) { - /* Re-insert in left sibling. */ - pdp = mdp; - } else { - /* Insert in right sibling. */ - if (i == split_indx) - /* Reset insert index for right sibling. */ - j = (i == newindx && ins_new); - pdp = rdp; + /* Move half of the keys to the right sibling. */ + if ((copy = malloc(txn->mt_env->me_psize)) == NULL) + return ENOMEM; + + copy->mp_pgno = mdp->p.mp_pgno; + copy->mp_flags = mdp->p.mp_flags; + copy->mp_lower = PAGEHDRSZ; + copy->mp_upper = txn->mt_env->me_psize; + cptr = copy; + for (i = j = 0; i <= nkeys; j++) { + if (i == split_indx) { + /* Insert in right sibling. */ + /* Reset insert index for right sibling. 
*/ + j = (i == newindx && ins_new); + cptr = &rdp->p; } if (i == newindx && !ins_new) { @@ -3376,11 +3766,12 @@ newsep: /* Update page and index for the new key. */ *newindxp = j; - *mpp = &pdp->p; - } else if (i == NUMKEYS(copy)) { + if (cptr == &rdp->p) + *mpp = cptr; + } else if (i == nkeys) { break; } else { - node = NODEPTR(copy, i); + node = NODEPTR(&mdp->p, i); rkey.mv_data = NODEKEY(node); rkey.mv_size = node->mn_ksize; if (IS_LEAF(&mdp->p)) { @@ -3398,8 +3789,15 @@ newsep: rkey.mv_size = 0; } - rc = mdb_add_node(txn, dbi, &pdp->p, j, &rkey, &rdata, pgno,flags); + rc = mdb_add_node(txn, dbi, cptr, j, &rkey, &rdata, pgno, flags); } + nkeys = NUMKEYS(copy); + for (i=0; i<nkeys; i++) + mdp->p.mp_ptrs[i] = copy->mp_ptrs[i]; + mdp->p.mp_lower = copy->mp_lower; + mdp->p.mp_upper = copy->mp_upper; + memcpy(NODEPTR(&mdp->p, nkeys-1), NODEPTR(copy, nkeys-1), + txn->mt_env->me_psize - copy->mp_upper); free(copy); return rc; @@ -3567,7 +3965,7 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - return EINVAL; + return EACCES; } if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { @@ -3628,10 +4026,14 @@ mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) int mdb_env_stat(MDB_env *env, MDB_stat *arg) { + int toggle; + if (env == NULL || arg == NULL) return EINVAL; - return mdb_stat0(env, &env->me_meta->mm_dbs[MAIN_DBI], arg); + mdb_env_read_meta(env, &toggle); + + return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); } int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) @@ -3743,3 +4145,5 @@ int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) txn->mt_dbxs[dbi].md_rel = rel; return MDB_SUCCESS; } + +/** @} */