X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblmdb%2Fmdb.c;h=6d02f8e5d1d16bc9b9e8fb0278e986599898891c;hb=534dcc721287cfe1051191a5cd7d3df6cf1dc409;hp=40e68de91256779995599fa92f6d410df0aaa827;hpb=a93810cc3d1a062bf5edbe9c14795d0360cda8a4;p=openldap diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 40e68de912..6d02f8e5d1 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1,11 +1,11 @@ /** @file mdb.c - * @brief memory-mapped database library + * @brief Lightning memory-mapped database library * * A Btree-based database management library modeled loosely on the * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2013 Howard Chu, Symas Corp. + * Copyright 2011-2014 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,15 +35,17 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif -#include -#include #ifdef _WIN32 +#include #include /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it * as int64 which is wrong. MSVC doesn't define it at all, so just * don't use it. */ #define MDB_PID_T int +#define MDB_THR_T DWORD +#include +#include #ifdef __GNUC__ # include #else @@ -55,7 +57,10 @@ # endif #endif #else +#include +#include #define MDB_PID_T pid_t +#define MDB_THR_T pthread_t #include #include #include @@ -145,7 +150,14 @@ # error "Two's complement, reasonably sized integer types, please" #endif -/** @defgroup internal MDB Internals +#ifdef __GNUC__ +/** Put infrequently used env functions in separate section */ +#define ESECT __attribute__ ((section("text_env"))) +#else +#define ESECT +#endif + +/** @defgroup internal LMDB Internals * @{ */ /** @defgroup compat Compatibility Macros @@ -169,8 +181,10 @@ #ifdef _WIN32 #define MDB_USE_HASH 1 #define MDB_PIDLOCK 0 -#define pthread_t DWORD +#define THREAD_RET DWORD +#define pthread_t HANDLE #define pthread_mutex_t HANDLE +#define pthread_cond_t HANDLE #define pthread_key_t DWORD #define pthread_self() GetCurrentThreadId() #define pthread_key_create(x,y) \ @@ -178,12 +192,16 @@ #define pthread_key_delete(x) TlsFree(x) #define pthread_getspecific(x) TlsGetValue(x) #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) -#define pthread_mutex_unlock(x) ReleaseMutex(x) -#define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE) -#define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex) -#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock((env)->me_rmutex) -#define LOCK_MUTEX_W(env) pthread_mutex_lock((env)->me_wmutex) -#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock((env)->me_wmutex) +#define pthread_mutex_unlock(x) ReleaseMutex(*x) +#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) +#define pthread_cond_signal(x) SetEvent(*x) +#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) +#define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) +#define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) +#define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_rmutex) +#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_rmutex) +#define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_wmutex) +#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_wmutex) #define getpid() GetCurrentProcessId() #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) @@ -198,7 +216,9 @@ #endif #define Z "I" #else - +#define THREAD_RET void * +#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) +#define THREAD_FINISH(thr) pthread_join(thr,NULL) #define Z "z" /**< printf format modifier for size_t */ /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ @@ -381,7 +401,7 @@ static txnid_t mdb_debug_start; */ #define MDB_MINKEYS 2 - /** A stamp that identifies a file as an MDB file. + /** A stamp that identifies a file as an LMDB file. * There's nothing special about this value other than that it is easily * recognizable, and it will reflect any byte order mismatches. */ @@ -537,7 +557,7 @@ typedef struct MDB_rxbody { /** The process ID of the process owning this reader txn. */ MDB_PID_T mrb_pid; /** The thread ID of the thread owning this txn. */ - pthread_t mrb_tid; + MDB_THR_T mrb_tid; } MDB_rxbody; /** The actual reader record, with cacheline padding. */ @@ -568,7 +588,7 @@ typedef struct MDB_reader { * unlikely. If a collision occurs, the results are unpredictable. */ typedef struct MDB_txbody { - /** Stamp identifying this as an MDB file. It must be set + /** Stamp identifying this as an LMDB file. It must be set * to #MDB_MAGIC. */ uint32_t mtb_magic; /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ @@ -650,6 +670,7 @@ typedef struct MDB_page { #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ #define P_KEEP 0x8000 /**< leave this page alone during spill */ /** @} */ uint16_t mp_flags; /**< @ref mdb_page */ @@ -700,6 +721,9 @@ typedef struct MDB_page { /** The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) + /** Link in #MDB_txn.%mt_loose_pages list */ +#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)METADATA(p)) + /** Header for a single key/data pair within a page. * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. * We guarantee 2-byte alignment for 'MDB_node's. @@ -841,7 +865,7 @@ typedef struct MDB_db { * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). */ typedef struct MDB_meta { - /** Stamp identifying this as an MDB file. It must be set + /** Stamp identifying this as an LMDB file. It must be set * to #MDB_MAGIC. */ uint32_t mm_magic; /** Version number of this lock file. Must be set to #MDB_DATA_VERSION. */ @@ -898,6 +922,10 @@ struct MDB_txn { /** The list of pages that became unused during this transaction. */ MDB_IDL mt_free_pgs; + /** The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). + */ + MDB_page *mt_loose_pgs; /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. @@ -1182,7 +1210,7 @@ mdb_version(int *major, int *minor, int *patch) return MDB_VERSION_STRING; } -/** Table of descriptions for MDB @ref errors */ +/** Table of descriptions for LMDB @ref errors */ static char *const mdb_errstr[] = { "MDB_KEYEXIST: Key/data pair already exists", "MDB_NOTFOUND: No matching key/data pair found", @@ -1190,7 +1218,7 @@ static char *const mdb_errstr[] = { "MDB_CORRUPTED: Located page was wrong type", "MDB_PANIC: Update of meta page failed", "MDB_VERSION_MISMATCH: Database environment version mismatch", - "MDB_INVALID: File is not an MDB file", + "MDB_INVALID: File is not an LMDB file", "MDB_MAP_FULL: Environment mapsize limit reached", "MDB_DBS_FULL: Environment maxdbs limit reached", "MDB_READERS_FULL: Environment maxreaders limit reached", @@ -1485,7 +1513,6 @@ mdb_page_malloc(MDB_txn *txn, unsigned num) } return ret; } - /** Free a single page. * Saves single pages to a list, for future reuse. * (This is not used for multi-page overflow pages.) @@ -1525,6 +1552,58 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } +/** Loosen or free a single page. + * Saves single pages to a list for future reuse + * in this same txn. It has been pulled from the freeDB + * and already resides on the dirty list, but has been + * deleted. Use these pages first before pulling again + * from the freeDB. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. + */ +static int +mdb_page_loose(MDB_cursor *mc, MDB_page *mp) +{ + int loose = 0; + pgno_t pgno = mp->mp_pgno; + + if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { + if (mc->mc_txn->mt_parent) { + MDB_ID2 *dl = mc->mc_txn->mt_u.dirty_list; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + /* ok, it's ours */ + loose = 1; + } + } + } else { + /* no parent txn, so it's just ours */ + loose = 1; + } + } + if (loose) { + NEXT_LOOSE_PAGE(mp) = mc->mc_txn->mt_loose_pgs; + mc->mc_txn->mt_loose_pgs = mp; + mp->mp_flags |= P_LOOSE; + } else { + int rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, pgno); + if (rc) + return rc; + } + + return MDB_SUCCESS; +} + /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. * @param[in] mc A cursor handle for the current operation. * @param[in] pflags Flags of the pages to update: @@ -1573,6 +1652,12 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) break; } + /* Loose pages shouldn't be spilled */ + for (dp = txn->mt_loose_pgs; dp; dp = NEXT_LOOSE_PAGE(dp)) { + if ((dp->mp_flags & Mask) == pflags) + dp->mp_flags ^= P_KEEP; + } + if (all) { /* Mark dirty root pages */ for (i=0; imt_numdbs; i++) { @@ -1790,7 +1875,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) #else enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; #endif - int rc, retry = Max_retries; + int rc, retry = num * 20; MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; pgno_t pgno, *mop = env->me_pghead; @@ -1800,6 +1885,14 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) MDB_cursor_op op; MDB_cursor m2; + /* If there are any loose pages, just use them */ + if (num == 1 && txn->mt_loose_pgs) { + np = txn->mt_loose_pgs; + txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + *mp = np; + return MDB_SUCCESS; + } + *mp = NULL; /* If our dirty list is already full, we can't do anything */ @@ -1823,7 +1916,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (mop[i-n2] == pgno+n2) goto search_done; } while (--i > n2); - if (Max_retries < INT_MAX && --retry < 0) + if (--retry < 0) break; } @@ -2314,7 +2407,7 @@ mdb_txn_renew0(MDB_txn *txn) return MDB_BAD_RSLOT; } else { MDB_PID_T pid = env->me_pid; - pthread_t tid = pthread_self(); + MDB_THR_T tid = pthread_self(); if (!env->me_live_reader) { rc = mdb_reader_pid(env, Pidset, pid); @@ -2661,6 +2754,32 @@ mdb_freelist_save(MDB_txn *txn) return rc; } + /* Dispose of loose pages. Usually they will have all + * been used up by the time we get here. + */ + if (txn->mt_loose_pgs) { + MDB_page *mp = txn->mt_loose_pgs; + /* Just return them to freeDB */ + if (env->me_pghead) { + int i, j; + mop = env->me_pghead; + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { + pgno_t pg = mp->mp_pgno; + j = mop[0] + 1; + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; + mop[j] = pg; + mop[0] += 1; + } + } else { + /* Oh well, they were wasted. Put on freelist */ + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { + mdb_midl_append(&txn->mt_free_pgs, mp->mp_pgno); + } + } + txn->mt_loose_pgs = NULL; + } + /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) ? SSIZE_MAX : maxfree_1pg; @@ -3144,7 +3263,7 @@ fail: * @param[out] meta address of where to store the meta information * @return 0 on success, non-zero on failure. */ -static int +static int ESECT mdb_env_read_header(MDB_env *env, MDB_meta *meta) { MDB_metabuf pbuf; @@ -3202,12 +3321,26 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) return 0; } +static void ESECT +mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) +{ + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = env->me_psize; + meta->mm_last_pg = 1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; + meta->mm_dbs[0].md_root = P_INVALID; + meta->mm_dbs[1].md_root = P_INVALID; +} + /** Write the environment parameters of a freshly created DB environment. * @param[in] env the environment handle * @param[out] meta address of where to store the meta information * @return 0 on success, non-zero on failure. */ -static int +static int ESECT mdb_env_init_meta(MDB_env *env, MDB_meta *meta) { MDB_page *p, *q; @@ -3231,15 +3364,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) psize = env->me_psize; - meta->mm_magic = MDB_MAGIC; - meta->mm_version = MDB_DATA_VERSION; - meta->mm_mapsize = env->me_mapsize; - meta->mm_psize = psize; - meta->mm_last_pg = 1; - meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; - meta->mm_dbs[0].md_root = P_INVALID; - meta->mm_dbs[1].md_root = P_INVALID; + mdb_env_init_meta0(env, meta); p = calloc(2, psize); p->mp_pgno = 0; @@ -3395,7 +3520,7 @@ mdb_env_pick_meta(const MDB_env *env) return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); } -int +int ESECT mdb_env_create(MDB_env **env) { MDB_env *e; @@ -3420,7 +3545,7 @@ mdb_env_create(MDB_env **env) return MDB_SUCCESS; } -static int +static int ESECT mdb_env_map(MDB_env *env, void *addr, int newsize) { MDB_page *p; @@ -3429,8 +3554,17 @@ mdb_env_map(MDB_env *env, void *addr, int newsize) int rc; HANDLE mh; LONG sizelo, sizehi; - sizelo = env->me_mapsize & 0xffffffff; - sizehi = env->me_mapsize >> 16 >> 16; /* only needed on Win64 */ + size_t msize; + + if (flags & MDB_RDONLY) { + msize = 0; + sizelo = 0; + sizehi = 0; + } else { + msize = env->me_mapsize; + sizelo = msize & 0xffffffff; + sizehi = msize >> 16 >> 16; /* only needed on Win64 */ + } /* Windows won't create mappings for zero length files. * Just allocate the maxsize right now. @@ -3448,7 +3582,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize) return ErrCode(); env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? FILE_MAP_WRITE : FILE_MAP_READ, - 0, 0, env->me_mapsize, addr); + 0, 0, msize, addr); rc = env->me_map ? 0 : ErrCode(); CloseHandle(mh); if (rc) @@ -3494,7 +3628,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize) return MDB_SUCCESS; } -int +int ESECT mdb_env_set_mapsize(MDB_env *env, size_t size) { /* If env is already open, caller is responsible for making @@ -3528,7 +3662,7 @@ mdb_env_set_mapsize(MDB_env *env, size_t size) return MDB_SUCCESS; } -int +int ESECT mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { if (env->me_map) @@ -3537,7 +3671,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) return MDB_SUCCESS; } -int +int ESECT mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) { if (env->me_map || readers < 1) @@ -3546,7 +3680,7 @@ mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) return MDB_SUCCESS; } -int +int ESECT mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) { if (!env || !readers) @@ -3555,9 +3689,9 @@ mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) return MDB_SUCCESS; } -/** Further setup required for opening an MDB environment +/** Further setup required for opening an LMDB environment */ -static int +static int ESECT mdb_env_open2(MDB_env *env) { unsigned int flags = env->me_flags; @@ -3714,7 +3848,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; #endif /** Downgrade the exclusive lock on the region back to shared */ -static int +static int ESECT mdb_env_share_locks(MDB_env *env, int *excl) { int rc = 0, toggle = mdb_env_pick_meta(env); @@ -3756,7 +3890,7 @@ mdb_env_share_locks(MDB_env *env, int *excl) /** Try to get exlusive lock, otherwise shared. * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. */ -static int +static int ESECT mdb_env_excl_lock(MDB_env *env, int *excl) { int rc = 0; @@ -3891,14 +4025,14 @@ mdb_hash_enc(MDB_val *val, char *encbuf) #endif /** Open and/or initialize the lock region for the environment. - * @param[in] env The MDB environment. + * @param[in] env The LMDB environment. * @param[in] lpath The pathname of the file used for the lock region. * @param[in] mode The Unix permissions for the file, if we create it. * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive * @return 0 on success, non-zero on failure. */ -static int +static int ESECT mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { #ifdef _WIN32 @@ -4128,7 +4262,7 @@ fail: # error "Persistent DB flags & env flags overlap, but both go in mm_flags" #endif -int +int ESECT mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) { int oflags, rc, len, excl = -1; @@ -4256,7 +4390,7 @@ leave: } /** Destroy resources from mdb_env_open(), clear our readers & DBIs */ -static void +static void ESECT mdb_env_close0(MDB_env *env, int excl) { int i; @@ -4344,153 +4478,8 @@ mdb_env_close0(MDB_env *env, int excl) env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); } -int -mdb_env_copyfd(MDB_env *env, HANDLE fd) -{ - MDB_txn *txn = NULL; - int rc; - size_t wsize; - char *ptr; -#ifdef _WIN32 - DWORD len, w2; -#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) -#else - ssize_t len; - size_t w2; -#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) -#endif - - /* Do the lock/unlock of the reader mutex before starting the - * write txn. Otherwise other read txns could block writers. - */ - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - return rc; - - if (env->me_txns) { - /* We must start the actual read txn after blocking writers */ - mdb_txn_reset0(txn, "reset-stage1"); - - /* Temporarily block writers until we snapshot the meta pages */ - LOCK_MUTEX_W(env); - - rc = mdb_txn_renew0(txn); - if (rc) { - UNLOCK_MUTEX_W(env); - goto leave; - } - } - - wsize = env->me_psize * 2; - ptr = env->me_map; - w2 = wsize; - while (w2 > 0) { - DO_WRITE(rc, fd, ptr, w2, len); - if (!rc) { - rc = ErrCode(); - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - w2 -= len; - continue; - } else { - /* Non-blocking or async handles are not supported */ - rc = EIO; - break; - } - } - if (env->me_txns) - UNLOCK_MUTEX_W(env); - - if (rc) - goto leave; - - wsize = txn->mt_next_pgno * env->me_psize - wsize; - while (wsize > 0) { - if (wsize > MAX_WRITE) - w2 = MAX_WRITE; - else - w2 = wsize; - DO_WRITE(rc, fd, ptr, w2, len); - if (!rc) { - rc = ErrCode(); - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - -leave: - mdb_txn_abort(txn); - return rc; -} - -int -mdb_env_copy(MDB_env *env, const char *path) -{ - int rc, len; - char *lpath; - HANDLE newfd = INVALID_HANDLE_VALUE; - - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } - - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. - */ -#ifdef _WIN32 - newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, - FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); -#else - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); -#endif - if (newfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } - -#ifdef O_DIRECT - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); -#endif -#ifdef F_NOCACHE /* __APPLE__ */ - rc = fcntl(newfd, F_NOCACHE, 1); - if (rc) { - rc = ErrCode(); - goto leave; - } -#endif - - rc = mdb_env_copyfd(env, newfd); - -leave: - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd != INVALID_HANDLE_VALUE) - if (close(newfd) < 0 && rc == MDB_SUCCESS) - rc = ErrCode(); - - return rc; -} -void +void ESECT mdb_env_close(MDB_env *env) { MDB_page *dp; @@ -4542,7 +4531,16 @@ mdb_cmp_cint(const MDB_val *a, const MDB_val *b) } while(!x && u > (unsigned short *)a->mv_data); return x; #else - return memcmp(a->mv_data, b->mv_data, a->mv_size); + unsigned short *u, *c, *end; + int x; + + end = (unsigned short *) ((char *) a->mv_data + a->mv_size); + u = (unsigned short *)a->mv_data; + c = (unsigned short *)b->mv_data; + do { + x = *u++ - *c++; + } while(!x && u < end); + return x; #endif } @@ -5421,7 +5419,7 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE) { + if (op == MDB_SET_RANGE && !exactp) { rc = 0; goto set1; } else @@ -5457,8 +5455,10 @@ set1: mc->mc_flags &= ~C_EOF; if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } return MDB_SUCCESS; } @@ -5942,13 +5942,12 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } else { /* there's only a key anyway, so this is a no-op */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; unsigned int ksize = mc->mc_db->md_pad; if (key->mv_size != ksize) return MDB_BAD_VALSIZE; - if (flags == MDB_CURRENT) { - char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->mv_data, ksize); - } + ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); return MDB_SUCCESS; } @@ -5978,12 +5977,12 @@ more: if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) mc->mc_dbx->md_dcmp = mdb_cmp_clong; #endif - /* if data matches, skip it */ + /* does data match? */ if (!mc->mc_dbx->md_dcmp(data, &olddata)) { if (flags & MDB_NODUPDATA) return MDB_KEYEXIST; - rc = MDB_SUCCESS; - goto next_sub; + /* overwrite it */ + goto current; } /* Back up original data item */ @@ -6259,7 +6258,6 @@ put_sub: */ mc->mc_flags |= C_INITIALIZED; } -next_sub: if (flags & MDB_MULTIPLE) { if (!rc) { mcount++; @@ -6345,6 +6343,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) return rc; } /* otherwise fall thru and delete the sub-DB */ + mc->mc_xcursor->mx_cursor.mc_flags |= C_EOF; } if (leaf->mn_flags & F_SUBDATA) { @@ -7054,20 +7053,20 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) MDB_node *s2; MDB_val bkey; /* must find the lowest key below dst */ - rc = mdb_page_search_lowest(cdst); + mdb_cursor_copy(cdst, &mn); + rc = mdb_page_search_lowest(&mn); if (rc) return rc; - if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { - bkey.mv_size = cdst->mc_db->md_pad; - bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_pad; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); } else { - s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); bkey.mv_size = NODEKSZ(s2); bkey.mv_data = NODEKEY(s2); } - cdst->mc_snum = snum--; - cdst->mc_top = snum; - mdb_cursor_copy(cdst, &mn); + mn.mc_snum = snum--; + mn.mc_top = snum; mn.mc_ki[snum] = 0; rc = mdb_update_key(&mn, &bkey); if (rc) @@ -7183,14 +7182,17 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { - int rc; - indx_t i, j; - MDB_node *srcnode; + MDB_page *psrc, *pdst; + MDB_node *srcnode; MDB_val key, data; - unsigned nkeys; + unsigned nkeys; + int rc; + indx_t i, j; - DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_pg[cdst->mc_top]->mp_pgno)); + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ mdb_cassert(csrc, cdst->mc_snum > 1); @@ -7201,36 +7203,35 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) /* Move all nodes from src to dst. */ - j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]); - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + j = nkeys = NUMKEYS(pdst); + if (IS_LEAF2(psrc)) { key.mv_size = csrc->mc_db->md_pad; - key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]); - for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { + key.mv_data = METADATA(psrc); + for (i = 0; i < NUMKEYS(psrc); i++, j++) { rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); if (rc != MDB_SUCCESS) return rc; key.mv_data = (char *)key.mv_data + key.mv_size; } } else { - for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i); - if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned int snum = csrc->mc_snum; + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + srcnode = NODEPTR(psrc, i); + if (i == 0 && IS_BRANCH(psrc)) { + MDB_cursor mn; MDB_node *s2; + mdb_cursor_copy(csrc, &mn); /* must find the lowest key below src */ - rc = mdb_page_search_lowest(csrc); + rc = mdb_page_search_lowest(&mn); if (rc) return rc; - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_pad; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); key.mv_size = NODEKSZ(s2); key.mv_data = NODEKEY(s2); } - csrc->mc_snum = snum--; - csrc->mc_top = snum; } else { key.mv_size = srcnode->mn_ksize; key.mv_data = NODEKEY(srcnode); @@ -7245,8 +7246,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) } DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", - cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), - (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10)); + pdst->mp_pgno, NUMKEYS(pdst), + (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); /* Unlink the src page from parent and add to free list. */ @@ -7262,11 +7263,14 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) } csrc->mc_top++; - rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs, - csrc->mc_pg[csrc->mc_top]->mp_pgno); + psrc = csrc->mc_pg[csrc->mc_top]; + /* If not operating on FreeDB, allow this page to be reused + * in this txn. Otherwise just add to free list. + */ + rc = mdb_page_loose(csrc, psrc); if (rc) return rc; - if (IS_LEAF(csrc->mc_pg[csrc->mc_top])) + if (IS_LEAF(psrc)) csrc->mc_db->md_leaf_pages--; else csrc->mc_db->md_branch_pages--; @@ -7274,7 +7278,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mp = cdst->mc_pg[cdst->mc_top]; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) @@ -7283,8 +7286,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3 = m2; if (m3 == csrc) continue; if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = mp; + if (m3->mc_pg[csrc->mc_top] == psrc) { + m3->mc_pg[csrc->mc_top] = pdst; m3->mc_ki[csrc->mc_top] += nkeys; } } @@ -8037,19 +8040,571 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, return mdb_cursor_put(&mc, key, data, flags); } -int -mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) -{ - if ((flag & CHANGEABLE) != flag) - return EINVAL; - if (onoff) - env->me_flags |= flag; +#ifndef MDB_WBUF +#define MDB_WBUF (1024*1024) +#endif + + /** State needed for a compacting copy. */ +typedef struct mdb_copy { + pthread_mutex_t mc_mutex; + pthread_cond_t mc_cond; + char *mc_wbuf[2]; + char *mc_over[2]; + MDB_env *mc_env; + MDB_txn *mc_txn; + int mc_wlen[2]; + int mc_olen[2]; + pgno_t mc_next_pgno; + HANDLE mc_fd; + int mc_status; + volatile int mc_new; + int mc_toggle; + +} mdb_copy; + + /** Dedicated writer thread for compacting copy. */ +static THREAD_RET ESECT +mdb_env_copythr(void *arg) +{ + mdb_copy *my = arg; + char *ptr; + int toggle = 0, wsize, rc; +#ifdef _WIN32 + DWORD len; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + int len; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + pthread_mutex_lock(&my->mc_mutex); + my->mc_new = 0; + pthread_cond_signal(&my->mc_cond); + for(;;) { + while (!my->mc_new) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + if (my->mc_new < 0) { + my->mc_new = 0; + break; + } + my->mc_new = 0; + wsize = my->mc_wlen[toggle]; + ptr = my->mc_wbuf[toggle]; +again: + while (wsize > 0) { + DO_WRITE(rc, my->mc_fd, ptr, wsize, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + if (rc) { + my->mc_status = rc; + break; + } + /* If there's an overflow page tail, write it too */ + if (my->mc_olen[toggle]) { + wsize = my->mc_olen[toggle]; + ptr = my->mc_over[toggle]; + my->mc_olen[toggle] = 0; + goto again; + } + my->mc_wlen[toggle] = 0; + toggle ^= 1; + pthread_cond_signal(&my->mc_cond); + } + pthread_cond_signal(&my->mc_cond); + pthread_mutex_unlock(&my->mc_mutex); + return (THREAD_RET)0; +#undef DO_WRITE +} + + /** Tell the writer thread there's a buffer ready to write */ +static int ESECT +mdb_env_cthr_toggle(mdb_copy *my, int st) +{ + int toggle = my->mc_toggle ^ 1; + pthread_mutex_lock(&my->mc_mutex); + if (my->mc_status) { + pthread_mutex_unlock(&my->mc_mutex); + return my->mc_status; + } + while (my->mc_new == 1) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + my->mc_new = st; + my->mc_toggle = toggle; + pthread_cond_signal(&my->mc_cond); + pthread_mutex_unlock(&my->mc_mutex); + return 0; +} + + /** Depth-first tree traversal for compacting copy. */ +static int ESECT +mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) +{ + MDB_cursor mc; + MDB_txn *txn = my->mc_txn; + MDB_node *ni; + MDB_page *mo, *mp, *leaf; + char *buf, *ptr; + int rc, toggle; + unsigned int i; + + /* Empty DB, nothing to do */ + if (*pg == P_INVALID) + return MDB_SUCCESS; + + mc.mc_snum = 1; + mc.mc_top = 0; + mc.mc_txn = txn; + + rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL); + if (rc) + return rc; + rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); + if (rc) + return rc; + + /* Make cursor pages writable */ + buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); + if (buf == NULL) + return ENOMEM; + + for (i=0; imc_env->me_psize); + mc.mc_pg[i] = (MDB_page *)ptr; + ptr += my->mc_env->me_psize; + } + + /* This is writable space for a leaf page. Usually not needed. */ + leaf = (MDB_page *)ptr; + + toggle = my->mc_toggle; + while (mc.mc_snum > 0) { + unsigned n; + mp = mc.mc_pg[mc.mc_top]; + n = NUMKEYS(mp); + + if (IS_LEAF(mp)) { + if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(txn, pg, &omp, NULL); + if (rc) + goto done; + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + memcpy(mo, omp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno; + my->mc_next_pgno += omp->mp_pages; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (omp->mp_pages > 1) { + my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); + my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); + } else if (ni->mn_flags & F_SUBDATA) { + MDB_db db; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&db, NODEDATA(ni), sizeof(db)); + my->mc_toggle = toggle; + rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); + if (rc) + goto done; + toggle = my->mc_toggle; + memcpy(NODEDATA(ni), &db, sizeof(db)); + } + } + } + } else { + mc.mc_ki[mc.mc_top]++; + if (mc.mc_ki[mc.mc_top] < n) { + pgno_t pg; +again: + ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); + pg = NODEPGNO(ni); + rc = mdb_page_get(txn, pg, &mp, NULL); + if (rc) + goto done; + mc.mc_top++; + mc.mc_snum++; + mc.mc_ki[mc.mc_top] = 0; + if (IS_BRANCH(mp)) { + /* Whenever we advance to a sibling branch page, + * we must proceed all the way down to its first leaf. + */ + mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); + goto again; + } else + mc.mc_pg[mc.mc_top] = mp; + continue; + } + } + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mdb_page_copy(mo, mp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno++; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (mc.mc_top) { + /* Update parent if there is one */ + ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); + SETPGNO(ni, mo->mp_pgno); + mdb_cursor_pop(&mc); + } else { + /* Otherwise we're done */ + *pg = mo->mp_pgno; + break; + } + } +done: + free(buf); + return rc; +} + + /** Copy environment with compaction. */ +static int ESECT +mdb_env_copyfd1(MDB_env *env, HANDLE fd) +{ + MDB_meta *mm; + MDB_page *mp; + mdb_copy my; + MDB_txn *txn = NULL; + pthread_t thr; + int rc; + +#ifdef _WIN32 + my.mc_mutex = CreateMutex(NULL, FALSE, NULL); + my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL); + my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_psize); + if (my.mc_wbuf[0] == NULL) + return errno; +#else + pthread_mutex_init(&my.mc_mutex, NULL); + pthread_cond_init(&my.mc_cond, NULL); + rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_psize, MDB_WBUF*2); + if (rc) + return rc; +#endif + my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + my.mc_wlen[0] = 0; + my.mc_wlen[1] = 0; + my.mc_olen[0] = 0; + my.mc_olen[1] = 0; + my.mc_next_pgno = 2; + my.mc_status = 0; + my.mc_new = 1; + my.mc_toggle = 0; + my.mc_env = env; + my.mc_fd = fd; + THREAD_CREATE(thr, mdb_env_copythr, &my); + + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + mp = (MDB_page *)my.mc_wbuf[0]; + memset(mp, 0, 2*env->me_psize); + mp->mp_pgno = 0; + mp->mp_flags = P_META; + mm = (MDB_meta *)METADATA(mp); + mdb_env_init_meta0(env, mm); + mm->mm_address = env->me_metas[0]->mm_address; + + mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); + mp->mp_pgno = 1; + mp->mp_flags = P_META; + *(MDB_meta *)METADATA(mp) = *mm; + mm = (MDB_meta *)METADATA(mp); + + /* Count the number of free pages, subtract from lastpg to find + * number of active pages + */ + { + MDB_ID freecount = 0; + MDB_cursor mc; + MDB_val key, data; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + freecount += txn->mt_dbs[0].md_branch_pages + + txn->mt_dbs[0].md_leaf_pages + + txn->mt_dbs[0].md_overflow_pages; + + /* Set metapage 1 */ + mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; + mm->mm_dbs[1] = txn->mt_dbs[1]; + mm->mm_dbs[1].md_root = mm->mm_last_pg; + mm->mm_txnid = 1; + } + my.mc_wlen[0] = env->me_psize * 2; + my.mc_txn = txn; + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0); + if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) + rc = mdb_env_cthr_toggle(&my, 1); + mdb_env_cthr_toggle(&my, -1); + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + THREAD_FINISH(thr); + + mdb_txn_abort(txn); +#ifdef _WIN32 + CloseHandle(my.mc_cond); + CloseHandle(my.mc_mutex); + _aligned_free(my.mc_wbuf[0]); +#else + pthread_cond_destroy(&my.mc_cond); + pthread_mutex_destroy(&my.mc_mutex); + free(my.mc_wbuf[0]); +#endif + return rc; +} + + /** Copy environment as-is. */ +static int ESECT +mdb_env_copyfd0(MDB_env *env, HANDLE fd) +{ + MDB_txn *txn = NULL; + int rc; + size_t wsize; + char *ptr; +#ifdef _WIN32 + DWORD len, w2; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + ssize_t len; + size_t w2; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. + */ + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + if (env->me_txns) { + /* We must start the actual read txn after blocking writers */ + mdb_txn_reset0(txn, "reset-stage1"); + + /* Temporarily block writers until we snapshot the meta pages */ + LOCK_MUTEX_W(env); + + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX_W(env); + goto leave; + } + } + + wsize = env->me_psize * 2; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + /* Non-blocking or async handles are not supported */ + rc = EIO; + break; + } + } + if (env->me_txns) + UNLOCK_MUTEX_W(env); + + if (rc) + goto leave; + + w2 = txn->mt_next_pgno * env->me_psize; +#ifdef WIN32 + { + LARGE_INTEGER fsize; + GetFileSizeEx(env->me_fd, &fsize); + if (w2 > fsize.QuadPart) + w2 = fsize.QuadPart; + } +#else + { + struct stat st; + fstat(env->me_fd, &st); + if (w2 > (size_t)st.st_size) + w2 = st.st_size; + } +#endif + wsize = w2 - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdb_txn_abort(txn); + return rc; +} + +int ESECT +mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) +{ + if (flags & MDB_CP_COMPACT) + return mdb_env_copyfd1(env, fd); + else + return mdb_env_copyfd0(env, fd); +} + +int ESECT +mdb_env_copyfd(MDB_env *env, HANDLE fd) +{ + return mdb_env_copyfd2(env, fd, 0); +} + +int ESECT +mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) +{ + int rc, len; + char *lpath; + HANDLE newfd = INVALID_HANDLE_VALUE; + + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. + */ +#ifdef _WIN32 + newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); +#else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); +#endif + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + +#ifdef O_DIRECT + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif +#ifdef F_NOCACHE /* __APPLE__ */ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } +#endif + + rc = mdb_env_copyfd2(env, newfd, flags); + +leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); + + return rc; +} + +int ESECT +mdb_env_copy(MDB_env *env, const char *path) +{ + return mdb_env_copy2(env, path, 0); +} + +int ESECT +mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) +{ + if ((flag & CHANGEABLE) != flag) + return EINVAL; + if (onoff) + env->me_flags |= flag; else env->me_flags &= ~flag; return MDB_SUCCESS; } -int +int ESECT mdb_env_get_flags(MDB_env *env, unsigned int *arg) { if (!env || !arg) @@ -8059,7 +8614,7 @@ mdb_env_get_flags(MDB_env *env, unsigned int *arg) return MDB_SUCCESS; } -int +int ESECT mdb_env_set_userctx(MDB_env *env, void *ctx) { if (!env) @@ -8068,13 +8623,13 @@ mdb_env_set_userctx(MDB_env *env, void *ctx) return MDB_SUCCESS; } -void * +void * ESECT mdb_env_get_userctx(MDB_env *env) { return env ? env->me_userctx : NULL; } -int +int ESECT mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) { if (!env) @@ -8085,7 +8640,7 @@ mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) return MDB_SUCCESS; } -int +int ESECT mdb_env_get_path(MDB_env *env, const char **arg) { if (!env || !arg) @@ -8095,7 +8650,7 @@ mdb_env_get_path(MDB_env *env, const char **arg) return MDB_SUCCESS; } -int +int ESECT mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) { if (!env || !arg) @@ -8111,7 +8666,7 @@ mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) * @param[out] arg the address of an #MDB_stat structure to receive the stats. * @return 0, this function always succeeds. */ -static int +static int ESECT mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) { arg->ms_psize = env->me_psize; @@ -8123,7 +8678,8 @@ mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) return MDB_SUCCESS; } -int + +int ESECT mdb_env_stat(MDB_env *env, MDB_stat *arg) { int toggle; @@ -8136,7 +8692,7 @@ mdb_env_stat(MDB_env *env, MDB_stat *arg) return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); } -int +int ESECT mdb_env_info(MDB_env *env, MDB_envinfo *arg) { int toggle; @@ -8145,7 +8701,7 @@ mdb_env_info(MDB_env *env, MDB_envinfo *arg) return EINVAL; toggle = mdb_env_pick_meta(env); - arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0; + arg->me_mapaddr = env->me_metas[toggle]->mm_address; arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; @@ -8493,12 +9049,14 @@ int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) return MDB_SUCCESS; } -int mdb_env_get_maxkeysize(MDB_env *env) +int ESECT +mdb_env_get_maxkeysize(MDB_env *env) { return ENV_MAXKEY(env); } -int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) +int ESECT +mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { unsigned int i, rdrs; MDB_reader *mr; @@ -8538,7 +9096,8 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) /** Insert pid into list if not already present. * return -1 if already present. */ -static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) +static int ESECT +mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) { /* binary search of pid in list */ unsigned base = 0; @@ -8574,7 +9133,8 @@ static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) return 0; } -int mdb_reader_check(MDB_env *env, int *dead) +int ESECT +mdb_reader_check(MDB_env *env, int *dead) { unsigned int i, j, rdrs; MDB_reader *mr;