X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblmdb%2Fmdb.c;h=b47cb53a2ce1082c900437ec853db4c1c58b2b31;hb=98b2910ee89e9fbc6c2df00d3dd35aeca7b86daf;hp=fdfcb351097fe2ef2d17e99e2d4bf7fdf0dd22bb;hpb=a90415f3011399726a16e1d71d0ef757cda69896;p=openldap diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index fdfcb35109..b47cb53a2c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2015 Howard Chu, Symas Corp. + * Copyright 2011-2017 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,9 +35,14 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif +#if defined(__WIN64__) +#define _FILE_OFFSET_BITS 64 +#endif #ifdef _WIN32 #include #include +#include /* get wcscpy() */ + /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it * as int64 which is wrong. MSVC doesn't define it at all, so just * don't use it. @@ -108,6 +113,10 @@ typedef SSIZE_T ssize_t; /* Most platforms have posix_memalign, older may only have memalign */ #define HAVE_MEMALIGN 1 #include +/* On Solaris, we need the POSIX sigwait function */ +#if defined (__sun) +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif #endif #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) @@ -115,7 +124,7 @@ typedef SSIZE_T ssize_t; #include /* defines BYTE_ORDER on HPUX and Solaris */ #endif -#if defined(__APPLE__) || defined (BSD) +#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) # define MDB_USE_POSIX_SEM 1 # define MDB_FDATASYNC fsync #elif defined(ANDROID) @@ -124,6 +133,7 @@ typedef SSIZE_T ssize_t; #ifndef _WIN32 #include +#include #ifdef MDB_USE_POSIX_SEM # define MDB_USE_HASH 1 #include @@ -198,7 +208,7 @@ typedef SSIZE_T ssize_t; #define ESECT #endif -#ifdef _MSC_VER +#ifdef _WIN32 #define CALL_CONV WINAPI #else #define CALL_CONV @@ -244,10 +254,9 @@ typedef SSIZE_T ssize_t; /** Some platforms define the EOWNERDEAD error code * even though they don't support Robust Mutexes. * Compile with -DMDB_USE_ROBUST=0, or use some other - * mechanism like -DMDB_USE_SYSV_SEM instead of - * -DMDB_USE_POSIX_MUTEX. (SysV semaphores are - * also Robust, but some systems don't support them - * either.) + * mechanism like -DMDB_USE_POSIX_SEM instead of + * -DMDB_USE_POSIX_MUTEX. + * (Posix semaphores are not robust.) */ #ifndef MDB_USE_ROBUST /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ @@ -256,16 +265,20 @@ typedef SSIZE_T ssize_t; # define MDB_USE_ROBUST 0 # else # define MDB_USE_ROBUST 1 -/* glibc < 2.10 only provided _np API */ -# if defined(__GLIBC__) && GLIBC_VER < 0x02000a +# endif +#endif /* !MDB_USE_ROBUST */ + +#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) +/* glibc < 2.12 only provided _np API */ +# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ + (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) # endif -# endif -#endif /* MDB_USE_ROBUST */ +#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ -#if defined(MDB_OWNERDEAD) && MDB_USE_ROBUST +#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) #define MDB_ROBUST_SUPPORTED 1 #endif @@ -288,8 +301,10 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t; #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) #define pthread_cond_signal(x) SetEvent(*x) #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) -#define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) -#define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) +#define THREAD_CREATE(thr,start,arg) \ + (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) +#define THREAD_FINISH(thr) \ + (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) #define mdb_mutex_consistent(mutex) 0 @@ -330,15 +345,15 @@ mdb_sem_wait(sem_t *sem) } #else /* MDB_USE_POSIX_MUTEX: */ - /** Shared mutex/semaphore as it is stored (mdb_mutex_t), and as - * local variables keep it (mdb_mutexref_t). + /** Shared mutex/semaphore as the original is stored. * - * When #mdb_mutexref_t is a pointer declaration and #mdb_mutex_t is - * not, then it is array[size 1] so it can be assigned to a pointer. - * @{ + * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. + * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it + * is array[size 1] so it can be assigned to the pointer. */ -typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; - /* @} */ +typedef pthread_mutex_t mdb_mutex_t[1]; + /** Reference to an #mdb_mutex_t */ +typedef pthread_mutex_t *mdb_mutexref_t; /** Lock the reader or writer mutex. * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). */ @@ -403,10 +418,13 @@ static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); * * @note If O_DSYNC is undefined but exists in /usr/include, * preferably set some compiler flag to get the definition. - * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC. */ #ifndef MDB_DSYNC +# ifdef O_DSYNC # define MDB_DSYNC O_DSYNC +# else +# define MDB_DSYNC O_SYNC +# endif #endif #endif @@ -760,9 +778,23 @@ typedef struct MDB_txninfo { + (((MDB_PIDLOCK) != 0) << 16))) /** @} */ -/** Common header for all page types. - * Overflow records occupy a number of contiguous pages with no - * headers on any page after the first. +/** Common header for all page types. The page type depends on #mp_flags. + * + * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with + * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages + * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. + * + * #P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of #F_BIGDATA nodes. + * + * #P_SUBP sub-pages are small leaf "pages" with duplicate data. + * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. */ typedef struct MDB_page { #define mp_pgno mp_p.p_pgno @@ -771,7 +803,7 @@ typedef struct MDB_page { pgno_t p_pgno; /**< page number */ struct MDB_page *p_next; /**< for in-memory list of freed pages */ } mp_p; - uint16_t mp_pad; + uint16_t mp_pad; /**< key size if this is a LEAF2 page */ /** @defgroup mdb_page Page Flags * @ingroup internal * Flags for the page headers. @@ -838,25 +870,34 @@ typedef struct MDB_page { /** The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - /** Link in #MDB_txn.%mt_loose_pgs list */ + /** Link in #MDB_txn.%mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. + */ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) /** Header for a single key/data pair within a page. * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. * We guarantee 2-byte alignment for 'MDB_node's. + * + * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. #F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just #F_SUBDATA). */ typedef struct MDB_node { - /** lo and hi are used for data size on leaf nodes and for - * child pgno on branch nodes. On 64 bit platforms, flags - * is also used for pgno. (Branch nodes have no flags). - * They are in host byte order in case that lets some - * accesses be optimized into a 32-bit word access. - */ + /** part of data size or pgno + * @{ */ #if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ + unsigned short mn_lo, mn_hi; #else unsigned short mn_hi, mn_lo; #endif + /** @} */ /** @defgroup mdb_node Node Flags * @ingroup internal * Flags for node headers. @@ -966,9 +1007,9 @@ typedef struct MDB_db { pgno_t md_root; /**< the root page of this tree */ } MDB_db; - /** mdb_dbi_open flags */ #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) + /** #mdb_dbi_open() flags */ #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) @@ -999,7 +1040,10 @@ typedef struct MDB_meta { #define mm_psize mm_dbs[FREE_DBI].md_pad /** Any persistent environment flags. @ref mdb_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags - pgno_t mm_last_pg; /**< last used page in file */ + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. + */ + pgno_t mm_last_pg; volatile txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; @@ -1049,7 +1093,7 @@ struct MDB_txn { * in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ MDB_page *mt_loose_pgs; - /* #Number of loose pages (#mt_loose_pgs) */ + /** Number of loose pages (#mt_loose_pgs) */ int mt_loose_count; /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are @@ -1072,11 +1116,12 @@ struct MDB_txn { * @ingroup internal * @{ */ -#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; @@ -1158,7 +1203,6 @@ struct MDB_cursor { #define C_EOF 0x02 /**< No more data */ #define C_SUB 0x04 /**< Cursor is a sub-cursor */ #define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_SPLITTING 0x20 /**< Cursor is in page_split */ #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ /** @} */ unsigned int mc_flags; /**< @ref mdb_cursor */ @@ -1182,6 +1226,21 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; + /** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + + /** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed + * when the node which contains the sub-page may have moved. Called + * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. + */ +#define XCURSOR_REFRESH(mc, mp, ki) do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node = NODEPTR(xr_pg, ki); \ + if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ +} while (0) + /** State of FreeDB old pages, stored in the MDB_env */ typedef struct MDB_pgstate { pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ @@ -1192,7 +1251,7 @@ typedef struct MDB_pgstate { struct MDB_env { HANDLE me_fd; /**< The main data file */ HANDLE me_lfd; /**< The lock file */ - HANDLE me_mfd; /**< just for writing the meta pages */ + HANDLE me_mfd; /**< For writing and syncing the meta pages */ /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U /** Some fields are initialized. */ @@ -1269,7 +1328,7 @@ typedef struct MDB_ntxn { #endif /** max bytes to write in one call */ -#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) +#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) /** Check \b txn and \b dbi arguments to a function */ #define TXN_DBI_EXIST(txn, dbi, validity) \ @@ -1296,7 +1355,7 @@ enum { #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ static void mdb_txn_end(MDB_txn *txn, unsigned mode); -static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); #define MDB_PS_MODIFY 1 @@ -1324,8 +1383,8 @@ static int mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); static void mdb_node_del(MDB_cursor *mc, int ksize); static void mdb_node_shrink(MDB_page *mp, indx_t indx); -static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst); -static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); +static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); +static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); static size_t mdb_branch_size(MDB_env *env, MDB_val *key); @@ -1369,6 +1428,9 @@ static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_ static SECURITY_DESCRIPTOR mdb_null_sd; static SECURITY_ATTRIBUTES mdb_all_sa; static int mdb_sec_inited; + +struct MDB_name; +static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); #endif /** Return the library version info. */ @@ -1413,8 +1475,9 @@ mdb_strerror(int err) * This works as long as no function between the call to mdb_strerror * and the actual use of the message uses more than 4K of stack. */ - char pad[4096]; - char buf[1024], *ptr = buf; +#define MSGSIZE 1024 +#define PADSIZE 4096 + char buf[MSGSIZE+PADSIZE], *ptr = buf; #endif int i; if (!err) @@ -1446,7 +1509,7 @@ mdb_strerror(int err) buf[0] = 0; FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, 0, ptr, sizeof(buf), (va_list *)pad); + NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE); return ptr; #else return strerror(err); @@ -1554,7 +1617,7 @@ mdb_page_list(MDB_page *mp) pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); return; default: - fprintf(stderr, "Bad page %"Z"u flags 0x%u\n", pgno, mp->mp_flags); + fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags); return; } @@ -1600,7 +1663,7 @@ mdb_cursor_chk(MDB_cursor *mc) MDB_node *node; MDB_page *mp; - if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return; + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; for (i=0; imc_top; i++) { mp = mc->mc_pg[i]; node = NODEPTR(mp, mc->mc_ki[i]); @@ -1609,6 +1672,13 @@ mdb_cursor_chk(MDB_cursor *mc) } if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) printf("ack!\n"); + if (XCURSOR_INITED(mc)) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && + mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { + printf("blah!\n"); + } + } } #endif @@ -1663,7 +1733,7 @@ static void mdb_audit(MDB_txn *txn) } } if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", + fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", txn->mt_txnid, freecount, count+NUM_METAS, freecount+count+NUM_METAS, txn->mt_next_pgno); } @@ -1689,6 +1759,7 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) /** Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. */ static MDB_page * mdb_page_malloc(MDB_txn *txn, unsigned num) @@ -1831,7 +1902,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3; + MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; MDB_page *dp, *mp; MDB_node *leaf; @@ -1874,7 +1945,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) break; if ((dp->mp_flags & Mask) == pflags && level <= 1) dp->mp_flags ^= P_KEEP; @@ -2056,7 +2127,7 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) } /** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. @@ -2178,8 +2249,8 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) } np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) - return rc; + if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) + goto fail; idl = (MDB_ID *) data.mv_data; i = idl[0]; @@ -2329,6 +2400,7 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) } /** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc cursor pointing to the page to be touched * @return 0 on success, non-zero on failure. */ @@ -2414,16 +2486,11 @@ done: } else { for (; m2; m2=m2->mc_next) { if (m2->mc_snum < mc->mc_snum) continue; + if (m2 == mc) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - IS_LEAF(np) && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) - { - MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); - if (!(leaf->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } + if (XCURSOR_INITED(m2) && IS_LEAF(np)) + XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); } } } @@ -2485,14 +2552,15 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) *bk = *mc; mc->mc_backup = bk; mc->mc_db = &dst->mt_dbs[i]; - /* Kill pointers into src - and dst to reduce abuse: The - * user may not use mc until dst ends. Otherwise we'd... + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. */ - mc->mc_txn = NULL; /* ...set this to dst */ - mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */ + mc->mc_txn = dst; + mc->mc_dbflag = &dst->mt_dbflags[i]; if ((mx = mc->mc_xcursor) != NULL) { *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = NULL; /* ...and dst. */ + mx->mx_cursor.mc_txn = dst; } mc->mc_next = dst->mt_cursors[i]; dst->mt_cursors[i] = mc; @@ -3426,6 +3494,25 @@ mdb_txn_commit(MDB_txn *txn) pspill[0] = y; } + /* Remove anything in our spill list from parent's dirty list */ + if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { + for (i=1; i<=txn->mt_spill_pgs[0]; i++) { + MDB_ID pn = txn->mt_spill_pgs[i]; + if (pn & 1) + continue; /* deleted spillpg */ + pn >>= 1; + y = mdb_mid2l_search(dst, pn); + if (y <= dst[0].mid && dst[y].mid == pn) { + free(dst[y].mptr); + while (y < dst[0].mid) { + dst[y] = dst[y+1]; + y++; + } + dst[0].mid--; + } + } + } + /* Find len = length of merging our dirty list with parent's */ x = dst[0].mid; dst[0].mid = 0; /* simplify loops */ @@ -3471,7 +3558,7 @@ mdb_txn_commit(MDB_txn *txn) } /* Append our loose page list to parent's */ - for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp)) + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) ; *lp = txn->mt_loose_pgs; parent->mt_loose_count += txn->mt_loose_count; @@ -3754,7 +3841,10 @@ mdb_env_write_meta(MDB_txn *txn) len = sizeof(MDB_meta) - off; off += (char *)mp - env->me_map; - /* Write to the SYNC fd */ + /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. + * (me_mfd goes to the same file as me_fd, but writing to it + * also syncs to disk. Avoids a separate fdatasync() call.) + */ mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; #ifdef _WIN32 { @@ -4009,6 +4099,189 @@ mdb_fsize(HANDLE fd, size_t *size) return MDB_SUCCESS; } + +#ifdef _WIN32 +typedef wchar_t mdb_nchar_t; +# define MDB_NAME(str) L##str +# define mdb_name_cpy wcscpy +#else +/** Character type for file names: char on Unix, wchar_t on Windows */ +typedef char mdb_nchar_t; +# define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ +# define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ +#endif + +/** Filename - string of #mdb_nchar_t[] */ +typedef struct MDB_name { + int mn_len; /**< Length */ + int mn_alloced; /**< True if #mn_val was malloced */ + mdb_nchar_t *mn_val; /**< Contents */ +} MDB_name; + +/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ +static const mdb_nchar_t *const mdb_suffixes[2][2] = { + { MDB_NAME("/data.mdb"), MDB_NAME("") }, + { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } +}; + +#define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ + +/** Set up filename + scratch area for filename suffix, for opening files. + * It should be freed with #mdb_fname_destroy(). + * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. + * + * @param[in] path Pathname for #mdb_env_open(). + * @param[in] envflags Whether a subdir and/or lockfile will be used. + * @param[out] fname Resulting filename, with room for a suffix if necessary. + */ +static int ESECT +mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) +{ + int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); + fname->mn_alloced = 0; +#ifdef _WIN32 + return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); +#else + fname->mn_len = strlen(path); + if (no_suffix) + fname->mn_val = (char *) path; + else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { + fname->mn_alloced = 1; + strcpy(fname->mn_val, path); + } + else + return ENOMEM; + return MDB_SUCCESS; +#endif +} + +/** Destroy \b fname from #mdb_fname_init() */ +#define mdb_fname_destroy(fname) \ + do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) + +#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ +# define MDB_CLOEXEC O_CLOEXEC +#else +# define MDB_CLOEXEC 0 +#endif + +/** File type, access mode etc. for #mdb_fopen() */ +enum mdb_fopen_type { +#ifdef _WIN32 + MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS +#else + /* A comment in mdb_fopen() explains some O_* flag choices. */ + MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ + MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ + MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ + MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ + /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits + * distinguish otherwise-equal MDB_O_* constants from each other. + */ + MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, + MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ +#endif +}; + +/** Open an LMDB file. + * @param[in] env The LMDB environment. + * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is + * appended if necessary to create the filename, without changing mn_len. + * @param[in] which Determines file type, access mode, etc. + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[out] res Resulting file handle. + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_fopen(const MDB_env *env, MDB_name *fname, + enum mdb_fopen_type which, mdb_mode_t mode, + HANDLE *res) +{ + int rc = MDB_SUCCESS; + HANDLE fd; +#ifdef _WIN32 + DWORD acc, share, disp, attrs; +#else + int flags; +#endif + + if (fname->mn_alloced) /* modifiable copy */ + mdb_name_cpy(fname->mn_val + fname->mn_len, + mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); + + /* The directory must already exist. Usually the file need not. + * MDB_O_META requires the file because we already created it using + * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. + * + * With MDB_O_COPY we do not want the OS to cache the writes, since + * the source data is already in the OS cache. + * + * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) + * to avoid the flock() issues noted under Caveats in lmdb.h. + * Also set it for other filehandles which the user cannot get at + * and close himself, which he may need after fork(). I.e. all but + * me_fd, which programs do use via mdb_env_get_fd(). + */ + +#ifdef _WIN32 + acc = GENERIC_READ|GENERIC_WRITE; + share = FILE_SHARE_READ|FILE_SHARE_WRITE; + disp = OPEN_ALWAYS; + attrs = FILE_ATTRIBUTE_NORMAL; + switch (which) { + case MDB_O_RDONLY: /* read-only datafile */ + acc = GENERIC_READ; + disp = OPEN_EXISTING; + break; + case MDB_O_META: /* for writing metapages */ + acc = GENERIC_WRITE; + disp = OPEN_EXISTING; + attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; + break; + case MDB_O_COPY: /* mdb_env_copy() & co */ + acc = GENERIC_WRITE; + share = 0; + disp = CREATE_NEW; + attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; + break; + default: break; /* silence gcc -Wswitch (not all enum values handled) */ + } + fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); +#else + fd = open(fname->mn_val, which & MDB_O_MASK, mode); +#endif + + if (fd == INVALID_HANDLE_VALUE) + rc = ErrCode(); +#ifndef _WIN32 + else { + if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { + /* Set CLOEXEC if we could not pass it to open() */ + if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) + (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); + } + if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { + /* This may require buffer alignment. There is no portable + * way to ask how much, so we require OS pagesize alignment. + */ +# ifdef F_NOCACHE /* __APPLE__ */ + (void) fcntl(fd, F_NOCACHE, 1); +# elif defined O_DIRECT + /* open(...O_DIRECT...) would break on filesystems without + * O_DIRECT support (ITS#7682). Try to set it here instead. + */ + if ((flags = fcntl(fd, F_GETFL)) != -1) + (void) fcntl(fd, F_SETFL, flags | O_DIRECT); +# endif + } + } +#endif /* !_WIN32 */ + + *res = fd; + return rc; +} + + #ifdef BROKEN_FDATASYNC #include #include @@ -4174,7 +4447,11 @@ mdb_env_reader_dest(void *ptr) { MDB_reader *reader = ptr; - reader->mr_pid = 0; +#ifndef _WIN32 + if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ +#endif + /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ + reader->mr_pid = 0; } #ifdef _WIN32 @@ -4414,47 +4691,30 @@ mdb_hash_enc(MDB_val *val, char *encbuf) /** Open and/or initialize the lock region for the environment. * @param[in] env The LMDB environment. - * @param[in] lpath The pathname of the file used for the lock region. + * @param[in] fname Filename + scratch area, from #mdb_fname_init(). * @param[in] mode The Unix permissions for the file, if we create it. * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive * @return 0 on success, non-zero on failure. */ static int ESECT -mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) +mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) { #ifdef _WIN32 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT #else # define MDB_ERRCODE_ROFS EROFS -#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ -# define MDB_CLOEXEC O_CLOEXEC -#else - int fdflags; -# define MDB_CLOEXEC 0 -#endif #endif int rc; off_t size, rsize; -#ifdef _WIN32 - env->me_lfd = CreateFileA(lpath, GENERIC_READ|GENERIC_WRITE, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); -#else - env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); -#endif - if (env->me_lfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); + rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); + if (rc) { + /* Omit lockfile if read-only env on read-only filesystem */ if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { return MDB_SUCCESS; } - goto fail_errno; + goto fail; } -#if ! ((MDB_CLOEXEC) || defined(_WIN32)) - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_lfd, F_SETFD, fdflags); -#endif if (!(env->me_flags & MDB_NOTLS)) { rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); @@ -4582,15 +4842,25 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) #else /* MDB_USE_POSIX_MUTEX: */ pthread_mutexattr_t mattr; - if ((rc = pthread_mutexattr_init(&mattr)) - || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) + /* Solaris needs this before initing a robust mutex. Otherwise + * it may skip the init and return EBUSY "seems someone already + * inited" or EINVAL "it was inited differently". + */ + memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); + memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); + + if ((rc = pthread_mutexattr_init(&mattr))) + goto fail; + + rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); #ifdef MDB_ROBUST_SUPPORTED - || (rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST)) + if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); #endif - || (rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr)) - || (rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr))) - goto fail; + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); pthread_mutexattr_destroy(&mattr); + if (rc) + goto fail; #endif /* _WIN32 || MDB_USE_POSIX_SEM */ env->me_txns->mti_magic = MDB_MAGIC; @@ -4634,12 +4904,6 @@ fail: return rc; } - /** The name of the lock file in the DB environment */ -#define LOCKNAME "/lock.mdb" - /** The name of the data file in the DB environment */ -#define DATANAME "/data.mdb" - /** The suffix of the lock file when no subdir is used */ -#define LOCKSUFF "-lock" /** Only a subset of the @ref mdb_env flags can be changed * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. @@ -4655,33 +4919,18 @@ fail: int ESECT mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) { - int oflags, rc, len, excl = -1; - char *lpath, *dpath; + int rc, excl = -1; + MDB_name fname; if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; - len = strlen(path); - if (flags & MDB_NOSUBDIR) { - rc = len + sizeof(LOCKSUFF) + len + 1; - } else { - rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); - } - lpath = malloc(rc); - if (!lpath) - return ENOMEM; - if (flags & MDB_NOSUBDIR) { - dpath = lpath + len + sizeof(LOCKSUFF); - sprintf(lpath, "%s" LOCKSUFF, path); - strcpy(dpath, path); - } else { - dpath = lpath + len + sizeof(LOCKNAME); - sprintf(lpath, "%s" LOCKNAME, path); - sprintf(dpath, "%s" DATANAME, path); - } - - rc = MDB_SUCCESS; flags |= env->me_flags; + + rc = mdb_fname_init(path, flags, &fname); + if (rc) + return rc; + if (flags & MDB_RDONLY) { /* silently ignore WRITEMAP when we're only getting read access */ flags &= ~MDB_WRITEMAP; @@ -4706,61 +4955,31 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode /* For RDONLY, get lockfile after we know datafile exists */ if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); + rc = mdb_env_setup_locks(env, &fname, mode, &excl); if (rc) goto leave; } -#ifdef _WIN32 - if (F_ISSET(flags, MDB_RDONLY)) { - oflags = GENERIC_READ; - len = OPEN_EXISTING; - } else { - oflags = GENERIC_READ|GENERIC_WRITE; - len = OPEN_ALWAYS; - } - mode = FILE_ATTRIBUTE_NORMAL; - env->me_fd = CreateFileA(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, - NULL, len, mode, NULL); -#else - if (F_ISSET(flags, MDB_RDONLY)) - oflags = O_RDONLY; - else - oflags = O_RDWR | O_CREAT; - - env->me_fd = open(dpath, oflags, mode); -#endif - if (env->me_fd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); + rc = mdb_fopen(env, &fname, + (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, + mode, &env->me_fd); + if (rc) goto leave; - } if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); + rc = mdb_env_setup_locks(env, &fname, mode, &excl); if (rc) goto leave; } if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { - if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { - env->me_mfd = env->me_fd; - } else { + if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { /* Synchronous fd for meta writes. Needed even with * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. */ -#ifdef _WIN32 - len = OPEN_EXISTING; - env->me_mfd = CreateFileA(dpath, oflags, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, - mode | FILE_FLAG_WRITE_THROUGH, NULL); -#else - oflags &= ~O_CREAT; - env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); -#endif - if (env->me_mfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); + rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); + if (rc) goto leave; - } } DPRINTF(("opened dbenv %p", (void *) env)); if (excl > 0) { @@ -4793,7 +5012,7 @@ leave: if (rc) { mdb_env_close0(env, excl); } - free(lpath); + mdb_fname_destroy(fname); return rc; } @@ -4837,7 +5056,7 @@ mdb_env_close0(MDB_env *env, int excl) if (env->me_map) { munmap(env->me_map, env->me_mapsize); } - if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) + if (env->me_mfd != INVALID_HANDLE_VALUE) (void) close(env->me_mfd); if (env->me_fd != INVALID_HANDLE_VALUE) (void) close(env->me_fd); @@ -5129,12 +5348,17 @@ mdb_cursor_pop(MDB_cursor *mc) mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); mc->mc_snum--; - if (mc->mc_snum) + if (mc->mc_snum) { mc->mc_top--; + } else { + mc->mc_flags &= ~C_INITIALIZED; + } } } -/** Push a page onto the top of the cursor's stack. */ +/** Push a page onto the top of the cursor's stack. + * Set #MDB_TXN_ERROR on failure. + */ static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { @@ -5154,15 +5378,17 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) } /** Find the address of the page corresponding to a given page number. - * @param[in] txn the transaction for this access. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc the cursor accessing the page. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. * @return 0 on success, non-zero on failure. */ static int -mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) +mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) { + MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; MDB_page *p = NULL; int level; @@ -5228,13 +5454,26 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) indx_t i; DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); - mdb_cassert(mc, NUMKEYS(mp) > 1); + /* Don't assert on branch pages in the FreeDB. We can get here + * while in the process of rebalancing a FreeDB branch page; we must + * let that proceed. ITS#8336 + */ + mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { i = 0; - if (flags & MDB_PS_LAST) + if (flags & MDB_PS_LAST) { i = NUMKEYS(mp) - 1; + /* if already init'd, see if we're already in right place */ + if (mc->mc_flags & C_INITIALIZED) { + if (mc->mc_ki[mc->mc_top] == i) { + mc->mc_top = mc->mc_snum++; + mp = mc->mc_pg[mc->mc_top]; + goto ready; + } + } + } } else { int exact; node = mdb_node_search(mc, key, &exact); @@ -5253,13 +5492,14 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, i < NUMKEYS(mp)); node = NODEPTR(mp, i); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) + if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) return rc; mc->mc_ki[mc->mc_top] = i; if ((rc = mdb_cursor_push(mc, mp))) return rc; +ready: if (flags & MDB_PS_MODIFY) { if ((rc = mdb_page_touch(mc)) != 0) return rc; @@ -5295,7 +5535,7 @@ mdb_page_search_lowest(MDB_cursor *mc) MDB_node *node = NODEPTR(mp, 0); int rc; - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) + if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) return rc; mc->mc_ki[mc->mc_top] = 0; @@ -5347,7 +5587,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) return MDB_NOTFOUND; if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) return MDB_INCOMPATIBLE; /* not a named DB */ - rc = mdb_node_read(mc->mc_txn, leaf, &data); + rc = mdb_node_read(&mc2, leaf, &data); if (rc) return rc; memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), @@ -5371,7 +5611,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, root > 1); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) + if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) return rc; mc->mc_snum = 1; @@ -5446,6 +5686,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) return MDB_CORRUPTED; } } + txn->mt_dirty_room++; if (!(env->me_flags & MDB_WRITEMAP)) mdb_dpage_free(env, mp); release: @@ -5467,13 +5708,13 @@ release: } /** Return the data associated with a given node. - * @param[in] txn The transaction for this operation. + * @param[in] mc The cursor for this operation. * @param[in] leaf The node being read. * @param[out] data Updated to point to the node's data. * @return 0 on success, non-zero on failure. */ static int -mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) +mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) { MDB_page *omp; /* overflow page */ pgno_t pgno; @@ -5489,7 +5730,7 @@ mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) */ data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { + if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { DPRINTF(("read overflow page %"Z"u failed", pgno)); return rc; } @@ -5563,7 +5804,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { + if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED|C_EOF); return rc; @@ -5584,14 +5825,20 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - if (mc->mc_flags & C_EOF) { + if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) return MDB_NOTFOUND; - } - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + if (!(mc->mc_flags & C_INITIALIZED)) + return mdb_cursor_first(mc, key, data); mp = mc->mc_pg[mc->mc_top]; + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5612,8 +5859,10 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); - if (mc->mc_flags & C_DEL) + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= C_DEL; goto skip; + } if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { DPUTS("=====> move to next sibling page"); @@ -5643,7 +5892,7 @@ skip: mdb_xcursor_init1(mc, leaf); } if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5665,7 +5914,12 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + return rc; + mc->mc_ki[mc->mc_top]++; + } mp = mc->mc_pg[mc->mc_top]; @@ -5692,6 +5946,8 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); + mc->mc_flags &= ~(C_EOF|C_DEL); + if (mc->mc_ki[mc->mc_top] == 0) { DPUTS("=====> move to prev sibling page"); if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { @@ -5703,8 +5959,6 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } else mc->mc_ki[mc->mc_top]--; - mc->mc_flags &= ~C_EOF; - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); @@ -5721,7 +5975,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mdb_xcursor_init1(mc, leaf); } if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5815,6 +6069,7 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } rc = 0; + mc->mc_flags &= ~C_EOF; goto set2; } } @@ -5840,6 +6095,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, } else return MDB_NOTFOUND; } + } else { + mc->mc_pg[0] = 0; } rc = mdb_page_search(mc, key, 0); @@ -5901,7 +6158,7 @@ set1: } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { MDB_val olddata; MDB_cmp_func *dcmp; - if ((rc = mdb_node_read(mc->mc_txn, leaf, &olddata)) != MDB_SUCCESS) + if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) return rc; dcmp = mc->mc_dbx->md_dcmp; #if UINT_MAX < SIZE_MAX @@ -5913,13 +6170,13 @@ set1: if (op == MDB_GET_BOTH || rc > 0) return MDB_NOTFOUND; rc = 0; - *data = olddata; } + *data = olddata; } else { if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) return rc; } } @@ -5968,7 +6225,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (rc) return rc; } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) return rc; } } @@ -5986,16 +6243,13 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (!(mc->mc_flags & C_EOF)) { - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (rc != MDB_SUCCESS) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; mc->mc_flags |= C_INITIALIZED|C_EOF; leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); @@ -6013,7 +6267,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (rc) return rc; } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) return rc; } } @@ -6057,11 +6311,9 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_GET_KEY(leaf, key); if (data) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (mc->mc_flags & C_DEL) - mdb_xcursor_init1(mc, leaf); rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); } else { - rc = mdb_node_read(mc->mc_txn, leaf, data); + rc = mdb_node_read(mc, leaf, data); } } } @@ -6111,10 +6363,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_INCOMPATIBLE; break; } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); if (rc == MDB_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDB_cursor *mx; @@ -6129,24 +6378,38 @@ fetchm: } } break; + case MDB_PREV_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_last(mc, key, data); + else + rc = MDB_SUCCESS; + if (rc == MDB_SUCCESS) { + MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + rc = mdb_cursor_sibling(mx, 0); + if (rc == MDB_SUCCESS) + goto fetchm; + } else { + rc = MDB_NOTFOUND; + } + } + break; case MDB_NEXT: case MDB_NEXT_DUP: case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, op); + rc = mdb_cursor_next(mc, key, data, op); break; case MDB_PREV: case MDB_PREV_DUP: case MDB_PREV_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) - break; - mc->mc_flags |= C_INITIALIZED; - mc->mc_ki[mc->mc_top]++; - } rc = mdb_cursor_prev(mc, key, data, op); break; case MDB_FIRST: @@ -6163,11 +6426,16 @@ fetchm: rc = MDB_INCOMPATIBLE; break; } + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + rc = MDB_NOTFOUND; + break; + } { MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { MDB_GET_KEY(leaf, key); - rc = mdb_node_read(mc->mc_txn, leaf, data); + rc = mdb_node_read(mc, leaf, data); break; } } @@ -6204,7 +6472,8 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { + /* Touch DB record of named DB */ MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -6388,16 +6657,18 @@ fix_parent: * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short top = mc->mc_top; + unsigned short dtop = 1; mc->mc_top--; /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { mc->mc_top--; + dtop++; + } if (mc->mc_ki[mc->mc_top]) rc2 = mdb_update_key(mc, key); else rc2 = MDB_SUCCESS; - mc->mc_top = top; + mc->mc_top += dtop; if (rc2) return rc2; } @@ -6433,7 +6704,7 @@ more: #endif /* does data match? */ if (!dcmp(data, &olddata)) { - if (flags & MDB_NODUPDATA) + if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) return MDB_KEYEXIST; /* overwrite it */ goto current; @@ -6549,7 +6820,7 @@ current: int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); memcpy(&pg, olddata.mv_data, sizeof(pg)); - if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) + if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0) return rc2; ovpages = omp->mp_pages; @@ -6578,10 +6849,16 @@ current: return ENOMEM; id2.mid = pg; id2.mptr = np; + /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdb_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the + * parent txn, in case the user peeks at MDB_RESERVEd + * or unused parts. Some users treat ovpages specially. + */ if (!(flags & MDB_RESERVE)) { - /* Copy end of page, adjusting alignment so + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); @@ -6634,7 +6911,7 @@ new_sub: } else { /* There is room already in this leaf page. */ rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (rc == 0 && insert_key) { + if (rc == 0) { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; @@ -6646,10 +6923,12 @@ new_sub: m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; + if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { m3->mc_ki[i]++; } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); } } } @@ -6691,6 +6970,7 @@ put_sub: MDB_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; + int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; @@ -6698,10 +6978,8 @@ put_sub: if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init2(m2, mx, new_dupdata); - } else if (!insert_key) { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); - if (!(n2->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } else if (!insert_key && m2->mc_ki[i] < nkeys) { + XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); } } } @@ -6781,6 +7059,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (flags & MDB_NODUPDATA) { /* mdb_cursor_del0() will subtract the final entry */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); @@ -6805,19 +7084,20 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[mc->mc_top] == mp) { - if (m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } else { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); - if (!(n2->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + MDB_node *n2 = leaf; + if (m2->mc_ki[mc->mc_top] >= NUMKEYS(mp)) continue; + if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { + n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); + if (n2->mn_flags & F_SUBDATA) continue; } + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); } } } mc->mc_db->md_entries--; - mc->mc_flags |= C_DEL; return rc; + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } /* otherwise fall thru and delete the sub-DB */ } @@ -6841,7 +7121,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) pgno_t pg; memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + if ((rc = mdb_page_get(mc, pg, &omp, NULL)) || (rc = mdb_ovpage_free(mc, omp))) goto fail; } @@ -6855,6 +7135,7 @@ fail: } /** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc a cursor on the database being added to. * @param[in] flags flags defining what type of page is being allocated. * @param[in] num the number of pages to allocate. This is usually 1, @@ -6940,6 +7221,7 @@ mdb_branch_size(MDB_env *env, MDB_val *key) } /** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc The cursor for this operation. * @param[in] indx The index on the page where the new node should be added. * @param[in] key The key for the new node. @@ -7249,7 +7531,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) } DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) mx->mx_dbx.md_cmp = mdb_cmp_clong; @@ -7275,7 +7557,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; #if UINT_MAX < SIZE_MAX mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; #endif @@ -7384,9 +7666,15 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) if (!(mc->mc_flags & C_INITIALIZED)) return EINVAL; - if (!mc->mc_snum || (mc->mc_flags & C_EOF)) + if (!mc->mc_snum) return MDB_NOTFOUND; + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { *countp = 1; @@ -7428,6 +7716,7 @@ mdb_cursor_dbi(MDB_cursor *mc) } /** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. @@ -7504,10 +7793,26 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) static void mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); +/** Perform \b act while tracking temporary cursor \b mn */ +#define WITH_CURSOR_TRACKING(mn, act) do { \ + MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + if ((mn).mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + tracked = &dummy; \ + } else { \ + tracked = &(mn); \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ +} while (0) + /** Move a node from csrc to cdst. */ static int -mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) +mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { MDB_node *srcnode; MDB_val key, data; @@ -7559,6 +7864,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) data.mv_size = NODEDSZ(srcnode); data.mv_data = NODEDATA(srcnode); } + mn.mc_xcursor = NULL; if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { unsigned int snum = cdst->mc_snum; MDB_node *s2; @@ -7609,13 +7915,15 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) mps = csrc->mc_pg[csrc->mc_top]; /* If we're adding on the left, bump others up */ - if (!cdst->mc_ki[csrc->mc_top]) { + if (fromleft) { mpd = cdst->mc_pg[csrc->mc_top]; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; if (m3 != cdst && m3->mc_pg[csrc->mc_top] == mpd && m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { @@ -7628,6 +7936,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; m3->mc_ki[csrc->mc_top-1]++; } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); } } else /* Adding on the right, bump others down */ @@ -7638,6 +7948,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) else m3 = m2; if (m3 == csrc) continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; if (m3->mc_pg[csrc->mc_top] == mps) { if (!m3->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; @@ -7646,6 +7958,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) } else { m3->mc_ki[csrc->mc_top]--; } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); } } } @@ -7667,7 +7981,10 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) mdb_cursor_copy(csrc, &mn); mn.mc_snum--; mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) return rc; } if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { @@ -7695,7 +8012,10 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) mdb_cursor_copy(cdst, &mn); mn.mc_snum--; mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) return rc; } if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { @@ -7764,6 +8084,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) MDB_cursor mn; MDB_node *s2; mdb_cursor_copy(csrc, &mn); + mn.mc_xcursor = NULL; /* must find the lowest key below src */ rc = mdb_page_search_lowest(&mn); if (rc) @@ -7835,7 +8156,12 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3->mc_pg[top] = pdst; m3->mc_ki[top] += nkeys; m3->mc_ki[top-1] = cdst->mc_ki[top-1]; + } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && + m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { + m3->mc_ki[top-1]--; } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); } } { @@ -7884,7 +8210,7 @@ static int mdb_rebalance(MDB_cursor *mc) { MDB_node *node; - int rc; + int rc, fromleft; unsigned int ptop, minkeys, thresh; MDB_cursor mn; indx_t oldki; @@ -7935,7 +8261,8 @@ mdb_rebalance(MDB_cursor *mc) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (m3->mc_snum < mc->mc_snum) continue; + if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + continue; if (m3->mc_pg[0] == mp) { m3->mc_snum = 0; m3->mc_top = 0; @@ -7950,7 +8277,7 @@ mdb_rebalance(MDB_cursor *mc) if (rc) return rc; mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); + rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); if (rc) return rc; mc->mc_db->md_depth--; @@ -7971,6 +8298,8 @@ mdb_rebalance(MDB_cursor *mc) else m3 = m2; if (m3 == mc) continue; + if (!(m3->mc_flags & C_INITIALIZED)) + continue; if (m3->mc_pg[0] == mp) { for (i=0; imc_db->md_depth; i++) { m3->mc_pg[i] = m3->mc_pg[i+1]; @@ -8009,22 +8338,24 @@ mdb_rebalance(MDB_cursor *mc) DPUTS("reading right neighbor"); mn.mc_ki[ptop]++; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); if (rc) return rc; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + fromleft = 0; } else { /* There is at least one neighbor to the left. */ DPUTS("reading left neighbor"); mn.mc_ki[ptop]--; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); if (rc) return rc; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; mc->mc_ki[mc->mc_top] = 0; + fromleft = 1; } DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", @@ -8036,32 +8367,20 @@ mdb_rebalance(MDB_cursor *mc) * (A branch page must never have less than 2 keys.) */ if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { - rc = mdb_node_move(&mn, mc); - if (!mc->mc_ki[mc->mc_top]) { + rc = mdb_node_move(&mn, mc, fromleft); + if (fromleft) { /* if we inserted on left, bump position up */ oldki++; } } else { - if (mc->mc_ki[ptop] == 0) { + if (!fromleft) { rc = mdb_page_merge(&mn, mc); } else { - MDB_cursor dummy; oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; /* We want mdb_rebalance to find mn when doing fixups */ - if (mc->mc_flags & C_SUB) { - dummy.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; - mc->mc_txn->mt_cursors[mc->mc_dbi] = &dummy; - dummy.mc_xcursor = (MDB_xcursor *)&mn; - } else { - mn.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; - mc->mc_txn->mt_cursors[mc->mc_dbi] = &mn; - } - rc = mdb_page_merge(mc, &mn); - if (mc->mc_flags & C_SUB) - mc->mc_txn->mt_cursors[mc->mc_dbi] = dummy.mc_next; - else - mc->mc_txn->mt_cursors[mc->mc_dbi] = mn.mc_next; + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_merge(mc, &mn)); mdb_cursor_copy(&mn, mc); } mc->mc_flags &= ~C_EOF; @@ -8094,13 +8413,18 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= ki) { + if (m3->mc_ki[mc->mc_top] == ki) { m3->mc_flags |= C_DEL; - if (m3->mc_ki[mc->mc_top] > ki) - m3->mc_ki[mc->mc_top]--; - else if (mc->mc_db->md_flags & MDB_DUPSORT) - m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } } @@ -8126,11 +8450,32 @@ mdb_cursor_del0(MDB_cursor *mc) continue; if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { + MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. + * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. + */ + if (node->mn_flags & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else { + mdb_xcursor_init1(m3, node); + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + } } } } @@ -8205,6 +8550,7 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, } /** Split a page and insert a new node. + * Set #MDB_TXN_ERROR on failure. * @param[in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. @@ -8246,12 +8592,19 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_pad = mp->mp_pad; DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); - if (mc->mc_snum < 2) { + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdb_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. + */ + if (mc->mc_top < 1) { if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) goto done; /* shift current top to make room for new parent */ - mc->mc_pg[1] = mc->mc_pg[0]; - mc->mc_ki[1] = mc->mc_ki[0]; + for (i=mc->mc_snum; i>0; i--) { + mc->mc_pg[i] = mc->mc_pg[i-1]; + mc->mc_ki[i] = mc->mc_ki[i-1]; + } mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; @@ -8267,16 +8620,16 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_db->md_depth--; goto done; } - mc->mc_snum = 2; - mc->mc_top = 1; + mc->mc_snum++; + mc->mc_top++; ptop = 0; } else { ptop = mc->mc_top-1; DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); } - mc->mc_flags |= C_SPLITTING; mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; @@ -8326,8 +8679,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_lower += sizeof(indx_t); rp->mp_upper -= ksize - sizeof(indx_t); mc->mc_ki[mc->mc_top] = x; - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; } } else { int psize, nsize, k; @@ -8420,21 +8771,18 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* Copy separator key to the parent. */ if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + int snum = mc->mc_snum; mn.mc_snum--; mn.mc_top--; did_split = 1; - rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); + /* We want other splits to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); if (rc) goto done; /* root split? */ - if (mn.mc_snum == mc->mc_snum) { - mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; - mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; - mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; - mc->mc_snum++; - mc->mc_top++; + if (mc->mc_snum > snum) { ptop++; } /* Right page might now have changed parent. @@ -8460,7 +8808,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); mn.mc_top++; } - mc->mc_flags ^= C_SPLITTING; if (rc != MDB_SUCCESS) { goto done; } @@ -8530,11 +8877,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* reset back to original page */ if (newindx < split_indx) { mc->mc_pg[mc->mc_top] = mp; - if (nflags & MDB_RESERVE) { - node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } } else { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[ptop]++; @@ -8548,13 +8890,32 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno } } } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } } { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; - int fixup = NUMKEYS(mp); + nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -8565,16 +8926,17 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno continue; if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; - if (m3->mc_flags & C_SPLITTING) - continue; if (new_root) { int k; + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; /* root split */ for (k=new_root; k>=0; k--) { m3->mc_ki[k+1] = m3->mc_ki[k]; m3->mc_pg[k+1] = m3->mc_pg[k]; } - if (m3->mc_ki[0] >= split_indx) { + if (m3->mc_ki[0] >= nkeys) { m3->mc_ki[0] = 1; } else { m3->mc_ki[0] = 0; @@ -8586,15 +8948,20 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= fixup) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= fixup; - m3->mc_ki[ptop] = mn.mc_ki[ptop]; + m3->mc_ki[mc->mc_top] -= nkeys; + for (i=0; imc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } } } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { m3->mc_ki[ptop]++; } + if (XCURSOR_INITED(m3) && IS_LEAF(mp)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); @@ -8613,6 +8980,7 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, { MDB_cursor mc; MDB_xcursor mx; + int rc; if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) return EINVAL; @@ -8624,29 +8992,36 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_put(&mc, key, data, flags); + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_put(&mc, key, data, flags); + txn->mt_cursors[dbi] = mc.mc_next; + return rc; } #ifndef MDB_WBUF #define MDB_WBUF (1024*1024) #endif +#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ - /** State needed for a compacting copy. */ + /** State needed for a double-buffering compacting copy. */ typedef struct mdb_copy { + MDB_env *mc_env; + MDB_txn *mc_txn; pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; + pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ char *mc_wbuf[2]; char *mc_over[2]; - MDB_env *mc_env; - MDB_txn *mc_txn; int mc_wlen[2]; int mc_olen[2]; pgno_t mc_next_pgno; HANDLE mc_fd; - int mc_status; - volatile int mc_new; - int mc_toggle; - + int mc_toggle; /**< Buffer number in provider */ + int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ + /** Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; } mdb_copy; /** Dedicated writer thread for compacting copy. */ @@ -8662,26 +9037,38 @@ mdb_env_copythr(void *arg) #else int len; #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#ifdef SIGPIPE + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) + my->mc_error = rc; +#endif #endif pthread_mutex_lock(&my->mc_mutex); - my->mc_new = 0; - pthread_cond_signal(&my->mc_cond); for(;;) { while (!my->mc_new) pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new < 0) { - my->mc_new = 0; + if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ break; - } - my->mc_new = 0; wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: - while (wsize > 0) { + rc = MDB_SUCCESS; + while (wsize > 0 && !my->mc_error) { DO_WRITE(rc, my->mc_fd, ptr, wsize, len); if (!rc) { rc = ErrCode(); +#if defined(SIGPIPE) && !defined(_WIN32) + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + } +#endif break; } else if (len > 0) { rc = MDB_SUCCESS; @@ -8694,8 +9081,7 @@ again: } } if (rc) { - my->mc_status = rc; - break; + my->mc_error = rc; } /* If there's an overflow page tail, write it too */ if (my->mc_olen[toggle]) { @@ -8706,39 +9092,45 @@ again: } my->mc_wlen[toggle] = 0; toggle ^= 1; + /* Return the empty buffer to provider */ + my->mc_new--; pthread_cond_signal(&my->mc_cond); } - pthread_cond_signal(&my->mc_cond); pthread_mutex_unlock(&my->mc_mutex); return (THREAD_RET)0; #undef DO_WRITE } - /** Tell the writer thread there's a buffer ready to write */ + /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. + * + * @param[in] my control structure. + * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). + */ static int ESECT -mdb_env_cthr_toggle(mdb_copy *my, int st) +mdb_env_cthr_toggle(mdb_copy *my, int adjust) { - int toggle = my->mc_toggle ^ 1; pthread_mutex_lock(&my->mc_mutex); - if (my->mc_status) { - pthread_mutex_unlock(&my->mc_mutex); - return my->mc_status; - } - while (my->mc_new == 1) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - my->mc_new = st; - my->mc_toggle = toggle; + my->mc_new += adjust; pthread_cond_signal(&my->mc_cond); + while (my->mc_new & 2) /* both buffers in use */ + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); pthread_mutex_unlock(&my->mc_mutex); - return 0; + + my->mc_toggle ^= (adjust & 1); + /* Both threads reset mc_wlen, to be safe from threading errors */ + my->mc_wlen[my->mc_toggle] = 0; + return my->mc_error; } - /** Depth-first tree traversal for compacting copy. */ + /** Depth-first tree traversal for compacting copy. + * @param[in] my control structure. + * @param[in,out] pg database root. + * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. + */ static int ESECT mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { - MDB_cursor mc; - MDB_txn *txn = my->mc_txn; + MDB_cursor mc = {0}; MDB_node *ni; MDB_page *mo, *mp, *leaf; char *buf, *ptr; @@ -8750,10 +9142,9 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) return MDB_SUCCESS; mc.mc_snum = 1; - mc.mc_top = 0; - mc.mc_txn = txn; + mc.mc_txn = my->mc_txn; - rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL); + rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); if (rc) return rc; rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); @@ -8797,7 +9188,8 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) } memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(txn, pg, &omp, NULL); + memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); + rc = mdb_page_get(&mc, pg, &omp, NULL); if (rc) goto done; if (my->mc_wlen[toggle] >= MDB_WBUF) { @@ -8819,7 +9211,6 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) goto done; toggle = my->mc_toggle; } - memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); } else if (ni->mn_flags & F_SUBDATA) { MDB_db db; @@ -8848,7 +9239,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) again: ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); pg = NODEPGNO(ni); - rc = mdb_page_get(txn, pg, &mp, NULL); + rc = mdb_page_get(&mc, pg, &mp, NULL); if (rc) goto done; mc.mc_top++; @@ -8897,47 +9288,56 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) { MDB_meta *mm; MDB_page *mp; - mdb_copy my; + mdb_copy my = {0}; MDB_txn *txn = NULL; pthread_t thr; - int rc; + pgno_t root, new_root; + int rc = MDB_SUCCESS; #ifdef _WIN32 - my.mc_mutex = CreateMutex(NULL, FALSE, NULL); - my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL); + if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || + !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { + rc = ErrCode(); + goto done; + } my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); - if (my.mc_wbuf[0] == NULL) - return errno; + if (my.mc_wbuf[0] == NULL) { + /* _aligned_malloc() sets errno, but we use Windows error codes */ + rc = ERROR_NOT_ENOUGH_MEMORY; + goto done; + } #else - pthread_mutex_init(&my.mc_mutex, NULL); - pthread_cond_init(&my.mc_cond, NULL); + if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) + return rc; + if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) + goto done2; #ifdef HAVE_MEMALIGN my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); - if (my.mc_wbuf[0] == NULL) - return errno; + if (my.mc_wbuf[0] == NULL) { + rc = errno; + goto done; + } #else - rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); - if (rc) - return rc; + { + void *p; + if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) + goto done; + my.mc_wbuf[0] = p; + } #endif #endif memset(my.mc_wbuf[0], 0, MDB_WBUF*2); my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; - my.mc_wlen[0] = 0; - my.mc_wlen[1] = 0; - my.mc_olen[0] = 0; - my.mc_olen[1] = 0; my.mc_next_pgno = NUM_METAS; - my.mc_status = 0; - my.mc_new = 1; - my.mc_toggle = 0; my.mc_env = env; my.mc_fd = fd; - THREAD_CREATE(thr, mdb_env_copythr, &my); + rc = THREAD_CREATE(thr, mdb_env_copythr, &my); + if (rc) + goto done; rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); if (rc) - return rc; + goto finish; mp = (MDB_page *)my.mc_wbuf[0]; memset(mp, 0, NUM_METAS * env->me_psize); @@ -8953,57 +9353,64 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) *(MDB_meta *)METADATA(mp) = *mm; mm = (MDB_meta *)METADATA(mp); - /* Count the number of free pages, subtract from lastpg to find - * number of active pages - */ - { + /* Set metapage 1 with current main DB */ + root = new_root = txn->mt_dbs[MAIN_DBI].md_root; + if (root != P_INVALID) { + /* Count free pages + freeDB pages. Subtract from last_pg + * to find the new last_pg, which also becomes the new root. + */ MDB_ID freecount = 0; MDB_cursor mc; MDB_val key, data; mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; + if (rc != MDB_NOTFOUND) + goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages + txn->mt_dbs[FREE_DBI].md_overflow_pages; - /* Set metapage 1 */ - mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; + new_root = txn->mt_next_pgno - 1 - freecount; + mm->mm_last_pg = new_root; mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - if (mm->mm_last_pg > NUM_METAS-1) { - mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; - mm->mm_txnid = 1; - } else { - mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; - } + mm->mm_dbs[MAIN_DBI].md_root = new_root; + } else { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. + */ + mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; } + if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { + mm->mm_txnid = 1; /* use metapage 1 */ + } + my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); - if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) - rc = mdb_env_cthr_toggle(&my, 1); - mdb_env_cthr_toggle(&my, -1); - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - THREAD_FINISH(thr); + rc = mdb_env_cwalk(&my, &root, 0); + if (rc == MDB_SUCCESS && root != new_root) { + rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ + } +finish: + if (rc) + my.mc_error = rc; + mdb_env_cthr_toggle(&my, 1 | MDB_EOF); + rc = THREAD_FINISH(thr); mdb_txn_abort(txn); + +done: #ifdef _WIN32 - CloseHandle(my.mc_cond); - CloseHandle(my.mc_mutex); - _aligned_free(my.mc_wbuf[0]); + if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); + if (my.mc_cond) CloseHandle(my.mc_cond); + if (my.mc_mutex) CloseHandle(my.mc_mutex); #else + free(my.mc_wbuf[0]); pthread_cond_destroy(&my.mc_cond); +done2: pthread_mutex_destroy(&my.mc_mutex); - free(my.mc_wbuf[0]); #endif - return rc; + return rc ? rc : my.mc_error; } /** Copy environment as-is. */ @@ -9013,7 +9420,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) MDB_txn *txn = NULL; mdb_mutexref_t wmutex = NULL; int rc; - size_t wsize; + size_t wsize, w3; char *ptr; #ifdef _WIN32 DWORD len, w2; @@ -9072,15 +9479,15 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) if (rc) goto leave; - w2 = txn->mt_next_pgno * env->me_psize; + w3 = txn->mt_next_pgno * env->me_psize; { size_t fsize = 0; if ((rc = mdb_fsize(env->me_fd, &fsize))) goto leave; - if (w2 > fsize) - w2 = fsize; + if (w3 > fsize) + w3 = fsize; } - wsize = w2 - wsize; + wsize = w3 - wsize; while (wsize > 0) { if (wsize > MAX_WRITE) w2 = MAX_WRITE; @@ -9124,60 +9531,20 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) int ESECT mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) { - int rc, len; - char *lpath; + int rc; + MDB_name fname; HANDLE newfd = INVALID_HANDLE_VALUE; - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } - - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. - */ -#ifdef _WIN32 - newfd = CreateFileA(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, - FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); -#else - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); -#endif - if (newfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } - - if (env->me_psize >= env->me_os_psize) { -#ifdef O_DIRECT - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); -#endif -#ifdef F_NOCACHE /* __APPLE__ */ - rc = fcntl(newfd, F_NOCACHE, 1); - if (rc) { - rc = ErrCode(); - goto leave; - } -#endif + rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); + if (rc == MDB_SUCCESS) { + rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); + mdb_fname_destroy(fname); } - - rc = mdb_env_copyfd2(env, newfd, flags); - -leave: - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd != INVALID_HANDLE_VALUE) + if (rc == MDB_SUCCESS) { + rc = mdb_env_copyfd2(env, newfd, flags); if (close(newfd) < 0 && rc == MDB_SUCCESS) rc = ErrCode(); - + } return rc; } @@ -9337,6 +9704,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_db dummy; int rc, dbflag, exact; unsigned int unused = 0, seq; + char *namedup; size_t len; if (flags & ~VALID_FLAGS) @@ -9398,21 +9766,35 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) return MDB_INCOMPATIBLE; - } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { - /* Create if requested */ + } else { + if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) + return rc; + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + } + + /* Done here so we cannot fail after creating a new DB */ + if ((namedup = strdup(name)) == NULL) + return ENOMEM; + + if (rc) { + /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ data.mv_size = sizeof(MDB_db); data.mv_data = &dummy; memset(&dummy, 0, sizeof(dummy)); dummy.md_root = P_INVALID; dummy.md_flags = flags & PERSISTENT_FLAGS; - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + WITH_CURSOR_TRACKING(mc, + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); dbflag |= DB_DIRTY; } - /* OK, got info, add to table */ - if (rc == MDB_SUCCESS) { + if (rc) { + free(namedup); + } else { + /* Got info, register DBI in this txn */ unsigned int slot = unused ? unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = strdup(name); + txn->mt_dbxs[slot].md_name.mv_data = namedup; txn->mt_dbxs[slot].md_name.mv_size = len; txn->mt_dbxs[slot].md_rel = NULL; txn->mt_dbflags[slot] = dbflag; @@ -9495,8 +9877,11 @@ mdb_drop0(MDB_cursor *mc, int subs) /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. * This also avoids any P_LEAF2 pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no overflow + * pages, omit scanning leaves. */ - if (mc->mc_flags & C_SUB) + if ((mc->mc_flags & C_SUB) || + (!subs && !mc->mc_db->md_overflow_pages)) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); @@ -9510,7 +9895,7 @@ mdb_drop0(MDB_cursor *mc, int subs) MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(txn, pg, &omp, NULL); + rc = mdb_page_get(mc, pg, &omp, NULL); if (rc != 0) goto done; mdb_cassert(mc, IS_OVERFLOW(omp)); @@ -9518,6 +9903,9 @@ mdb_drop0(MDB_cursor *mc, int subs) pg, omp->mp_pages); if (rc) goto done; + mc->mc_db->md_overflow_pages -= omp->mp_pages; + if (!mc->mc_db->md_overflow_pages && !subs) + break; } else if (subs && (ni->mn_flags & F_SUBDATA)) { mdb_xcursor_init1(mc, ni); rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); @@ -9525,6 +9913,8 @@ mdb_drop0(MDB_cursor *mc, int subs) goto done; } } + if (!subs && !mc->mc_db->md_overflow_pages) + goto pop; } else { if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) goto done; @@ -9546,6 +9936,7 @@ mdb_drop0(MDB_cursor *mc, int subs) /* no more siblings, go back to beginning * of previous level. */ +pop: mdb_cursor_pop(mc); mc->mc_ki[0] = 0; for (i=1; imc_snum; i++) { @@ -9562,6 +9953,7 @@ done: } else if (rc == MDB_NOTFOUND) { rc = MDB_SUCCESS; } + mc->mc_flags &= ~C_INITIALIZED; return rc; } @@ -9746,7 +10138,7 @@ mdb_reader_check(MDB_env *env, int *dead) return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; } -/** As #mdb_reader_check(). rlocked = . */ +/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ static int ESECT mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { @@ -9849,4 +10241,32 @@ mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) return rc; } #endif /* MDB_ROBUST_SUPPORTED */ + +#if defined(_WIN32) +/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ +static int ESECT +utf8_to_utf16(const char *src, MDB_name *dst, int xtra) +{ + int rc, need = 0; + wchar_t *result = NULL; + for (;;) { /* malloc result, then fill it in */ + need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); + if (!need) { + rc = ErrCode(); + free(result); + return rc; + } + if (!result) { + result = malloc(sizeof(wchar_t) * (need + xtra)); + if (!result) + return ENOMEM; + continue; + } + dst->mn_alloced = 1; + dst->mn_len = need - 1; + dst->mn_val = result; + return MDB_SUCCESS; + } +} +#endif /* defined(_WIN32) */ /** @} */