* BerkeleyDB API, but much simplified.
*/
/*
- * Copyright 2011-2016 Howard Chu, Symas Corp.
+ * Copyright 2011-2017 Howard Chu, Symas Corp.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
/* Most platforms have posix_memalign, older may only have memalign */
#define HAVE_MEMALIGN 1
#include <malloc.h>
+/* On Solaris, we need the POSIX sigwait function */
+#if defined (__sun)
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
#endif
#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
#include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
#endif
-#if defined(__APPLE__) || defined (BSD)
+#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__)
# define MDB_USE_POSIX_SEM 1
# define MDB_FDATASYNC fsync
#elif defined(ANDROID)
#ifndef _WIN32
#include <pthread.h>
+#include <signal.h>
#ifdef MDB_USE_POSIX_SEM
# define MDB_USE_HASH 1
#include <semaphore.h>
}
#else /* MDB_USE_POSIX_MUTEX: */
- /** Shared mutex/semaphore as it is stored (mdb_mutex_t), and as
- * local variables keep it (mdb_mutexref_t).
+ /** Shared mutex/semaphore as the original is stored.
*
- * When #mdb_mutexref_t is a pointer declaration and #mdb_mutex_t is
- * not, then it is array[size 1] so it can be assigned to a pointer.
- * @{
+ * Not for copies. Instead it can be assigned to an #mdb_mutexref_t.
+ * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it
+ * is array[size 1] so it can be assigned to the pointer.
*/
-typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t;
- /* @} */
+typedef pthread_mutex_t mdb_mutex_t[1];
+ /** Reference to an #mdb_mutex_t */
+typedef pthread_mutex_t *mdb_mutexref_t;
/** Lock the reader or writer mutex.
* Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX().
*/
/** Header for a single key/data pair within a page.
* Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
* We guarantee 2-byte alignment for 'MDB_node's.
+ *
+ * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child
+ * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used
+ * for pgno. (Branch nodes have no flags). Lo and hi are in host byte
+ * order in case some accesses can be optimized to 32-bit word access.
+ *
+ * Leaf node flags describe node contents. #F_BIGDATA says the node's
+ * data part is the page number of an overflow page with actual data.
+ * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in
+ * a sub-page/sub-database, and named databases (just #F_SUBDATA).
*/
typedef struct MDB_node {
- /** lo and hi are used for data size on leaf nodes and for
- * child pgno on branch nodes. On 64 bit platforms, flags
- * is also used for pgno. (Branch nodes have no flags).
- * They are in host byte order in case that lets some
- * accesses be optimized into a 32-bit word access.
- */
+ /** part of data size or pgno
+ * @{ */
#if BYTE_ORDER == LITTLE_ENDIAN
- unsigned short mn_lo, mn_hi; /**< part of data size or pgno */
+ unsigned short mn_lo, mn_hi;
#else
unsigned short mn_hi, mn_lo;
#endif
+ /** @} */
/** @defgroup mdb_node Node Flags
* @ingroup internal
* Flags for node headers.
/** Allocate memory for a page.
* Re-use old malloc'd pages first for singletons, otherwise just malloc.
+ * Set #MDB_TXN_ERROR on failure.
*/
static MDB_page *
mdb_page_malloc(MDB_txn *txn, unsigned num)
}
/** Allocate page numbers and memory for writing. Maintain me_pglast,
- * me_pghead and mt_next_pgno.
+ * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure.
*
* If there are free pages available from older transactions, they
* are re-used first. Otherwise allocate a new page at mt_next_pgno.
np = m2.mc_pg[m2.mc_top];
leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)
- return rc;
+ goto fail;
idl = (MDB_ID *) data.mv_data;
i = idl[0];
}
/** Touch a page: make it dirty and re-insert into tree with updated pgno.
+ * Set #MDB_TXN_ERROR on failure.
* @param[in] mc cursor pointing to the page to be touched
* @return 0 on success, non-zero on failure.
*/
}
}
-/** Push a page onto the top of the cursor's stack. */
+/** Push a page onto the top of the cursor's stack.
+ * Set #MDB_TXN_ERROR on failure.
+ */
static int
mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
{
}
/** Find the address of the page corresponding to a given page number.
+ * Set #MDB_TXN_ERROR on failure.
* @param[in] mc the cursor accessing the page.
* @param[in] pgno the page number for the page to retrieve.
* @param[out] ret address of a pointer where the page's address will be stored.
if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
i = 0;
- if (flags & MDB_PS_LAST)
+ if (flags & MDB_PS_LAST) {
i = NUMKEYS(mp) - 1;
+ /* if already init'd, see if we're already in right place */
+ if (mc->mc_flags & C_INITIALIZED) {
+ if (mc->mc_ki[mc->mc_top] == i) {
+ mc->mc_top = mc->mc_snum++;
+ mp = mc->mc_pg[mc->mc_top];
+ goto ready;
+ }
+ }
+ }
} else {
int exact;
node = mdb_node_search(mc, key, &exact);
if ((rc = mdb_cursor_push(mc, mp)))
return rc;
+ready:
if (flags & MDB_PS_MODIFY) {
if ((rc = mdb_page_touch(mc)) != 0)
return rc;
MDB_node *leaf;
int rc;
- if ((mc->mc_flags & C_EOF) ||
- ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP)) {
+ if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP))
return MDB_NOTFOUND;
- }
+
if (!(mc->mc_flags & C_INITIALIZED))
return mdb_cursor_first(mc, key, data);
mp = mc->mc_pg[mc->mc_top];
+ if (mc->mc_flags & C_EOF) {
+ if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1)
+ return MDB_NOTFOUND;
+ mc->mc_flags ^= C_EOF;
+ }
+
if (mc->mc_db->md_flags & MDB_DUPSORT) {
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
if (mc->mc_xcursor)
mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
- if (!(mc->mc_flags & C_EOF)) {
-
- if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
- rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
- if (rc != MDB_SUCCESS)
- return rc;
- }
- mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
-
+ if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
+ rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
+ if (rc != MDB_SUCCESS)
+ return rc;
}
+ mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
+
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
mc->mc_flags |= C_INITIALIZED|C_EOF;
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
}
/** Allocate and initialize new pages for a database.
+ * Set #MDB_TXN_ERROR on failure.
* @param[in] mc a cursor on the database being added to.
* @param[in] flags flags defining what type of page is being allocated.
* @param[in] num the number of pages to allocate. This is usually 1,
}
/** Add a node to the page pointed to by the cursor.
+ * Set #MDB_TXN_ERROR on failure.
* @param[in] mc The cursor for this operation.
* @param[in] indx The index on the page where the new node should be added.
* @param[in] key The key for the new node.
if (!(mc->mc_flags & C_INITIALIZED))
return EINVAL;
- if (!mc->mc_snum || (mc->mc_flags & C_EOF))
+ if (!mc->mc_snum)
return MDB_NOTFOUND;
+ if (mc->mc_flags & C_EOF) {
+ if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
+ return MDB_NOTFOUND;
+ mc->mc_flags ^= C_EOF;
+ }
+
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
*countp = 1;
}
/** Replace the key for a branch node with a new key.
+ * Set #MDB_TXN_ERROR on failure.
* @param[in] mc Cursor pointing to the node to operate on.
* @param[in] key The new key to use.
* @return 0 on success, non-zero on failure.
}
if (mc->mc_db->md_flags & MDB_DUPSORT) {
MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
- /* If this node is a fake page, it needs to be reinited
- * because its data has moved. But just reset mc_pg[0]
- * if the xcursor is already live.
+ /* If this node has dupdata, it may need to be reinited
+ * because its data has moved.
+ * If the xcursor was not initd it must be reinited.
+ * Else if node points to a subDB, nothing is needed.
+ * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset.
*/
- if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) {
- if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)
- m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
- else
+ if (node->mn_flags & F_DUPDATA) {
+ if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
+ if (!(node->mn_flags & F_SUBDATA))
+ m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
+ } else
mdb_xcursor_init1(m3, node);
}
}
}
/** Split a page and insert a new node.
+ * Set #MDB_TXN_ERROR on failure.
* @param[in,out] mc Cursor pointing to the page and desired insertion index.
* The cursor will be updated to point to the actual page and index where
* the node got inserted after the split.
#else
int len;
#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
+#ifdef SIGPIPE
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, SIGPIPE);
+ if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0)
+ my->mc_error = rc;
+#endif
#endif
pthread_mutex_lock(&my->mc_mutex);
DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
if (!rc) {
rc = ErrCode();
+#if defined(SIGPIPE) && !defined(_WIN32)
+ if (rc == EPIPE) {
+ /* Collect the pending SIGPIPE, otherwise at least OS X
+ * gives it to the process on thread-exit (ITS#8504).
+ */
+ int tmp;
+ sigwait(&set, &tmp);
+ }
+#endif
break;
} else if (len > 0) {
rc = MDB_SUCCESS;
memset(&dummy, 0, sizeof(dummy));
dummy.md_root = P_INVALID;
dummy.md_flags = flags & PERSISTENT_FLAGS;
- rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
+ WITH_CURSOR_TRACKING(mc,
+ rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA));
dbflag |= DB_DIRTY;
}
return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS;
}
-/** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>. */
+/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */
static int ESECT
mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
{