#include <assert.h>
#include <errno.h>
+#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "mdb.h"
#include "midl.h"
+#if (__BYTE_ORDER == __LITTLE_ENDIAN) == (__BYTE_ORDER == __BIG_ENDIAN)
+# error "Unknown or unsupported endianness (__BYTE_ORDER)"
+#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
+# error "Two's complement, reasonably sized integer types, please"
+#endif
+
/** @defgroup internal MDB Internals
* @{
*/
#define pthread_mutex_t HANDLE
#define pthread_key_t DWORD
#define pthread_self() GetCurrentThreadId()
-#define pthread_key_create(x,y) *(x) = TlsAlloc()
+#define pthread_key_create(x,y) (*(x) = TlsAlloc())
#define pthread_key_delete(x) TlsFree(x)
#define pthread_getspecific(x) TlsGetValue(x)
#define pthread_setspecific(x,y) TlsSetValue(x,y)
#define pthread_mutex_unlock(x) ReleaseMutex(x)
#define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE)
-#define LOCK_MUTEX_R(env) pthread_mutex_lock(env->me_rmutex)
-#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(env->me_rmutex)
-#define LOCK_MUTEX_W(env) pthread_mutex_lock(env->me_wmutex)
-#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(env->me_wmutex)
+#define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex)
+#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock((env)->me_rmutex)
+#define LOCK_MUTEX_W(env) pthread_mutex_lock((env)->me_wmutex)
+#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock((env)->me_wmutex)
#define getpid() GetCurrentProcessId()
-#define fdatasync(fd) !FlushFileBuffers(fd)
+#define fdatasync(fd) (!FlushFileBuffers(fd))
#define ErrCode() GetLastError()
-#define GetPageSize(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
+#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
#define close(fd) CloseHandle(fd)
#define munmap(ptr,len) UnmapViewOfFile(ptr)
#else
/** Lock the reader mutex.
*/
-#define LOCK_MUTEX_R(env) pthread_mutex_lock(&env->me_txns->mti_mutex)
+#define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_txns->mti_mutex)
/** Unlock the reader mutex.
*/
-#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&env->me_txns->mti_mutex)
+#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_txns->mti_mutex)
/** Lock the writer mutex.
* Only a single write transaction is allowed at a time. Other writers
* will block waiting for this mutex.
*/
-#define LOCK_MUTEX_W(env) pthread_mutex_lock(&env->me_txns->mti_wmutex)
+#define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_txns->mti_wmutex)
/** Unlock the writer mutex.
*/
-#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&env->me_txns->mti_wmutex)
+#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_txns->mti_wmutex)
/** Get the error code for the last failed system function.
*/
* Mainly used to initialize file variables and signify that they are
* unused.
*/
-#define INVALID_HANDLE_VALUE -1
+#define INVALID_HANDLE_VALUE (-1)
/** Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files.
*/
-#define GetPageSize(x) (x) = sysconf(_SC_PAGE_SIZE)
+#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
#endif
/** @} */
/** A default memory page size.
* The actual size is platform-dependent, but we use this for
* boot-strapping. We probably should not be using this any more.
- * The #GetPageSize() macro is used to get the actual size.
+ * The #GET_PAGESIZE() macro is used to get the actual size.
*
* Note that we don't currently support Huge pages. On Linux,
* regular data files cannot use Huge pages, and in general
*/
#define DKEY(x) mdb_dkey(x, kbuf)
#else
-#define DKBUF
+#define DKBUF typedef int dummy_kbuf /* so we can put ';' after */
#define DKEY(x)
#endif
#define LAZY_RWLOCK_WRLOCK(x)
/** Grab the DB table read lock */
#define LAZY_RWLOCK_RDLOCK(x)
- /** Declare the DB table rwlock */
+ /** Declare the DB table rwlock. Should not be followed by ';'. */
#define LAZY_RWLOCK_DEF(x)
/** Initialize the DB table rwlock */
#define LAZY_RWLOCK_INIT(x,y)
* Since the database uses multi-version concurrency control, readers don't
* actually need any locking. This table is used to keep track of which
* readers are using data from which old transactions, so that we'll know
- * when a particular old transaction is no longer in use, Old transactions
- * that have freed any data pages can then have their freed pages reclaimed
+ * when a particular old transaction is no longer in use. Old transactions
+ * that have discarded any data pages can then have those pages reclaimed
* for use by a later write transaction.
*
* The lock table is constructed such that reader slots are aligned with the
* headers on any page after the first.
*/
typedef struct MDB_page {
- union {
- pgno_t mp_pgno; /**< page number */
- void * mp_next; /**< for in-memory list of freed structs */
- };
+#define mp_pgno mp_p.p_pgno
+#define mp_next mp_p.p_next
+ union padded {
+ pgno_t p_pgno; /**< page number */
+ void * p_next; /**< for in-memory list of freed structs */
+ } mp_p;
#define P_BRANCH 0x01 /**< branch page */
#define P_LEAF 0x02 /**< leaf page */
#define P_OVERFLOW 0x04 /**< overflow page */
#define P_DIRTY 0x10 /**< dirty page */
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
uint32_t mp_flags;
- union {
+#define mp_lower mp_pb.pb.pb_lower
+#define mp_upper mp_pb.pb.pb_upper
+#define mp_pages mp_pb.pb_pages
+ union page_bounds {
struct {
- indx_t mp_lower; /**< lower bound of free space */
- indx_t mp_upper; /**< upper bound of free space */
- };
- uint32_t mp_pages; /**< number of overflow pages */
- };
+ indx_t pb_lower; /**< lower bound of free space */
+ indx_t pb_upper; /**< upper bound of free space */
+ } pb;
+ uint32_t pb_pages; /**< number of overflow pages */
+ } mp_pb;
indx_t mp_ptrs[1]; /**< dynamic size */
} MDB_page;
typedef struct MDB_node {
/** lo and hi are used for data size on leaf nodes and for
* child pgno on branch nodes. On 64 bit platforms, flags
- * is also used for pgno. (branch nodes ignore flags)
+ * is also used for pgno. (Branch nodes have no flags).
+ * They are in in host byte order in case that lets some
+ * accesses be optimized into a 32-bit word access.
*/
- unsigned short mn_lo;
- unsigned short mn_hi; /**< part of dsize or pgno */
+#define mn_lo mn_offset[__BYTE_ORDER!=__LITTLE_ENDIAN]
+#define mn_hi mn_offset[__BYTE_ORDER==__LITTLE_ENDIAN] /**< part of dsize or pgno */
+ unsigned short mn_offset[2];
unsigned short mn_flags; /**< flags for special node types */
#define F_BIGDATA 0x01 /**< data put on overflow page */
#define F_SUBDATA 0x02 /**< data is a sub-database */
/** Size of the node header, excluding dynamic data at the end */
#define NODESIZE offsetof(MDB_node, mn_data)
- /** Size of a node in a branch page.
+ /** Size of a node in a branch page with a given key.
* This is just the node header plus the key, there is no data.
*/
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
- /** Size of a node in a leaf page.
+ /** Size of a node in a leaf page with a given key and data.
* This is node header plus key plus data size.
*/
#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
- /** Address of node \i in page \p */
+ /** Address of node \b i in page \b p */
#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
/** Address of the key for the node */
#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
/** Set the \b node's key into \b key, if requested. */
-#define MDB_SET_KEY(node, key) if (key!=NULL) {(key)->mv_size = NODEKSZ(node); (key)->mv_data = NODEKEY(node);}
+#define MDB_SET_KEY(node, key) { if ((key) != NULL) { \
+ (key)->mv_size = NODEKSZ(node); (key)->mv_data = NODEKEY(node); } }
/** Information about a single database in the environment. */
typedef struct MDB_db {
HANDLE me_fd; /**< The main data file */
HANDLE me_lfd; /**< The lock file */
HANDLE me_mfd; /**< just for writing the meta pages */
+ /** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
uint32_t me_flags;
uint32_t me_extrapad; /**< unused for now */
size_t me_mapsize; /**< size of the data memory map */
off_t me_size; /**< current file size */
pgno_t me_maxpg; /**< me_mapsize / me_psize */
- unsigned int me_psize; /**< size of a page, from #GetPageSize */
+ unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
unsigned int me_db_toggle; /**< which DB table is current */
MDB_dbx *me_dbxs; /**< array of static DB info */
MDB_db *me_dbs[2]; /**< two arrays of MDB_db info */
/** ID2L of pages that were written during a write txn */
ID2 me_dirty_list[MDB_IDL_UM_SIZE];
/** rwlock for the DB tables, if #LAZY_LOCKS is false */
- LAZY_RWLOCK_DEF(me_dblock);
+ LAZY_RWLOCK_DEF(me_dblock)
#ifdef _WIN32
HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */
HANDLE me_wmutex;
static int mdb_rebalance(MDB_cursor *mc);
static int mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
-static int mdb_move_node(MDB_cursor *csrcrc, MDB_cursor *cdstst);
-static int mdb_merge(MDB_cursor *csrcrc, MDB_cursor *cdstst);
+static int mdb_move_node(MDB_cursor *csrc, MDB_cursor *cdst);
+static int mdb_merge(MDB_cursor *csrc, MDB_cursor *cdst);
static int mdb_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
pgno_t newpgno);
static MDB_page *mdb_new_page(MDB_cursor *mc, uint32_t flags, int num);
static int mdb_sec_inited;
#endif
+/** Return the library version info. */
char *
mdb_version(int *major, int *minor, int *patch)
{
return MDB_VERSION_STRING;
}
- /** Table of descriptions for MDB @ref error codes */
+/** Table of descriptions for MDB @ref errors */
static char *const mdb_errstr[] = {
"MDB_KEYEXIST: Key/data pair already exists",
"MDB_NOTFOUND: No matching key/data pair found",
}
#if DEBUG
-static char *
+/** Display a key in hexadecimal and return the address of the result.
+ * @param[in] key the key to display
+ * @param[in] buf the buffer to write into. Should always be #DKBUF.
+ * @return The key in hexadecimal form.
+ */
+char *
mdb_dkey(MDB_val *key, char *buf)
{
char *ptr = buf;
unsigned int i;
if (key->mv_size > MAXKEYSIZE)
return "MAXKEYSIZE";
+ /* may want to make this a dynamic check: if the key is mostly
+ * printable characters, print it as-is instead of converting to hex.
+ */
#if 1
for (i=0; i<key->mv_size; i++)
ptr += sprintf(ptr, "%02x", *c++);
return txn->mt_dbxs[dbi].md_cmp(a, b);
}
+/** Compare two data items according to a particular database.
+ * This returns a comparison as if the two items were data items of
+ * a sorted duplicates #MDB_DUPSORT database.
+ * @param[in] txn A transaction handle returned by #mdb_txn_begin()
+ * @param[in] dbi A database handle returned by #mdb_open()
+ * @param[in] a The first item to compare
+ * @param[in] b The second item to compare
+ * @return < 0 if a < b, 0 if a == b, > 0 if a > b
+ */
int
mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
{
return EINVAL; /* too bad you can't distinguish this from a valid result */
}
-/* Allocate new page(s) for writing */
+/** Allocate pages for writing.
+ * If there are free pages available from older transactions, they
+ * will be re-used first. Otherwise a new page will be allocated.
+ * @param[in] mc cursor A cursor handle identifying the transaction and
+ * database for which we are allocating.
+ * @param[in] num the number of pages to allocate.
+ * @return Address of the allocated page(s). Requests for multiple pages
+ * will always be satisfied by a single contiguous chunk of memory.
+ */
static MDB_page *
mdb_alloc_page(MDB_cursor *mc, int num)
{
return np;
}
-/* Touch a page: make it dirty and re-insert into tree with updated pgno.
+/** Touch a page: make it dirty and re-insert into tree with updated pgno.
+ * @param[in] mc cursor pointing to the page to be touched
+ * @return 0 on success, non-zero on failure.
*/
static int
mdb_touch(MDB_cursor *mc)
mp->mp_flags |= P_DIRTY;
mc->mc_pg[mc->mc_top] = mp;
- /* Update the page number to new touched page. */
+ /** If this page has a parent, update the parent to point to
+ * this new page.
+ */
if (mc->mc_top)
SETPGNO(NODEPTR(mc->mc_pg[mc->mc_top-1], mc->mc_ki[mc->mc_top-1]), mp->mp_pgno);
}
static inline void
mdb_txn_reset0(MDB_txn *txn);
+/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
+ * @param[in] txn the transaction handle to initialize
+ * @return 0 on success, non-zero on failure. This can only
+ * fail for read-only transactions, and then only if the
+ * reader table is full.
+ */
static inline int
mdb_txn_renew0(MDB_txn *txn)
{
return rc;
}
+/** Common code for #mdb_txn_reset() and #mdb_txn_abort().
+ * @param[in] txn the transaction handle to reset
+ */
static inline void
mdb_txn_reset0(MDB_txn *txn)
{
env->me_txn = NULL;
for (i=2; i<env->me_numdbs; i++)
env->me_dbxs[i].md_dirty = 0;
+ /* The writer mutex was locked in mdb_txn_begin. */
UNLOCK_MUTEX_W(env);
}
}
return MDB_SUCCESS;
}
+/** Read the environment parameters of a DB environment before
+ * mapping it into memory.
+ * @param[in] env the environment handle
+ * @param[out] meta address of where to store the meta information
+ * @return 0 on success, non-zero on failure.
+ */
static int
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
{
return 0;
}
+/** Write the environment parameters of a freshly created DB environment.
+ * @param[in] env the environment handle
+ * @param[out] meta address of where to store the meta information
+ * @return 0 on success, non-zero on failure.
+ */
static int
mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
{
DPUTS("writing new meta page");
- GetPageSize(psize);
+ GET_PAGESIZE(psize);
meta->mm_magic = MDB_MAGIC;
meta->mm_version = MDB_VERSION;
return rc;
}
+/** Update the environment info to commit a transaction.
+ * @param[in] txn the transaction that's being committed
+ * @return 0 on success, non-zero on failure.
+ */
static int
mdb_env_write_meta(MDB_txn *txn)
{
return MDB_SUCCESS;
}
+/** Check both meta pages to see which one is newer.
+ * @param[in] env the environment handle
+ * @param[out] which address of where to store the meta toggle ID
+ * @return 0 on success, non-zero on failure.
+ */
static int
mdb_env_read_meta(MDB_env *env, int *which)
{
return MDB_SUCCESS;
}
+/** Further setup required for opening an MDB environment
+ */
static int
mdb_env_open2(MDB_env *env, unsigned int flags)
{
unsigned short *u, *c;
int x;
- u = a->mv_data + a->mv_size;
- c = b->mv_data + a->mv_size;
+ u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
+ c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
do {
x = *--u - *--c;
} while(!x && u > (unsigned short *)a->mv_data);
static int
memncmp(const MDB_val *a, const MDB_val *b)
{
- int diff, len_diff;
+ int diff;
+ ssize_t len_diff;
unsigned int len;
len = a->mv_size;
- len_diff = a->mv_size - b->mv_size;
- if (len_diff > 0)
+ len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
+ if (len_diff > 0) {
len = b->mv_size;
+ len_diff = 1;
+ }
+
diff = memcmp(a->mv_data, b->mv_data, len);
- return diff ? diff : len_diff;
+ return diff ? diff : len_diff<0 ? -1 : len_diff;
}
static int
memnrcmp(const MDB_val *a, const MDB_val *b)
{
const unsigned char *p1, *p2, *p1_lim;
- int diff, len_diff;
-
- if (b->mv_size == 0)
- return a->mv_size != 0;
- if (a->mv_size == 0)
- return -1;
+ ssize_t len_diff;
+ int diff;
- p1 = (const unsigned char *)a->mv_data + a->mv_size - 1;
- p2 = (const unsigned char *)b->mv_data + b->mv_size - 1;
+ p1_lim = (const unsigned char *)a->mv_data;
+ p1 = (const unsigned char *)a->mv_data + a->mv_size;
+ p2 = (const unsigned char *)b->mv_data + b->mv_size;
- len_diff = a->mv_size - b->mv_size;
- if (len_diff < 0)
- p1_lim = p1 - a->mv_size;
- else
- p1_lim = p1 - b->mv_size;
+ len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
+ if (len_diff > 0) {
+ p1_lim += len_diff;
+ len_diff = 1;
+ }
while (p1 > p1_lim) {
- diff = *p1 - *p2;
+ diff = *--p1 - *--p2;
if (diff)
return diff;
- p1--;
- p2--;
}
- return len_diff;
+ return len_diff<0 ? -1 : len_diff;
}
/* Search for key within a leaf page, using binary search.
set1:
if (exactp)
*exactp = 1;
- rc = 0;
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
goto set3;
}
goto top;
} else {
int exact = 0;
- rc = mdb_cursor_set(mc, key, NULL, MDB_SET, &exact);
+ MDB_val d2;
+ rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
if (flags == MDB_NOOVERWRITE && rc == 0) {
DPRINTF("duplicate key [%s]", DKEY(key));
+ *data = d2;
return MDB_KEYEXIST;
}
if (rc && rc != MDB_NOTFOUND)
rdata = &xdata;
xdata.mv_size = sizeof(MDB_db);
xdata.mv_data = &dummy;
+ /* new sub-DB, must fully init xcursor */
+ if (flags == MDB_CURRENT)
+ flags = 0;
goto new_sub;
}
goto put_sub;
int
mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
{
+ /** Only a subset of the @ref mdb_env flags can be changed
+ * at runtime. Changing other flags requires closing the environment
+ * and re-opening it with the new flags.
+ */
#define CHANGEABLE (MDB_NOSYNC)
if ((flag & CHANGEABLE) != flag)
return EINVAL;