git.sur5r.net Git - openldap/blob - libraries/libmdb/mdb.c

   1 /** @file mdb.c
   2  *      @brief memory-mapped database library
   3  *
   4  *      A Btree-based database management library modeled loosely on the
   5  *      BerkeleyDB API, but much simplified.
   6  */
   7 /*
   8  * Copyright 2011-2012 Howard Chu, Symas Corp.
   9  * All rights reserved.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted only as authorized by the OpenLDAP
  13  * Public License.
  14  *
  15  * A copy of this license is available in the file LICENSE in the
  16  * top-level directory of the distribution or, alternatively, at
  17  * <http://www.OpenLDAP.org/license.html>.
  18  *
  19  * This code is derived from btree.c written by Martin Hedenfalk.
  20  *
  21  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
  22  *
  23  * Permission to use, copy, modify, and distribute this software for any
  24  * purpose with or without fee is hereby granted, provided that the above
  25  * copyright notice and this permission notice appear in all copies.
  26  *
  27  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  28  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  29  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  30  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  31  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  32  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  33  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  34  */
  35 #include <sys/types.h>
  36 #include <sys/stat.h>
  37 #include <sys/param.h>
  38 #ifdef _WIN32
  39 #include <windows.h>
  40 #else
  41 #include <sys/uio.h>
  42 #include <sys/mman.h>
  43 #ifdef HAVE_SYS_FILE_H
  44 #include <sys/file.h>
  45 #endif
  46 #include <fcntl.h>
  47 #endif
  48
  49 #include <assert.h>
  50 #include <errno.h>
  51 #include <limits.h>
  52 #include <stddef.h>
  53 #include <inttypes.h>
  54 #include <stdio.h>
  55 #include <stdlib.h>
  56 #include <string.h>
  57 #include <time.h>
  58 #include <unistd.h>
  59
  60 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
  61 #include <resolv.h>     /* defines BYTE_ORDER on HPUX and Solaris */
  62 #endif
  63
  64 #if defined(__APPLE__) || defined (BSD)
  65 #define USE_POSIX_SEM
  66 #endif
  67
  68 #ifndef _WIN32
  69 #include <pthread.h>
  70 #ifdef USE_POSIX_SEM
  71 #include <semaphore.h>
  72 #endif
  73 #endif
  74
  75 #ifdef USE_VALGRIND
  76 #include <valgrind/memcheck.h>
  77 #define VGMEMP_CREATE(h,r,z)    VALGRIND_CREATE_MEMPOOL(h,r,z)
  78 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
  79 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
  80 #define VGMEMP_DESTROY(h)       VALGRIND_DESTROY_MEMPOOL(h)
  81 #define VGMEMP_DEFINED(a,s)     VALGRIND_MAKE_MEM_DEFINED(a,s)
  82 #else
  83 #define VGMEMP_CREATE(h,r,z)
  84 #define VGMEMP_ALLOC(h,a,s)
  85 #define VGMEMP_FREE(h,a)
  86 #define VGMEMP_DESTROY(h)
  87 #define VGMEMP_DEFINED(a,s)
  88 #endif
  89
  90 #ifndef BYTE_ORDER
  91 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
  92 /* Solaris just defines one or the other */
  93 #  define LITTLE_ENDIAN 1234
  94 #  define BIG_ENDIAN    4321
  95 #  ifdef _LITTLE_ENDIAN
  96 #   define BYTE_ORDER  LITTLE_ENDIAN
  97 #  else
  98 #   define BYTE_ORDER  BIG_ENDIAN
  99 #  endif
 100 # else
 101 #  define BYTE_ORDER   __BYTE_ORDER
 102 # endif
 103 #endif
 104
 105 #ifndef LITTLE_ENDIAN
 106 #define LITTLE_ENDIAN   __LITTLE_ENDIAN
 107 #endif
 108 #ifndef BIG_ENDIAN
 109 #define BIG_ENDIAN      __BIG_ENDIAN
 110 #endif
 111
 112 #if defined(__i386) || defined(__x86_64)
 113 #define MISALIGNED_OK   1
 114 #endif
 115
 116 #include "mdb.h"
 117 #include "midl.h"
 118
 119 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
 120 # error "Unknown or unsupported endianness (BYTE_ORDER)"
 121 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
 122 # error "Two's complement, reasonably sized integer types, please"
 123 #endif
 124
 125 /** @defgroup internal  MDB Internals
 126  *      @{
 127  */
 128 /** @defgroup compat    Windows Compatibility Macros
 129  *      A bunch of macros to minimize the amount of platform-specific ifdefs
 130  *      needed throughout the rest of the code. When the features this library
 131  *      needs are similar enough to POSIX to be hidden in a one-or-two line
 132  *      replacement, this macro approach is used.
 133  *      @{
 134  */
 135 #ifdef _WIN32
 136 #define pthread_t       DWORD
 137 #define pthread_mutex_t HANDLE
 138 #define pthread_key_t   DWORD
 139 #define pthread_self()  GetCurrentThreadId()
 140 #define pthread_key_create(x,y) (*(x) = TlsAlloc())
 141 #define pthread_key_delete(x)   TlsFree(x)
 142 #define pthread_getspecific(x)  TlsGetValue(x)
 143 #define pthread_setspecific(x,y)        TlsSetValue(x,y)
 144 #define pthread_mutex_unlock(x) ReleaseMutex(x)
 145 #define pthread_mutex_lock(x)   WaitForSingleObject(x, INFINITE)
 146 #define LOCK_MUTEX_R(env)       pthread_mutex_lock((env)->me_rmutex)
 147 #define UNLOCK_MUTEX_R(env)     pthread_mutex_unlock((env)->me_rmutex)
 148 #define LOCK_MUTEX_W(env)       pthread_mutex_lock((env)->me_wmutex)
 149 #define UNLOCK_MUTEX_W(env)     pthread_mutex_unlock((env)->me_wmutex)
 150 #define getpid()        GetCurrentProcessId()
 151 #define MDB_FDATASYNC(fd)       (!FlushFileBuffers(fd))
 152 #define ErrCode()       GetLastError()
 153 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
 154 #define close(fd)       CloseHandle(fd)
 155 #define munmap(ptr,len) UnmapViewOfFile(ptr)
 156 #else
 157 #ifdef USE_POSIX_SEM
 158 #define LOCK_MUTEX_R(env)       sem_wait((env)->me_rmutex)
 159 #define UNLOCK_MUTEX_R(env)     sem_post((env)->me_rmutex)
 160 #define LOCK_MUTEX_W(env)       sem_wait((env)->me_wmutex)
 161 #define UNLOCK_MUTEX_W(env)     sem_post((env)->me_wmutex)
 162 #define MDB_FDATASYNC(fd)       fsync(fd)
 163 #else
 164 #ifdef ANDROID
 165 #define MDB_FDATASYNC(fd)       fsync(fd)
 166 #endif
 167         /** Lock the reader mutex.
 168          */
 169 #define LOCK_MUTEX_R(env)       pthread_mutex_lock(&(env)->me_txns->mti_mutex)
 170         /** Unlock the reader mutex.
 171          */
 172 #define UNLOCK_MUTEX_R(env)     pthread_mutex_unlock(&(env)->me_txns->mti_mutex)
 173
 174         /** Lock the writer mutex.
 175          *      Only a single write transaction is allowed at a time. Other writers
 176          *      will block waiting for this mutex.
 177          */
 178 #define LOCK_MUTEX_W(env)       pthread_mutex_lock(&(env)->me_txns->mti_wmutex)
 179         /** Unlock the writer mutex.
 180          */
 181 #define UNLOCK_MUTEX_W(env)     pthread_mutex_unlock(&(env)->me_txns->mti_wmutex)
 182 #endif  /* USE_POSIX_SEM */
 183
 184         /** Get the error code for the last failed system function.
 185          */
 186 #define ErrCode()       errno
 187
 188         /** An abstraction for a file handle.
 189          *      On POSIX systems file handles are small integers. On Windows
 190          *      they're opaque pointers.
 191          */
 192 #define HANDLE  int
 193
 194         /**     A value for an invalid file handle.
 195          *      Mainly used to initialize file variables and signify that they are
 196          *      unused.
 197          */
 198 #define INVALID_HANDLE_VALUE    (-1)
 199
 200         /** Get the size of a memory page for the system.
 201          *      This is the basic size that the platform's memory manager uses, and is
 202          *      fundamental to the use of memory-mapped files.
 203          */
 204 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
 205 #endif
 206
 207 #if defined(_WIN32) || defined(USE_POSIX_SEM)
 208 #define MNAME_LEN       32
 209 #else
 210 #define MNAME_LEN       (sizeof(pthread_mutex_t))
 211 #endif
 212
 213 /** @} */
 214
 215 #ifndef _WIN32
 216 /**     A flag for opening a file and requesting synchronous data writes.
 217  *      This is only used when writing a meta page. It's not strictly needed;
 218  *      we could just do a normal write and then immediately perform a flush.
 219  *      But if this flag is available it saves us an extra system call.
 220  *
 221  *      @note If O_DSYNC is undefined but exists in /usr/include,
 222  * preferably set some compiler flag to get the definition.
 223  * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.
 224  */
 225 #ifndef MDB_DSYNC
 226 # define MDB_DSYNC      O_DSYNC
 227 #endif
 228 #endif
 229
 230 /** Function for flushing the data of a file. Define this to fsync
 231  *      if fdatasync() is not supported.
 232  */
 233 #ifndef MDB_FDATASYNC
 234 # define MDB_FDATASYNC  fdatasync
 235 #endif
 236
 237         /** A page number in the database.
 238          *      Note that 64 bit page numbers are overkill, since pages themselves
 239          *      already represent 12-13 bits of addressable memory, and the OS will
 240          *      always limit applications to a maximum of 63 bits of address space.
 241          *
 242          *      @note In the #MDB_node structure, we only store 48 bits of this value,
 243          *      which thus limits us to only 60 bits of addressable data.
 244          */
 245 typedef MDB_ID  pgno_t;
 246
 247         /** A transaction ID.
 248          *      See struct MDB_txn.mt_txnid for details.
 249          */
 250 typedef MDB_ID  txnid_t;
 251
 252 /** @defgroup debug     Debug Macros
 253  *      @{
 254  */
 255 #ifndef MDB_DEBUG
 256         /**     Enable debug output.
 257          *      Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
 258          *      read from and written to the database (used for free space management).
 259          */
 260 #define MDB_DEBUG 0
 261 #endif
 262
 263 #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
 264 # define DPRINTF        (void)  /* Vararg macros may be unsupported */
 265 #elif MDB_DEBUG
 266 static int mdb_debug;
 267 static txnid_t mdb_debug_start;
 268
 269         /**     Print a debug message with printf formatting. */
 270 # define DPRINTF(fmt, ...)      /**< Requires 2 or more args */ \
 271         ((void) ((mdb_debug) && \
 272          fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__)))
 273 #else
 274 # define DPRINTF(fmt, ...)      ((void) 0)
 275 #endif
 276         /**     Print a debug string.
 277          *      The string is printed literally, with no format processing.
 278          */
 279 #define DPUTS(arg)      DPRINTF("%s", arg)
 280 /** @} */
 281
 282         /** A default memory page size.
 283          *      The actual size is platform-dependent, but we use this for
 284          *      boot-strapping. We probably should not be using this any more.
 285          *      The #GET_PAGESIZE() macro is used to get the actual size.
 286          *
 287          *      Note that we don't currently support Huge pages. On Linux,
 288          *      regular data files cannot use Huge pages, and in general
 289          *      Huge pages aren't actually pageable. We rely on the OS
 290          *      demand-pager to read our data and page it out when memory
 291          *      pressure from other processes is high. So until OSs have
 292          *      actual paging support for Huge pages, they're not viable.
 293          */
 294 #define MDB_PAGESIZE     4096
 295
 296         /** The minimum number of keys required in a database page.
 297          *      Setting this to a larger value will place a smaller bound on the
 298          *      maximum size of a data item. Data items larger than this size will
 299          *      be pushed into overflow pages instead of being stored directly in
 300          *      the B-tree node. This value used to default to 4. With a page size
 301          *      of 4096 bytes that meant that any item larger than 1024 bytes would
 302          *      go into an overflow page. That also meant that on average 2-3KB of
 303          *      each overflow page was wasted space. The value cannot be lower than
 304          *      2 because then there would no longer be a tree structure. With this
 305          *      value, items larger than 2KB will go into overflow pages, and on
 306          *      average only 1KB will be wasted.
 307          */
 308 #define MDB_MINKEYS      2
 309
 310         /**     A stamp that identifies a file as an MDB file.
 311          *      There's nothing special about this value other than that it is easily
 312          *      recognizable, and it will reflect any byte order mismatches.
 313          */
 314 #define MDB_MAGIC        0xBEEFC0DE
 315
 316         /**     The version number for a database's file format. */
 317 #define MDB_VERSION      1
 318
 319         /**     The maximum size of a key in the database.
 320          *      While data items have essentially unbounded size, we require that
 321          *      keys all fit onto a regular page. This limit could be raised a bit
 322          *      further if needed; to something just under #MDB_PAGESIZE / #MDB_MINKEYS.
 323          */
 324 #define MAXKEYSIZE       511
 325
 326 #if MDB_DEBUG
 327         /**     A key buffer.
 328          *      @ingroup debug
 329          *      This is used for printing a hex dump of a key's contents.
 330          */
 331 #define DKBUF   char kbuf[(MAXKEYSIZE*2+1)]
 332         /**     Display a key in hex.
 333          *      @ingroup debug
 334          *      Invoke a function to display a key in hex.
 335          */
 336 #define DKEY(x) mdb_dkey(x, kbuf)
 337 #else
 338 #define DKBUF   typedef int dummy_kbuf  /* so we can put ';' after */
 339 #define DKEY(x) 0
 340 #endif
 341
 342         /** An invalid page number.
 343          *      Mainly used to denote an empty tree.
 344          */
 345 #define P_INVALID        (~0UL)
 346
 347         /** Test if a flag \b f is set in a flag word \b w. */
 348 #define F_ISSET(w, f)    (((w) & (f)) == (f))
 349
 350         /**     Used for offsets within a single page.
 351          *      Since memory pages are typically 4 or 8KB in size, 12-13 bits,
 352          *      this is plenty.
 353          */
 354 typedef uint16_t         indx_t;
 355
 356         /**     Default size of memory map.
 357          *      This is certainly too small for any actual applications. Apps should always set
 358          *      the size explicitly using #mdb_env_set_mapsize().
 359          */
 360 #define DEFAULT_MAPSIZE 1048576
 361
 362 /**     @defgroup readers       Reader Lock Table
 363  *      Readers don't acquire any locks for their data access. Instead, they
 364  *      simply record their transaction ID in the reader table. The reader
 365  *      mutex is needed just to find an empty slot in the reader table. The
 366  *      slot's address is saved in thread-specific data so that subsequent read
 367  *      transactions started by the same thread need no further locking to proceed.
 368  *
 369  *      Since the database uses multi-version concurrency control, readers don't
 370  *      actually need any locking. This table is used to keep track of which
 371  *      readers are using data from which old transactions, so that we'll know
 372  *      when a particular old transaction is no longer in use. Old transactions
 373  *      that have discarded any data pages can then have those pages reclaimed
 374  *      for use by a later write transaction.
 375  *
 376  *      The lock table is constructed such that reader slots are aligned with the
 377  *      processor's cache line size. Any slot is only ever used by one thread.
 378  *      This alignment guarantees that there will be no contention or cache
 379  *      thrashing as threads update their own slot info, and also eliminates
 380  *      any need for locking when accessing a slot.
 381  *
 382  *      A writer thread will scan every slot in the table to determine the oldest
 383  *      outstanding reader transaction. Any freed pages older than this will be
 384  *      reclaimed by the writer. The writer doesn't use any locks when scanning
 385  *      this table. This means that there's no guarantee that the writer will
 386  *      see the most up-to-date reader info, but that's not required for correct
 387  *      operation - all we need is to know the upper bound on the oldest reader,
 388  *      we don't care at all about the newest reader. So the only consequence of
 389  *      reading stale information here is that old pages might hang around a
 390  *      while longer before being reclaimed. That's actually good anyway, because
 391  *      the longer we delay reclaiming old pages, the more likely it is that a
 392  *      string of contiguous pages can be found after coalescing old pages from
 393  *      many old transactions together.
 394  *
 395  *      @todo We don't actually do such coalescing yet, we grab pages from one
 396  *      old transaction at a time.
 397  *      @{
 398  */
 399         /**     Number of slots in the reader table.
 400          *      This value was chosen somewhat arbitrarily. 126 readers plus a
 401          *      couple mutexes fit exactly into 8KB on my development machine.
 402          *      Applications should set the table size using #mdb_env_set_maxreaders().
 403          */
 404 #define DEFAULT_READERS 126
 405
 406         /**     The size of a CPU cache line in bytes. We want our lock structures
 407          *      aligned to this size to avoid false cache line sharing in the
 408          *      lock table.
 409          *      This value works for most CPUs. For Itanium this should be 128.
 410          */
 411 #ifndef CACHELINE
 412 #define CACHELINE       64
 413 #endif
 414
 415         /**     The information we store in a single slot of the reader table.
 416          *      In addition to a transaction ID, we also record the process and
 417          *      thread ID that owns a slot, so that we can detect stale information,
 418          *      e.g. threads or processes that went away without cleaning up.
 419          *      @note We currently don't check for stale records. We simply re-init
 420          *      the table when we know that we're the only process opening the
 421          *      lock file.
 422          */
 423 typedef struct MDB_rxbody {
 424         /**     The current Transaction ID when this transaction began.
 425          *      Multiple readers that start at the same time will probably have the
 426          *      same ID here. Again, it's not important to exclude them from
 427          *      anything; all we need to know is which version of the DB they
 428          *      started from so we can avoid overwriting any data used in that
 429          *      particular version.
 430          */
 431         txnid_t         mrb_txnid;
 432         /** The process ID of the process owning this reader txn. */
 433         pid_t           mrb_pid;
 434         /** The thread ID of the thread owning this txn. */
 435         pthread_t       mrb_tid;
 436 } MDB_rxbody;
 437
 438         /** The actual reader record, with cacheline padding. */
 439 typedef struct MDB_reader {
 440         union {
 441                 MDB_rxbody mrx;
 442                 /** shorthand for mrb_txnid */
 443 #define mr_txnid        mru.mrx.mrb_txnid
 444 #define mr_pid  mru.mrx.mrb_pid
 445 #define mr_tid  mru.mrx.mrb_tid
 446                 /** cache line alignment */
 447                 char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
 448         } mru;
 449 } MDB_reader;
 450
 451         /** The header for the reader table.
 452          *      The table resides in a memory-mapped file. (This is a different file
 453          *      than is used for the main database.)
 454          *
 455          *      For POSIX the actual mutexes reside in the shared memory of this
 456          *      mapped file. On Windows, mutexes are named objects allocated by the
 457          *      kernel; we store the mutex names in this mapped file so that other
 458          *      processes can grab them. This same approach is also used on
 459          *      MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
 460          *      process-shared POSIX mutexes. For these cases where a named object
 461          *      is used, the object name is derived from a 64 bit FNV hash of the
 462          *      environment pathname. As such, naming collisions are extremely
 463          *      unlikely. If a collision occurs, the results are unpredictable.
 464          */
 465 typedef struct MDB_txbody {
 466                 /** Stamp identifying this as an MDB file. It must be set
 467                  *      to #MDB_MAGIC. */
 468         uint32_t        mtb_magic;
 469                 /** Version number of this lock file. Must be set to #MDB_VERSION. */
 470         uint32_t        mtb_version;
 471 #if defined(_WIN32) || defined(USE_POSIX_SEM)
 472         char    mtb_rmname[MNAME_LEN];
 473 #else
 474                 /** Mutex protecting access to this table.
 475                  *      This is the reader lock that #LOCK_MUTEX_R acquires.
 476                  */
 477         pthread_mutex_t mtb_mutex;
 478 #endif
 479                 /**     The ID of the last transaction committed to the database.
 480                  *      This is recorded here only for convenience; the value can always
 481                  *      be determined by reading the main database meta pages.
 482                  */
 483         txnid_t         mtb_txnid;
 484                 /** The number of slots that have been used in the reader table.
 485                  *      This always records the maximum count, it is not decremented
 486                  *      when readers release their slots.
 487                  */
 488         unsigned        mtb_numreaders;
 489 } MDB_txbody;
 490
 491         /** The actual reader table definition. */
 492 typedef struct MDB_txninfo {
 493         union {
 494                 MDB_txbody mtb;
 495 #define mti_magic       mt1.mtb.mtb_magic
 496 #define mti_version     mt1.mtb.mtb_version
 497 #define mti_mutex       mt1.mtb.mtb_mutex
 498 #define mti_rmname      mt1.mtb.mtb_rmname
 499 #define mti_txnid       mt1.mtb.mtb_txnid
 500 #define mti_numreaders  mt1.mtb.mtb_numreaders
 501                 char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
 502         } mt1;
 503         union {
 504 #if defined(_WIN32) || defined(USE_POSIX_SEM)
 505                 char mt2_wmname[MNAME_LEN];
 506 #define mti_wmname      mt2.mt2_wmname
 507 #else
 508                 pthread_mutex_t mt2_wmutex;
 509 #define mti_wmutex      mt2.mt2_wmutex
 510 #endif
 511                 char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
 512         } mt2;
 513         MDB_reader      mti_readers[1];
 514 } MDB_txninfo;
 515 /** @} */
 516
 517 /** Common header for all page types.
 518  * Overflow records occupy a number of contiguous pages with no
 519  * headers on any page after the first.
 520  */
 521 typedef struct MDB_page {
 522 #define mp_pgno mp_p.p_pgno
 523 #define mp_next mp_p.p_next
 524         union {
 525                 pgno_t          p_pgno; /**< page number */
 526                 void *          p_next; /**< for in-memory list of freed structs */
 527         } mp_p;
 528         uint16_t        mp_pad;
 529 /**     @defgroup mdb_page      Page Flags
 530  *      @ingroup internal
 531  *      Flags for the page headers.
 532  *      @{
 533  */
 534 #define P_BRANCH         0x01           /**< branch page */
 535 #define P_LEAF           0x02           /**< leaf page */
 536 #define P_OVERFLOW       0x04           /**< overflow page */
 537 #define P_META           0x08           /**< meta page */
 538 #define P_DIRTY          0x10           /**< dirty page */
 539 #define P_LEAF2          0x20           /**< for #MDB_DUPFIXED records */
 540 #define P_SUBP           0x40           /**< for #MDB_DUPSORT sub-pages */
 541 /** @} */
 542         uint16_t        mp_flags;               /**< @ref mdb_page */
 543 #define mp_lower        mp_pb.pb.pb_lower
 544 #define mp_upper        mp_pb.pb.pb_upper
 545 #define mp_pages        mp_pb.pb_pages
 546         union {
 547                 struct {
 548                         indx_t          pb_lower;               /**< lower bound of free space */
 549                         indx_t          pb_upper;               /**< upper bound of free space */
 550                 } pb;
 551                 uint32_t        pb_pages;       /**< number of overflow pages */
 552         } mp_pb;
 553         indx_t          mp_ptrs[1];             /**< dynamic size */
 554 } MDB_page;
 555
 556         /** Size of the page header, excluding dynamic data at the end */
 557 #define PAGEHDRSZ        ((unsigned) offsetof(MDB_page, mp_ptrs))
 558
 559         /** Address of first usable data byte in a page, after the header */
 560 #define METADATA(p)      ((void *)((char *)(p) + PAGEHDRSZ))
 561
 562         /** Number of nodes on a page */
 563 #define NUMKEYS(p)       (((p)->mp_lower - PAGEHDRSZ) >> 1)
 564
 565         /** The amount of space remaining in the page */
 566 #define SIZELEFT(p)      (indx_t)((p)->mp_upper - (p)->mp_lower)
 567
 568         /** The percentage of space used in the page, in tenths of a percent. */
 569 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
 570                                 ((env)->me_psize - PAGEHDRSZ))
 571         /** The minimum page fill factor, in tenths of a percent.
 572          *      Pages emptier than this are candidates for merging.
 573          */
 574 #define FILL_THRESHOLD   250
 575
 576         /** Test if a page is a leaf page */
 577 #define IS_LEAF(p)       F_ISSET((p)->mp_flags, P_LEAF)
 578         /** Test if a page is a LEAF2 page */
 579 #define IS_LEAF2(p)      F_ISSET((p)->mp_flags, P_LEAF2)
 580         /** Test if a page is a branch page */
 581 #define IS_BRANCH(p)     F_ISSET((p)->mp_flags, P_BRANCH)
 582         /** Test if a page is an overflow page */
 583 #define IS_OVERFLOW(p)   F_ISSET((p)->mp_flags, P_OVERFLOW)
 584         /** Test if a page is a sub page */
 585 #define IS_SUBP(p)       F_ISSET((p)->mp_flags, P_SUBP)
 586
 587         /** The number of overflow pages needed to store the given size. */
 588 #define OVPAGES(size, psize)    ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
 589
 590         /** Header for a single key/data pair within a page.
 591          * We guarantee 2-byte alignment for nodes.
 592          */
 593 typedef struct MDB_node {
 594         /** lo and hi are used for data size on leaf nodes and for
 595          * child pgno on branch nodes. On 64 bit platforms, flags
 596          * is also used for pgno. (Branch nodes have no flags).
 597          * They are in host byte order in case that lets some
 598          * accesses be optimized into a 32-bit word access.
 599          */
 600 #define mn_lo mn_offset[BYTE_ORDER!=LITTLE_ENDIAN]
 601 #define mn_hi mn_offset[BYTE_ORDER==LITTLE_ENDIAN] /**< part of dsize or pgno */
 602         unsigned short  mn_offset[2];   /**< storage for #mn_lo and #mn_hi */
 603 /** @defgroup mdb_node Node Flags
 604  *      @ingroup internal
 605  *      Flags for node headers.
 606  *      @{
 607  */
 608 #define F_BIGDATA        0x01                   /**< data put on overflow page */
 609 #define F_SUBDATA        0x02                   /**< data is a sub-database */
 610 #define F_DUPDATA        0x04                   /**< data has duplicates */
 611
 612 /** valid flags for #mdb_node_add() */
 613 #define NODE_ADD_FLAGS  (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)
 614
 615 /** @} */
 616         unsigned short  mn_flags;               /**< @ref mdb_node */
 617         unsigned short  mn_ksize;               /**< key size */
 618         char            mn_data[1];                     /**< key and data are appended here */
 619 } MDB_node;
 620
 621         /** Size of the node header, excluding dynamic data at the end */
 622 #define NODESIZE         offsetof(MDB_node, mn_data)
 623
 624         /** Bit position of top word in page number, for shifting mn_flags */
 625 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
 626
 627         /** Size of a node in a branch page with a given key.
 628          *      This is just the node header plus the key, there is no data.
 629          */
 630 #define INDXSIZE(k)      (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
 631
 632         /** Size of a node in a leaf page with a given key and data.
 633          *      This is node header plus key plus data size.
 634          */
 635 #define LEAFSIZE(k, d)   (NODESIZE + (k)->mv_size + (d)->mv_size)
 636
 637         /** Address of node \b i in page \b p */
 638 #define NODEPTR(p, i)    ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
 639
 640         /** Address of the key for the node */
 641 #define NODEKEY(node)    (void *)((node)->mn_data)
 642
 643         /** Address of the data for a node */
 644 #define NODEDATA(node)   (void *)((char *)(node)->mn_data + (node)->mn_ksize)
 645
 646         /** Get the page number pointed to by a branch node */
 647 #define NODEPGNO(node) \
 648         ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
 649          (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
 650         /** Set the page number in a branch node */
 651 #define SETPGNO(node,pgno)      do { \
 652         (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
 653         if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
 654
 655         /** Get the size of the data in a leaf node */
 656 #define NODEDSZ(node)    ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
 657         /** Set the size of the data for a leaf node */
 658 #define SETDSZ(node,size)       do { \
 659         (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
 660         /** The size of a key in a node */
 661 #define NODEKSZ(node)    ((node)->mn_ksize)
 662
 663         /** Copy a page number from src to dst */
 664 #ifdef MISALIGNED_OK
 665 #define COPY_PGNO(dst,src)      dst = src
 666 #else
 667 #if SIZE_MAX > 4294967295UL
 668 #define COPY_PGNO(dst,src)      do { \
 669         unsigned short *s, *d;  \
 670         s = (unsigned short *)&(src);   \
 671         d = (unsigned short *)&(dst);   \
 672         *d++ = *s++;    \
 673         *d++ = *s++;    \
 674         *d++ = *s++;    \
 675         *d = *s;        \
 676 } while (0)
 677 #else
 678 #define COPY_PGNO(dst,src)      do { \
 679         unsigned short *s, *d;  \
 680         s = (unsigned short *)&(src);   \
 681         d = (unsigned short *)&(dst);   \
 682         *d++ = *s++;    \
 683         *d = *s;        \
 684 } while (0)
 685 #endif
 686 #endif
 687         /** The address of a key in a LEAF2 page.
 688          *      LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
 689          *      There are no node headers, keys are stored contiguously.
 690          */
 691 #define LEAF2KEY(p, i, ks)      ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
 692
 693         /** Set the \b node's key into \b key, if requested. */
 694 #define MDB_SET_KEY(node, key)  { if ((key) != NULL) { \
 695         (key)->mv_size = NODEKSZ(node); (key)->mv_data = NODEKEY(node); } }
 696
 697         /** Information about a single database in the environment.
              *      Two of these are embedded in #MDB_meta (mm_dbs), and
              *      mdb_audit() copies one out of a leaf node flagged F_SUBDATA,
              *      so copies of this record live inside pages. The field layout
              *      is therefore presumably part of the file format - do not
              *      reorder or resize fields without confirming.
              */
 698 typedef struct MDB_db {
 699         uint32_t        md_pad;         /**< also ksize for LEAF2 pages */
 700         uint16_t        md_flags;       /**< @ref mdb_open */
 701         uint16_t        md_depth;       /**< depth of this tree */
 702         pgno_t          md_branch_pages;        /**< number of internal pages */
 703         pgno_t          md_leaf_pages;          /**< number of leaf pages */
 704         pgno_t          md_overflow_pages;      /**< number of overflow pages */
 705         size_t          md_entries;             /**< number of data items */
 706         pgno_t          md_root;                /**< the root page of this tree */
 707 } MDB_db;
 708
 709         /** Handle for the DB used to track free pages.
              *      Also the index of that DB's record in MDB_meta.mm_dbs and in
              *      MDB_txn.mt_dbs.
              */
 710 #define FREE_DBI        0
 711         /** Handle for the default DB (second entry of MDB_meta.mm_dbs). */
 712 #define MAIN_DBI        1
 713
 714         /** Meta page content.
              *      The environment keeps pointers to two of these
              *      (MDB_env.me_metas).
              */
 715 typedef struct MDB_meta {
 716                 /** Stamp identifying this as an MDB file. It must be set
 717                  *      to #MDB_MAGIC. */
 718         uint32_t        mm_magic;
 719                 /** Version number of this file. Must be set to #MDB_VERSION.
                      *      NOTE(review): this comment used to say "lock file", but
                      *      the struct describes a meta page - confirm. */
 720         uint32_t        mm_version;
 721         void            *mm_address;            /**< address for fixed mapping */
 722         size_t          mm_mapsize;                     /**< size of mmap region */
 723         MDB_db          mm_dbs[2];                      /**< first is free space, 2nd is main db */
 724         /** The size of pages used in this DB (stored in the otherwise unused
              *      md_pad of the free-space DB record) */
 725 #define mm_psize        mm_dbs[0].md_pad
 726         /** Any persistent environment flags. @ref mdb_env (stored in the
              *      md_flags of the free-space DB record) */
 727 #define mm_flags        mm_dbs[0].md_flags
 728         pgno_t          mm_last_pg;                     /**< last used page in file */
 729         txnid_t         mm_txnid;                       /**< txnid that committed this page */
 730 } MDB_meta;
 731
 732         /** Buffer for a stack-allocated dirty page.
 733          *      The members define size and alignment, and silence type
 734          *      aliasing warnings.  They are not used directly; that could
 735          *      mean incorrectly using several union members in parallel.
 736          */
 737 typedef union MDB_pagebuf {
 738         char            mb_raw[MDB_PAGESIZE];   /**< forces the buffer to be at least one page long */
 739         MDB_page        mb_page;                /**< gives the buffer the alignment of a page header */
 740         struct {
 741                 char            mm_pad[PAGEHDRSZ];      /**< stands in for the page header */
 742                 MDB_meta        mm_meta;                /**< meta content placed PAGEHDRSZ bytes in (plus any padding) */
 743         } mb_metabuf;
 744 } MDB_pagebuf;
 745
 746         /** Auxiliary DB info.
 747          *      The information here is mostly static/read-only. There is
 748          *      only a single copy of this record in the environment.
              *      mdb_cmp() calls md_cmp unconditionally, while mdb_dcmp()
              *      treats a NULL md_dcmp as "not available".
 749          */
 750 typedef struct MDB_dbx {
 751         MDB_val         md_name;                /**< name of the database */
 752         MDB_cmp_func    *md_cmp;        /**< function for comparing keys */
 753         MDB_cmp_func    *md_dcmp;       /**< function for comparing data items; may be NULL */
 754         MDB_rel_func    *md_rel;        /**< user relocate function */
 755         void            *md_relctx;             /**< user-provided context for md_rel */
 756 } MDB_dbx;
 757
 758         /** A database transaction.
 759          *      Every operation requires a transaction handle.
 760          */
 761 struct MDB_txn {
 762         MDB_txn         *mt_parent;             /**< parent of a nested txn */
 763         MDB_txn         *mt_child;              /**< nested txn under this txn */
 764         pgno_t          mt_next_pgno;   /**< next unallocated page; advanced by mdb_page_alloc() when it takes fresh pages */
 765         /** The ID of this transaction. IDs are integers incrementing from 1.
 766          *      Only committed write transactions increment the ID. If a transaction
 767          *      aborts, the ID may be re-used by the next writer.
 768          */
 769         txnid_t         mt_txnid;
 770         MDB_env         *mt_env;                /**< the DB environment */
 771         /** The list of pages that became unused during this transaction.
              *      mdb_page_touch() appends the old page number here when it
              *      replaces a clean page with a dirty copy.
 772          */
 773         MDB_IDL         mt_free_pgs;
             /** Only one member is meaningful at a time. mdb_page_alloc() inserts
              *      into dirty_list; presumably reader is the member used by
              *      #MDB_TXN_RDONLY transactions - confirm at the txn setup code.
              */
 774         union {
 775                 MDB_ID2L        dirty_list;     /**< modified pages */
 776                 MDB_reader      *reader;        /**< this thread's slot in the reader table */
 777         } mt_u;
 778         /** Array of records for each DB known in the environment. */
 779         MDB_dbx         *mt_dbxs;
 780         /** Array of MDB_db records for each known DB */
 781         MDB_db          *mt_dbs;
 782 /** @defgroup mt_dbflag Transaction DB Flags
 783  *      @ingroup internal
 784  * @{
 785  */
 786 #define DB_DIRTY        0x01            /**< DB was written in this txn */
 787 #define DB_STALE        0x02            /**< DB record is older than txnID */
 788 /** @} */
 789         /** Array of cursors for each DB: mt_cursors[dbi] heads a list linked
              *      through MDB_cursor.mc_next */
 790         MDB_cursor      **mt_cursors;
 791         /** Array of flags for each DB (see @ref mt_dbflag) */
 792         unsigned char   *mt_dbflags;
 793         /**     Number of DB records in use. This number only ever increments;
 794          *      we don't decrement it when individual DB handles are closed.
 795          */
 796         MDB_dbi         mt_numdbs;
 797
 798 /** @defgroup mdb_txn   Transaction Flags
 799  *      @ingroup internal
 800  *      @{
 801  */
 802 #define MDB_TXN_RDONLY          0x01            /**< read-only transaction */
 803 #define MDB_TXN_ERROR           0x02            /**< an error has occurred */
 804 /** @} */
 805         unsigned int    mt_flags;               /**< @ref mdb_txn */
 806         /** Tracks which of the two meta pages was used at the start
 807          *      of this transaction.
 808          */
 809         unsigned int    mt_toggle;
 810 };
 811
 812 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
 813  * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
 814  * raise this on a 64 bit machine.
      * This is the length of the mc_pg and mc_ki arrays in #MDB_cursor, i.e. the
      * deepest page stack a cursor can hold.
 815  */
 816 #define CURSOR_STACK             32
 817
 818 struct MDB_xcursor;        /* forward declaration: MDB_cursor holds a pointer to it */
 819
 820         /** Cursors are used for all DB operations.
              *      A cursor records a path of pages in mc_pg[] together with the
              *      index used within each page in mc_ki[]. mdb_cursor_chk()
              *      checks that the node at mc_ki[i] in mc_pg[i] refers to the
              *      page held in mc_pg[i+1].
              */
 821 struct MDB_cursor {
 822         /** Next cursor on this DB in this txn */
 823         MDB_cursor      *mc_next;
 824         /** Original cursor if this is a shadow */
 825         MDB_cursor      *mc_orig;
 826         /** Context used for databases with #MDB_DUPSORT, otherwise NULL */
 827         struct MDB_xcursor      *mc_xcursor;
 828         /** The transaction that owns this cursor */
 829         MDB_txn         *mc_txn;
 830         /** The database handle this cursor operates on */
 831         MDB_dbi         mc_dbi;
 832         /** The database record for this cursor */
 833         MDB_db          *mc_db;
 834         /** The database auxiliary record for this cursor */
 835         MDB_dbx         *mc_dbx;
 836         /** The @ref mt_dbflag for this database */
 837         unsigned char   *mc_dbflag;
 838         unsigned short  mc_snum;        /**< number of pushed pages */
 839         unsigned short  mc_top;         /**< index of top page, normally mc_snum-1 */
 840 /** @defgroup mdb_cursor        Cursor Flags
 841  *      @ingroup internal
 842  *      Cursor state flags.
 843  *      @{
 844  */
 845 #define C_INITIALIZED   0x01    /**< cursor has been initialized and is valid */
 846 #define C_EOF   0x02                    /**< No more data */
 847 #define C_SUB   0x04                    /**< Cursor is a sub-cursor */
 848 #define C_SHADOW        0x08            /**< Cursor is a dup from a parent txn */
 849 #define C_ALLOCD        0x10            /**< Cursor was malloc'd */
 850 #define C_SPLITTING     0x20            /**< Cursor is in page_split */
 851 /** @} */
 852         unsigned int    mc_flags;       /**< @ref mdb_cursor */
 853         MDB_page        *mc_pg[CURSOR_STACK];   /**< stack of pushed pages */
 854         indx_t          mc_ki[CURSOR_STACK];    /**< stack of page indices */
 855 };
 856
 857         /** Context for sorted-dup records.
 858          *      We could have gone to a fully recursive design, with arbitrarily
 859          *      deep nesting of sub-databases. But for now we only handle these
 860          *      levels - main DB, optional sub-DB, sorted-duplicate DB.
              *      The structure bundles a cursor with its own private copies of
              *      the records a cursor normally points at (MDB_db, MDB_dbx and
              *      the per-DB flag byte).
 861          */
 862 typedef struct MDB_xcursor {
 863         /** A sub-cursor for traversing the Dup DB */
 864         MDB_cursor mx_cursor;
 865         /** The database record for this Dup DB */
 866         MDB_db  mx_db;
 867         /**     The auxiliary DB record for this Dup DB */
 868         MDB_dbx mx_dbx;
 869         /** The @ref mt_dbflag for this Dup DB */
 870         unsigned char mx_dbflag;
 871 } MDB_xcursor;
 872
 873         /** A set of pages freed by an earlier transaction.
              *      Records are allocated by mdb_page_alloc() with room for the
              *      whole page list: sizeof(MDB_oldpages) plus the IDL size minus
              *      the one pgno_t already counted in mo_pages[1].
              */
 874 typedef struct MDB_oldpages {
 875         /** Usually we only read one record from the FREEDB at a time, but
 876          *      in case we read more, this will chain them together.
 877          */
 878         struct MDB_oldpages *mo_next;
 879         /**     The ID of the transaction in which these pages were freed. */
 880         txnid_t         mo_txnid;
 881         /** An #MDB_IDL of the pages. Declared with one element; the real
              *      length is set by the allocation size, which subtracts that
              *      one element - keep the two in sync. */
 882         pgno_t          mo_pages[1];    /* dynamic */
 883 } MDB_oldpages;
 884
 885         /** The database environment.
              *      Holds the file handles, the memory maps, the DB tables and
              *      the page-recycling state shared by all transactions.
              */
 886 struct MDB_env {
 887         HANDLE          me_fd;          /**< The main data file */
 888         HANDLE          me_lfd;         /**< The lock file */
 889         HANDLE          me_mfd;                 /**< just for writing the meta pages */
 890         /** Failed to update the meta page. Probably an I/O error. */
 891 #define MDB_FATAL_ERROR 0x80000000U
 892         uint32_t        me_flags;               /**< @ref mdb_env */
 893         unsigned int    me_psize;       /**< size of a page, from #GET_PAGESIZE */
 894         unsigned int    me_maxreaders;  /**< size of the reader table */
 895         MDB_dbi         me_numdbs;              /**< number of DBs opened */
 896         MDB_dbi         me_maxdbs;              /**< size of the DB table */
 897         char            *me_path;               /**< path to the DB files */
 898         char            *me_map;                /**< the memory map of the data file */
 899         MDB_txninfo     *me_txns;               /**< the memory map of the lock file */
 900         MDB_meta        *me_metas[2];   /**< pointers to the two meta pages */
 901         MDB_txn         *me_txn;                /**< current write transaction */
 902         size_t          me_mapsize;             /**< size of the data memory map */
 903         off_t           me_size;                /**< current file size */
 904         pgno_t          me_maxpg;               /**< me_mapsize / me_psize */
 905         txnid_t         me_pgfirst;             /**< ID of first old page record we used; 0 means none used yet */
 906         txnid_t         me_pglast;              /**< ID of last old page record we used */
 907         MDB_dbx         *me_dbxs;               /**< array of static DB info */
 908         uint16_t        *me_dbflags;    /**< array of DB flags */
 909         MDB_oldpages *me_pghead;        /**< list of old page records; mdb_page_alloc() hands out pages from the head record */
 910         MDB_oldpages *me_pgfree;        /**< list of page records to free; exhausted records are parked here when the allocation was for FREE_DBI */
 911         pthread_key_t   me_txkey;       /**< thread-key for readers */
 912         MDB_page        *me_dpages;             /**< list of malloc'd blocks for re-use, chained through mp_next */
 913         /** IDL of pages that became unused in a write txn */
 914         MDB_IDL         me_free_pgs;
 915         /** ID2L of pages that were written during a write txn */
 916         MDB_ID2         me_dirty_list[MDB_IDL_UM_SIZE];
 917 #ifdef _WIN32
 918         HANDLE          me_rmutex;              /* Windows mutexes don't reside in shared mem */
 919         HANDLE          me_wmutex;
 920 #endif
 921 #ifdef USE_POSIX_SEM
 922         sem_t           *me_rmutex;             /* Apple doesn't support shared mutexes */
 923         sem_t           *me_wmutex;
 924 #endif
 925 };
 926         /** max number of pages to commit in one writev() call.
              *      Defaults to 64 and is lowered to IOV_MAX when the platform
              *      defines a smaller IOV_MAX.
              */
 927 #define MDB_COMMIT_PAGES         64
 928 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
 929 #undef MDB_COMMIT_PAGES
 930 #define MDB_COMMIT_PAGES        IOV_MAX
 931 #endif
 932
     /* Forward declarations for the file-local helpers, so that they can be
      * defined further down in any order.
      */
     /* Page allocation and copy-on-write. */
 933 static MDB_page *mdb_page_alloc(MDB_cursor *mc, int num);
 934 static MDB_page *mdb_page_new(MDB_cursor *mc, uint32_t flags, int num);
 935 static int              mdb_page_touch(MDB_cursor *mc);
 936
     /* Page lookup and tree descent. MDB_PS_* are the values accepted in the
      * flags argument of mdb_page_search().
      */
 937 static int  mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp);
 938 static int  mdb_page_search_root(MDB_cursor *mc,
 939                             MDB_val *key, int modify);
 940 #define MDB_PS_MODIFY   1
 941 #define MDB_PS_ROOTONLY 2
 942 static int  mdb_page_search(MDB_cursor *mc,
 943                             MDB_val *key, int flags);
 944 static int      mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
 945
     /* MDB_SPLIT_REPLACE shares its value with MDB_APPENDDUP and is passed in
      * the nflags argument of mdb_page_split().
      */
 946 #define MDB_SPLIT_REPLACE       MDB_APPENDDUP   /**< newkey is not new */
 947 static int      mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
 948                                 pgno_t newpgno, unsigned int nflags);
 949
     /* Meta page handling. */
 950 static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
 951 static int  mdb_env_pick_meta(const MDB_env *env);
 952 static int  mdb_env_write_meta(MDB_txn *txn);
 953
     /* Node-level operations within a page. */
 954 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
 955 static int  mdb_node_add(MDB_cursor *mc, indx_t indx,
 956                             MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
 957 static void mdb_node_del(MDB_page *mp, indx_t indx, int ksize);
 958 static void mdb_node_shrink(MDB_page *mp, indx_t indx);
 959 static int      mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
 960 static int  mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
 961 static size_t   mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
 962 static size_t   mdb_branch_size(MDB_env *env, MDB_val *key);
 963
 964 static int      mdb_rebalance(MDB_cursor *mc);
 965 static int      mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
 966
     /* Cursor stack manipulation and positioning. */
 967 static void     mdb_cursor_pop(MDB_cursor *mc);
 968 static int      mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);
 969
 970 static int      mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf);
 971 static int      mdb_cursor_sibling(MDB_cursor *mc, int move_right);
 972 static int      mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
 973 static int      mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
 974 static int      mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
 975                                 int *exactp);
 976 static int      mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
 977 static int      mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);
 978
     /* Cursor and sub-cursor initialization. */
 979 static void     mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
 980 static void     mdb_xcursor_init0(MDB_cursor *mc);
 981 static void     mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
 982
 983 static int      mdb_drop0(MDB_cursor *mc, int subs);
 984 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
 985
     /* The built-in comparison functions, all sharing the MDB_cmp_func type. */
 986 /** @cond */
 987 static MDB_cmp_func     mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
 988 /** @endcond */
 988 /** @endcond */
 989
 990 #ifdef _WIN32
     /* Windows-only security objects shared by the whole file.
      * NOTE(review): judging by the names, mdb_sec_inited records whether the
      * other two have been set up yet - confirm at the point of use.
      */
 991 static SECURITY_DESCRIPTOR mdb_null_sd;
 992 static SECURITY_ATTRIBUTES mdb_all_sa;
 993 static int mdb_sec_inited;
 994 #endif
 995
 996 /** Return the library version info. */
 997 char *
 998 mdb_version(int *major, int *minor, int *patch)
 999 {
1000         if (major) *major = MDB_VERSION_MAJOR;
1001         if (minor) *minor = MDB_VERSION_MINOR;
1002         if (patch) *patch = MDB_VERSION_PATCH;
1003         return MDB_VERSION_STRING;
1004 }
1005
1006 /** Table of descriptions for MDB @ref errors.
      * mdb_strerror() indexes this table with (err - MDB_KEYEXIST), so the
      * entries must stay in the same order as the error codes, starting at
      * MDB_KEYEXIST and ending at MDB_VERSION_MISMATCH.
      */
1007 static char *const mdb_errstr[] = {
1008         "MDB_KEYEXIST: Key/data pair already exists",
1009         "MDB_NOTFOUND: No matching key/data pair found",
1010         "MDB_PAGE_NOTFOUND: Requested page not found",
1011         "MDB_CORRUPTED: Located page was wrong type",
1012         "MDB_PANIC: Update of meta page failed",
1013         "MDB_VERSION_MISMATCH: Database environment version mismatch"
1014 };
1015
1016 char *
1017 mdb_strerror(int err)
1018 {
1019         if (!err)
1020                 return ("Successful return: 0");
1021
1022         if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH)
1023                 return mdb_errstr[err - MDB_KEYEXIST];
1024
1025         return strerror(err);
1026 }
1027
1028 #if MDB_DEBUG
1029 /** Display a key in hexadecimal and return the address of the result.
1030  * @param[in] key the key to display
1031  * @param[in] buf the buffer to write into. Should always be #DKBUF.
1032  * @return The key in hexadecimal form.
1033  */
1034 char *
1035 mdb_dkey(MDB_val *key, char *buf)
1036 {
1037         char *ptr = buf;
1038         unsigned char *c = key->mv_data;
1039         unsigned int i;
1040         if (key->mv_size > MAXKEYSIZE)
1041                 return "MAXKEYSIZE";
1042         /* may want to make this a dynamic check: if the key is mostly
1043          * printable characters, print it as-is instead of converting to hex.
1044          */
1045 #if 1
1046         buf[0] = '\0';
1047         for (i=0; i<key->mv_size; i++)
1048                 ptr += sprintf(ptr, "%02x", *c++);
1049 #else
1050         sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1051 #endif
1052         return buf;
1053 }
1054
1055 /** Display all the keys in the page. */
1056 static void
1057 mdb_page_keys(MDB_page *mp)
1058 {
1059         MDB_node *node;
1060         unsigned int i, nkeys;
1061         MDB_val key;
1062         DKBUF;
1063
1064         nkeys = NUMKEYS(mp);
1065         fprintf(stderr, "numkeys %d\n", nkeys);
1066         for (i=0; i<nkeys; i++) {
1067                 node = NODEPTR(mp, i);
1068                 key.mv_size = node->mn_ksize;
1069                 key.mv_data = node->mn_data;
1070                 fprintf(stderr, "key %d: %s\n", i, DKEY(&key));
1071         }
1072 }
1073
1074 void
1075 mdb_cursor_chk(MDB_cursor *mc)
1076 {
1077         unsigned int i;
1078         MDB_node *node;
1079         MDB_page *mp;
1080
1081         if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return;
1082         for (i=0; i<mc->mc_top; i++) {
1083                 mp = mc->mc_pg[i];
1084                 node = NODEPTR(mp, mc->mc_ki[i]);
1085                 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1086                         printf("oops!\n");
1087         }
1088         if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1089                 printf("ack!\n");
1090 }
1091 #endif
1092
1093 #if MDB_DEBUG > 2
1094 /** Count all the pages in each DB and in the freelist
1095  *  and make sure it matches the actual number of pages
1096  *  being used.
1097  */
1098 static void mdb_audit(MDB_txn *txn)
1099 {
1100         MDB_cursor mc;
1101         MDB_val key, data;
1102         MDB_ID freecount, count;
1103         MDB_dbi i;
1104         int rc;
1105
1106         freecount = 0;
1107         mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
1108         while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
1109                 freecount += *(MDB_ID *)data.mv_data;
1110
1111         count = 0;
1112         for (i = 0; i<txn->mt_numdbs; i++) {
1113                 MDB_xcursor mx, *mxp;
1114                 mxp = (txn->mt_dbs[i].md_flags & MDB_DUPSORT) ? &mx : NULL;
1115                 mdb_cursor_init(&mc, txn, i, mxp);
1116                 if (txn->mt_dbs[i].md_root == P_INVALID)
1117                         continue;
1118                 count += txn->mt_dbs[i].md_branch_pages +
1119                         txn->mt_dbs[i].md_leaf_pages +
1120                         txn->mt_dbs[i].md_overflow_pages;
1121                 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
1122                         mdb_page_search(&mc, NULL, 0);
1123                         do {
1124                                 unsigned j;
1125                                 MDB_page *mp;
1126                                 mp = mc.mc_pg[mc.mc_top];
1127                                 for (j=0; j<NUMKEYS(mp); j++) {
1128                                         MDB_node *leaf = NODEPTR(mp, j);
1129                                         if (leaf->mn_flags & F_SUBDATA) {
1130                                                 MDB_db db;
1131                                                 memcpy(&db, NODEDATA(leaf), sizeof(db));
1132                                                 count += db.md_branch_pages + db.md_leaf_pages +
1133                                                         db.md_overflow_pages;
1134                                         }
1135                                 }
1136                         }
1137                         while (mdb_cursor_sibling(&mc, 1) == 0);
1138                 }
1139         }
1140         assert(freecount + count + 2 /* metapages */ == txn->mt_next_pgno);
1141 }
1142 #endif
1143
1144 int
1145 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1146 {
1147         return txn->mt_dbxs[dbi].md_cmp(a, b);
1148 }
1149
1150 int
1151 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1152 {
1153         if (txn->mt_dbxs[dbi].md_dcmp)
1154                 return txn->mt_dbxs[dbi].md_dcmp(a, b);
1155         else
1156                 return EINVAL;  /* too bad you can't distinguish this from a valid result */
1157 }
1158
1159 /** Allocate a single page.
1160  * Re-use old malloc'd pages first, otherwise just malloc.
1161  */
1162 static MDB_page *
1163 mdb_page_malloc(MDB_cursor *mc) {
1164         MDB_page *ret;
1165         size_t sz = mc->mc_txn->mt_env->me_psize;
1166         if ((ret = mc->mc_txn->mt_env->me_dpages) != NULL) {
1167                 VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
1168                 VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
1169                 mc->mc_txn->mt_env->me_dpages = ret->mp_next;
1170         } else if ((ret = malloc(sz)) != NULL) {
1171                 VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
1172         }
1173         return ret;
1174 }
1175
1176 /** Allocate pages for writing.
1177  * If there are free pages available from older transactions, they
1178  * will be re-used first. Otherwise a new page will be allocated.
1179  * @param[in] mc cursor A cursor handle identifying the transaction and
1180  *      database for which we are allocating.
1181  * @param[in] num the number of pages to allocate.
1182  * @return Address of the allocated page(s). Requests for multiple pages
1183  *  will always be satisfied by a single contiguous chunk of memory.
1184  */
1185 static MDB_page *
1186 mdb_page_alloc(MDB_cursor *mc, int num)
1187 {
1188         MDB_txn *txn = mc->mc_txn;
1189         MDB_page *np;
1190         pgno_t pgno = P_INVALID;
1191         MDB_ID2 mid;
1192
1193         /* The free list won't have any content at all until txn 2 has
1194          * committed. The pages freed by txn 2 will be unreferenced
1195          * after txn 3 commits, and so will be safe to re-use in txn 4.
1196          */
1197         if (txn->mt_txnid > 3) {
1198
1199                 if (!txn->mt_env->me_pghead &&
1200                         txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
1201                         /* See if there's anything in the free DB */
1202                         MDB_cursor m2;
1203                         MDB_node *leaf;
1204                         MDB_val data;
1205                         txnid_t *kptr, oldest, last;
1206
1207                         mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
1208                         if (!txn->mt_env->me_pgfirst) {
1209                                 mdb_page_search(&m2, NULL, 0);
1210                                 leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0);
1211                                 kptr = (txnid_t *)NODEKEY(leaf);
1212                                 last = *kptr;
1213                         } else {
1214                                 MDB_val key;
1215                                 int rc, exact;
1216 again:
1217                                 exact = 0;
1218                                 last = txn->mt_env->me_pglast + 1;
1219                                 leaf = NULL;
1220                                 key.mv_data = &last;
1221                                 key.mv_size = sizeof(last);
1222                                 rc = mdb_cursor_set(&m2, &key, &data, MDB_SET, &exact);
1223                                 if (rc)
1224                                         goto none;
1225                                 last = *(txnid_t *)key.mv_data;
1226                         }
1227
1228                         {
1229                                 unsigned int i;
1230                                 oldest = txn->mt_txnid - 1;
1231                                 for (i=0; i<txn->mt_env->me_txns->mti_numreaders; i++) {
1232                                         txnid_t mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
1233                                         if (mr && mr < oldest)
1234                                                 oldest = mr;
1235                                 }
1236                         }
1237
1238                         if (oldest > last) {
1239                                 /* It's usable, grab it.
1240                                  */
1241                                 MDB_oldpages *mop;
1242                                 pgno_t *idl;
1243
1244                                 if (!txn->mt_env->me_pgfirst) {
1245                                         mdb_node_read(txn, leaf, &data);
1246                                 }
1247                                 txn->mt_env->me_pglast = last;
1248                                 if (!txn->mt_env->me_pgfirst)
1249                                         txn->mt_env->me_pgfirst = last;
1250                                 idl = (MDB_ID *) data.mv_data;
1251                                 /* We might have a zero-length IDL due to freelist growth
1252                                  * during a prior commit
1253                                  */
1254                                 if (!idl[0]) goto again;
1255                                 mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t));
1256                                 mop->mo_next = txn->mt_env->me_pghead;
1257                                 mop->mo_txnid = last;
1258                                 txn->mt_env->me_pghead = mop;
1259                                 memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));
1260
1261 #if MDB_DEBUG > 1
1262                                 {
1263                                         unsigned int i;
1264                                         DPRINTF("IDL read txn %zu root %zu num %zu",
1265                                                 mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
1266                                         for (i=0; i<idl[0]; i++) {
1267                                                 DPRINTF("IDL %zu", idl[i+1]);
1268                                         }
1269                                 }
1270 #endif
1271                         }
1272                 }
1273 none:
1274                 if (txn->mt_env->me_pghead) {
1275                         MDB_oldpages *mop = txn->mt_env->me_pghead;
1276                         if (num > 1) {
1277                                 /* FIXME: For now, always use fresh pages. We
1278                                  * really ought to search the free list for a
1279                                  * contiguous range.
1280                                  */
1281                                 ;
1282                         } else {
1283                                 /* peel pages off tail, so we only have to truncate the list */
1284                                 pgno = MDB_IDL_LAST(mop->mo_pages);
1285                                 if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
1286                                         mop->mo_pages[2]++;
1287                                         if (mop->mo_pages[2] > mop->mo_pages[1])
1288                                                 mop->mo_pages[0] = 0;
1289                                 } else {
1290                                         mop->mo_pages[0]--;
1291                                 }
1292                                 if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
1293                                         txn->mt_env->me_pghead = mop->mo_next;
1294                                         if (mc->mc_dbi == FREE_DBI) {
1295                                                 mop->mo_next = txn->mt_env->me_pgfree;
1296                                                 txn->mt_env->me_pgfree = mop;
1297                                         } else {
1298                                                 free(mop);
1299                                         }
1300                                 }
1301                         }
1302                 }
1303         }
1304
1305         if (pgno == P_INVALID) {
1306                 /* DB size is maxed out */
1307                 if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg) {
1308                         DPUTS("DB size maxed out");
1309                         return NULL;
1310                 }
1311         }
1312         if (txn->mt_env->me_dpages && num == 1) {
1313                 np = txn->mt_env->me_dpages;
1314                 VGMEMP_ALLOC(txn->mt_env, np, txn->mt_env->me_psize);
1315                 VGMEMP_DEFINED(np, sizeof(np->mp_next));
1316                 txn->mt_env->me_dpages = np->mp_next;
1317         } else {
1318                 size_t sz = txn->mt_env->me_psize * num;
1319                 if ((np = malloc(sz)) == NULL)
1320                         return NULL;
1321                 VGMEMP_ALLOC(txn->mt_env, np, sz);
1322         }
1323         if (pgno == P_INVALID) {
1324                 np->mp_pgno = txn->mt_next_pgno;
1325                 txn->mt_next_pgno += num;
1326         } else {
1327                 np->mp_pgno = pgno;
1328         }
1329         mid.mid = np->mp_pgno;
1330         mid.mptr = np;
1331         mdb_mid2l_insert(txn->mt_u.dirty_list, &mid);
1332
1333         return np;
1334 }
1335
1336 /** Copy a page: avoid copying unused portions of the page.
1337  * @param[in] dst page to copy into
1338  * @param[in] src page to copy from
1339  */
1340 static void
1341 mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
1342 {
1343         dst->mp_flags = src->mp_flags | P_DIRTY;
1344         dst->mp_pages = src->mp_pages;
1345
1346         if (IS_LEAF2(src)) {
1347                 memcpy(dst->mp_ptrs, src->mp_ptrs, psize - PAGEHDRSZ - SIZELEFT(src));
1348         } else {
1349                 unsigned int i, nkeys = NUMKEYS(src);
1350                 for (i=0; i<nkeys; i++)
1351                         dst->mp_ptrs[i] = src->mp_ptrs[i];
1352                 memcpy((char *)dst+src->mp_upper, (char *)src+src->mp_upper,
1353                         psize - src->mp_upper);
1354         }
1355 }
1356
1357 /** Touch a page: make it dirty and re-insert into tree with updated pgno.
1358  * @param[in] mc cursor pointing to the page to be touched
1359  * @return 0 on success, non-zero on failure.
1360  */
1361 static int
1362 mdb_page_touch(MDB_cursor *mc)
1363 {
1364         MDB_page *mp = mc->mc_pg[mc->mc_top];
1365         pgno_t  pgno;
1366
1367         if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
1368                 MDB_page *np;
1369                 if ((np = mdb_page_alloc(mc, 1)) == NULL)
1370                         return ENOMEM;
1371                 DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi, mp->mp_pgno, np->mp_pgno);
1372                 assert(mp->mp_pgno != np->mp_pgno);
1373                 mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
1374                 if (SIZELEFT(mp)) {
1375                         /* If page isn't full, just copy the used portion */
1376                         mdb_page_copy(np, mp, mc->mc_txn->mt_env->me_psize);
1377                 } else {
1378                         pgno = np->mp_pgno;
1379                         memcpy(np, mp, mc->mc_txn->mt_env->me_psize);
1380                         np->mp_pgno = pgno;
1381                         np->mp_flags |= P_DIRTY;
1382                 }
1383                 mp = np;
1384
1385 finish:
1386                 /* Adjust other cursors pointing to mp */
1387                 if (mc->mc_flags & C_SUB) {
1388                         MDB_cursor *m2, *m3;
1389                         MDB_dbi dbi = mc->mc_dbi-1;
1390
1391                         for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
1392                                 if (m2 == mc) continue;
1393                                 m3 = &m2->mc_xcursor->mx_cursor;
1394                                 if (m3->mc_snum < mc->mc_snum) continue;
1395                                 if (m3->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
1396                                         m3->mc_pg[mc->mc_top] = mp;
1397                                 }
1398                         }
1399                 } else {
1400                         MDB_cursor *m2;
1401
1402                         for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
1403                                 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
1404                                 if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
1405                                         m2->mc_pg[mc->mc_top] = mp;
1406                                 }
1407                         }
1408                 }
1409                 mc->mc_pg[mc->mc_top] = mp;
1410                 /** If this page has a parent, update the parent to point to
1411                  * this new page.
1412                  */
1413                 if (mc->mc_top)
1414                         SETPGNO(NODEPTR(mc->mc_pg[mc->mc_top-1], mc->mc_ki[mc->mc_top-1]), mp->mp_pgno);
1415                 else
1416                         mc->mc_db->md_root = mp->mp_pgno;
1417         } else if (mc->mc_txn->mt_parent) {
1418                 MDB_page *np;
1419                 MDB_ID2 mid;
1420                 /* If txn has a parent, make sure the page is in our
1421                  * dirty list.
1422                  */
1423                 if (mc->mc_txn->mt_u.dirty_list[0].mid) {
1424                         unsigned x = mdb_mid2l_search(mc->mc_txn->mt_u.dirty_list, mp->mp_pgno);
1425                         if (x <= mc->mc_txn->mt_u.dirty_list[0].mid &&
1426                                 mc->mc_txn->mt_u.dirty_list[x].mid == mp->mp_pgno) {
1427                                 if (mc->mc_txn->mt_u.dirty_list[x].mptr != mp) {
1428                                         mp = mc->mc_txn->mt_u.dirty_list[x].mptr;
1429                                         mc->mc_pg[mc->mc_top] = mp;
1430                                 }
1431                                 return 0;
1432                         }
1433                 }
1434                 /* No - copy it */
1435                 np = mdb_page_malloc(mc);
1436                 memcpy(np, mp, mc->mc_txn->mt_env->me_psize);
1437                 mid.mid = np->mp_pgno;
1438                 mid.mptr = np;
1439                 mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &mid);
1440                 mp = np;
1441                 goto finish;
1442         }
1443         return 0;
1444 }
1445
1446 int
1447 mdb_env_sync(MDB_env *env, int force)
1448 {
1449         int rc = 0;
1450         if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
1451                 if (MDB_FDATASYNC(env->me_fd))
1452                         rc = ErrCode();
1453         }
1454         return rc;
1455 }
1456
1457 /** Make shadow copies of all of parent txn's cursors */
1458 static int
1459 mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
1460 {
1461         MDB_cursor *mc, *m2;
1462         unsigned int i, j, size;
1463
1464         for (i=0;i<src->mt_numdbs; i++) {
1465                 if (src->mt_cursors[i]) {
1466                         size = sizeof(MDB_cursor);
1467                         if (src->mt_cursors[i]->mc_xcursor)
1468                                 size += sizeof(MDB_xcursor);
1469                         for (m2 = src->mt_cursors[i]; m2; m2=m2->mc_next) {
1470                                 mc = malloc(size);
1471                                 if (!mc)
1472                                         return ENOMEM;
1473                                 mc->mc_orig = m2;
1474                                 mc->mc_txn = dst;
1475                                 mc->mc_dbi = i;
1476                                 mc->mc_db = &dst->mt_dbs[i];
1477                                 mc->mc_dbx = m2->mc_dbx;
1478                                 mc->mc_dbflag = &dst->mt_dbflags[i];
1479                                 mc->mc_snum = m2->mc_snum;
1480                                 mc->mc_top = m2->mc_top;
1481                                 mc->mc_flags = m2->mc_flags | C_SHADOW;
1482                                 for (j=0; j<mc->mc_snum; j++) {
1483                                         mc->mc_pg[j] = m2->mc_pg[j];
1484                                         mc->mc_ki[j] = m2->mc_ki[j];
1485                                 }
1486                                 if (m2->mc_xcursor) {
1487                                         MDB_xcursor *mx, *mx2;
1488                                         mx = (MDB_xcursor *)(mc+1);
1489                                         mc->mc_xcursor = mx;
1490                                         mx2 = m2->mc_xcursor;
1491                                         mx->mx_db = mx2->mx_db;
1492                                         mx->mx_dbx = mx2->mx_dbx;
1493                                         mx->mx_dbflag = mx2->mx_dbflag;
1494                                         mx->mx_cursor.mc_txn = dst;
1495                                         mx->mx_cursor.mc_dbi = mx2->mx_cursor.mc_dbi;
1496                                         mx->mx_cursor.mc_db = &mx->mx_db;
1497                                         mx->mx_cursor.mc_dbx = &mx->mx_dbx;
1498                                         mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
1499                                         mx->mx_cursor.mc_snum = mx2->mx_cursor.mc_snum;
1500                                         mx->mx_cursor.mc_top = mx2->mx_cursor.mc_top;
1501                                         mx->mx_cursor.mc_flags = mx2->mx_cursor.mc_flags | C_SHADOW;
1502                                         for (j=0; j<mx2->mx_cursor.mc_snum; j++) {
1503                                                 mx->mx_cursor.mc_pg[j] = mx2->mx_cursor.mc_pg[j];
1504                                                 mx->mx_cursor.mc_ki[j] = mx2->mx_cursor.mc_ki[j];
1505                                         }
1506                                 } else {
1507                                         mc->mc_xcursor = NULL;
1508                                 }
1509                                 mc->mc_next = dst->mt_cursors[i];
1510                                 dst->mt_cursors[i] = mc;
1511                         }
1512                 }
1513         }
1514         return MDB_SUCCESS;
1515 }
1516
1517 /** Merge shadow cursors back into parent's */
1518 static void
1519 mdb_cursor_merge(MDB_txn *txn)
1520 {
1521         MDB_dbi i;
1522         for (i=0; i<txn->mt_numdbs; i++) {
1523                 if (txn->mt_cursors[i]) {
1524                         MDB_cursor *mc;
1525                         while ((mc = txn->mt_cursors[i])) {
1526                                 txn->mt_cursors[i] = mc->mc_next;
1527                                 if (mc->mc_flags & C_SHADOW) {
1528                                         MDB_cursor *m2 = mc->mc_orig;
1529                                         unsigned int j;
1530                                         m2->mc_snum = mc->mc_snum;
1531                                         m2->mc_top = mc->mc_top;
1532                                         for (j=0; j<mc->mc_snum; j++) {
1533                                                 m2->mc_pg[j] = mc->mc_pg[j];
1534                                                 m2->mc_ki[j] = mc->mc_ki[j];
1535                                         }
1536                                 }
1537                                 if (mc->mc_flags & C_ALLOCD)
1538                                         free(mc);
1539                         }
1540                 }
1541         }
1542 }
1543
/* Forward declaration; the function is defined later in this file. */
static void
mdb_txn_reset0(MDB_txn *txn);
1546
1547 /** Common code for #mdb_txn_begin() and #mdb_txn_renew().
1548  * @param[in] txn the transaction handle to initialize
1549  * @return 0 on success, non-zero on failure. This can only
1550  * fail for read-only transactions, and then only if the
1551  * reader table is full.
1552  */
1553 static int
1554 mdb_txn_renew0(MDB_txn *txn)
1555 {
1556         MDB_env *env = txn->mt_env;
1557         unsigned int i;
1558
1559         /* Setup db info */
1560         txn->mt_numdbs = env->me_numdbs;
1561         txn->mt_dbxs = env->me_dbxs;    /* mostly static anyway */
1562
1563         if (txn->mt_flags & MDB_TXN_RDONLY) {
1564                 MDB_reader *r = pthread_getspecific(env->me_txkey);
1565                 if (!r) {
1566                         pid_t pid = getpid();
1567                         pthread_t tid = pthread_self();
1568
1569                         LOCK_MUTEX_R(env);
1570                         for (i=0; i<env->me_txns->mti_numreaders; i++)
1571                                 if (env->me_txns->mti_readers[i].mr_pid == 0)
1572                                         break;
1573                         if (i == env->me_maxreaders) {
1574                                 UNLOCK_MUTEX_R(env);
1575                                 return ENOMEM;
1576                         }
1577                         env->me_txns->mti_readers[i].mr_pid = pid;
1578                         env->me_txns->mti_readers[i].mr_tid = tid;
1579                         if (i >= env->me_txns->mti_numreaders)
1580                                 env->me_txns->mti_numreaders = i+1;
1581                         UNLOCK_MUTEX_R(env);
1582                         r = &env->me_txns->mti_readers[i];
1583                         pthread_setspecific(env->me_txkey, r);
1584                 }
1585                 txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid;
1586                 txn->mt_toggle = txn->mt_txnid & 1;
1587                 txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
1588                 txn->mt_u.reader = r;
1589         } else {
1590                 LOCK_MUTEX_W(env);
1591
1592                 txn->mt_txnid = env->me_txns->mti_txnid;
1593                 txn->mt_toggle = txn->mt_txnid & 1;
1594                 txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
1595                 txn->mt_txnid++;
1596 #if MDB_DEBUG
1597                 if (txn->mt_txnid == mdb_debug_start)
1598                         mdb_debug = 1;
1599 #endif
1600                 txn->mt_u.dirty_list = env->me_dirty_list;
1601                 txn->mt_u.dirty_list[0].mid = 0;
1602                 txn->mt_free_pgs = env->me_free_pgs;
1603                 txn->mt_free_pgs[0] = 0;
1604                 env->me_txn = txn;
1605         }
1606
1607         /* Copy the DB info and flags */
1608         memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db));
1609         for (i=2; i<txn->mt_numdbs; i++)
1610                 txn->mt_dbs[i].md_flags = env->me_dbflags[i];
1611         txn->mt_dbflags[0] = txn->mt_dbflags[1] = 0;
1612         if (txn->mt_numdbs > 2)
1613                 memset(txn->mt_dbflags+2, DB_STALE, txn->mt_numdbs-2);
1614
1615         return MDB_SUCCESS;
1616 }
1617
1618 int
1619 mdb_txn_renew(MDB_txn *txn)
1620 {
1621         int rc;
1622
1623         if (!txn)
1624                 return EINVAL;
1625
1626         if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
1627                 DPUTS("environment had fatal error, must shutdown!");
1628                 return MDB_PANIC;
1629         }
1630
1631         rc = mdb_txn_renew0(txn);
1632         if (rc == MDB_SUCCESS) {
1633                 DPRINTF("renew txn %zu%c %p on mdbenv %p, root page %zu",
1634                         txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
1635                         (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
1636         }
1637         return rc;
1638 }
1639
1640 int
1641 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
1642 {
1643         MDB_txn *txn;
1644         int rc, size;
1645
1646         if (env->me_flags & MDB_FATAL_ERROR) {
1647                 DPUTS("environment had fatal error, must shutdown!");
1648                 return MDB_PANIC;
1649         }
1650         if (parent) {
1651                 /* parent already has an active child txn */
1652                 if (parent->mt_child) {
1653                         return EINVAL;
1654                 }
1655         }
1656         size = sizeof(MDB_txn) + env->me_maxdbs * (sizeof(MDB_db)+1);
1657         if (!(flags & MDB_RDONLY))
1658                 size += env->me_maxdbs * sizeof(MDB_cursor *);
1659
1660         if ((txn = calloc(1, size)) == NULL) {
1661                 DPRINTF("calloc: %s", strerror(ErrCode()));
1662                 return ENOMEM;
1663         }
1664         txn->mt_dbs = (MDB_db *)(txn+1);
1665         if (flags & MDB_RDONLY) {
1666                 txn->mt_flags |= MDB_TXN_RDONLY;
1667                 txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
1668         } else {
1669                 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
1670                 txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
1671         }
1672         txn->mt_env = env;
1673
1674         if (parent) {
1675                 txn->mt_free_pgs = mdb_midl_alloc();
1676                 if (!txn->mt_free_pgs) {
1677                         free(txn);
1678                         return ENOMEM;
1679                 }
1680                 txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
1681                 if (!txn->mt_u.dirty_list) {
1682                         free(txn->mt_free_pgs);
1683                         free(txn);
1684                         return ENOMEM;
1685                 }
1686                 txn->mt_txnid = parent->mt_txnid;
1687                 txn->mt_toggle = parent->mt_toggle;
1688                 txn->mt_u.dirty_list[0].mid = 0;
1689                 txn->mt_free_pgs[0] = 0;
1690                 txn->mt_next_pgno = parent->mt_next_pgno;
1691                 parent->mt_child = txn;
1692                 txn->mt_parent = parent;
1693                 txn->mt_numdbs = parent->mt_numdbs;
1694                 txn->mt_dbxs = parent->mt_dbxs;
1695                 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
1696                 memcpy(txn->mt_dbflags, parent->mt_dbflags, txn->mt_numdbs);
1697                 mdb_cursor_shadow(parent, txn);
1698                 rc = 0;
1699         } else {
1700                 rc = mdb_txn_renew0(txn);
1701         }
1702         if (rc)
1703                 free(txn);
1704         else {
1705                 *ret = txn;
1706                 DPRINTF("begin txn %zu%c %p on mdbenv %p, root page %zu",
1707                         txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
1708                         (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
1709         }
1710
1711         return rc;
1712 }
1713
1714 /** Common code for #mdb_txn_reset() and #mdb_txn_abort().
1715  * @param[in] txn the transaction handle to reset
1716  */
1717 static void
1718 mdb_txn_reset0(MDB_txn *txn)
1719 {
1720         MDB_env *env = txn->mt_env;
1721
1722         if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
1723                 txn->mt_u.reader->mr_txnid = 0;
1724         } else {
1725                 MDB_oldpages *mop;
1726                 MDB_page *dp;
1727                 unsigned int i;
1728
1729                 /* close(free) all cursors */
1730                 for (i=0; i<txn->mt_numdbs; i++) {
1731                         if (txn->mt_cursors[i]) {
1732                                 MDB_cursor *mc;
1733                                 while ((mc = txn->mt_cursors[i])) {
1734                                         txn->mt_cursors[i] = mc->mc_next;
1735                                         if (mc->mc_flags & C_ALLOCD)
1736                                                 free(mc);
1737                                 }
1738                         }
1739                 }
1740
1741                 /* return all dirty pages to dpage list */
1742                 for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
1743                         dp = txn->mt_u.dirty_list[i].mptr;
1744                         if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
1745                                 dp->mp_next = txn->mt_env->me_dpages;
1746                                 VGMEMP_FREE(txn->mt_env, dp);
1747                                 txn->mt_env->me_dpages = dp;
1748                         } else {
1749                                 /* large pages just get freed directly */
1750                                 VGMEMP_FREE(txn->mt_env, dp);
1751                                 free(dp);
1752                         }
1753                 }
1754
1755                 if (txn->mt_parent) {
1756                         txn->mt_parent->mt_child = NULL;
1757                         free(txn->mt_free_pgs);
1758                         free(txn->mt_u.dirty_list);
1759                         return;
1760                 } else {
1761                         if (mdb_midl_shrink(&txn->mt_free_pgs))
1762                                 env->me_free_pgs = txn->mt_free_pgs;
1763                 }
1764
1765                 while ((mop = txn->mt_env->me_pghead)) {
1766                         txn->mt_env->me_pghead = mop->mo_next;
1767                         free(mop);
1768                 }
1769                 txn->mt_env->me_pgfirst = 0;
1770                 txn->mt_env->me_pglast = 0;
1771
1772                 env->me_txn = NULL;
1773                 /* The writer mutex was locked in mdb_txn_begin. */
1774                 UNLOCK_MUTEX_W(env);
1775         }
1776 }
1777
1778 void
1779 mdb_txn_reset(MDB_txn *txn)
1780 {
1781         if (txn == NULL)
1782                 return;
1783
1784         DPRINTF("reset txn %zu%c %p on mdbenv %p, root page %zu",
1785                 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
1786                 (void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
1787
1788         mdb_txn_reset0(txn);
1789 }
1790
1791 void
1792 mdb_txn_abort(MDB_txn *txn)
1793 {
1794         if (txn == NULL)
1795                 return;
1796
1797         DPRINTF("abort txn %zu%c %p on mdbenv %p, root page %zu",
1798                 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
1799                 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
1800
1801         if (txn->mt_child)
1802                 mdb_txn_abort(txn->mt_child);
1803
1804         mdb_txn_reset0(txn);
1805         free(txn);
1806 }
1807
1808 int
1809 mdb_txn_commit(MDB_txn *txn)
1810 {
1811         int              n, done;
1812         unsigned int i;
1813         ssize_t          rc;
1814         off_t            size;
1815         MDB_page        *dp;
1816         MDB_env *env;
1817         pgno_t  next, freecnt;
1818         MDB_cursor mc;
1819
1820         assert(txn != NULL);
1821         assert(txn->mt_env != NULL);
1822
1823         if (txn->mt_child) {
1824                 mdb_txn_commit(txn->mt_child);
1825                 txn->mt_child = NULL;
1826         }
1827
1828         env = txn->mt_env;
1829
1830         if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
1831                 if (txn->mt_numdbs > env->me_numdbs) {
1832                         /* update the DB flags */
1833                         MDB_dbi i;
1834                         for (i = env->me_numdbs; i<txn->mt_numdbs; i++)
1835                                 env->me_dbflags[i] = txn->mt_dbs[i].md_flags;
1836                         env->me_numdbs = i;
1837                 }
1838                 mdb_txn_abort(txn);
1839                 return MDB_SUCCESS;
1840         }
1841
1842         if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
1843                 DPUTS("error flag is set, can't commit");
1844                 if (txn->mt_parent)
1845                         txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
1846                 mdb_txn_abort(txn);
1847                 return EINVAL;
1848         }
1849
1850         /* Merge (and close) our cursors with parent's */
1851         mdb_cursor_merge(txn);
1852
1853         if (txn->mt_parent) {
1854                 MDB_db *ip, *jp;
1855                 MDB_dbi i;
1856                 unsigned x, y;
1857                 MDB_ID2L dst, src;
1858
1859                 /* Update parent's DB table */
1860                 ip = &txn->mt_parent->mt_dbs[2];
1861                 jp = &txn->mt_dbs[2];
1862                 for (i = 2; i < txn->mt_numdbs; i++) {
1863                         if (ip->md_root != jp->md_root)
1864                                 *ip = *jp;
1865                         ip++; jp++;
1866                 }
1867                 txn->mt_parent->mt_numdbs = txn->mt_numdbs;
1868
1869                 /* Append our free list to parent's */
1870                 mdb_midl_append_list(&txn->mt_parent->mt_free_pgs,
1871                         txn->mt_free_pgs);
1872                 mdb_midl_free(txn->mt_free_pgs);
1873
1874                 /* Merge our dirty list with parent's */
1875                 dst = txn->mt_parent->mt_u.dirty_list;
1876                 src = txn->mt_u.dirty_list;
1877                 x = mdb_mid2l_search(dst, src[1].mid);
1878                 for (y=1; y<=src[0].mid; y++) {
1879                         while (x <= dst[0].mid && dst[x].mid != src[y].mid) x++;
1880                         if (x > dst[0].mid)
1881                                 break;
1882                         free(dst[x].mptr);
1883                         dst[x].mptr = src[y].mptr;
1884                 }
1885                 x = dst[0].mid;
1886                 for (; y<=src[0].mid; y++) {
1887                         if (++x >= MDB_IDL_UM_MAX) {
1888                                 mdb_txn_abort(txn);
1889                                 return ENOMEM;
1890                         }
1891                         dst[x] = src[y];
1892                 }
1893                 dst[0].mid = x;
1894                 free(txn->mt_u.dirty_list);
1895                 txn->mt_parent->mt_child = NULL;
1896                 free(txn);
1897                 return MDB_SUCCESS;
1898         }
1899
1900         if (txn != env->me_txn) {
1901                 DPUTS("attempt to commit unknown transaction");
1902                 mdb_txn_abort(txn);
1903                 return EINVAL;
1904         }
1905
1906         if (!txn->mt_u.dirty_list[0].mid)
1907                 goto done;
1908
1909         DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu",
1910             txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root);
1911
1912         /* Update DB root pointers. Their pages have already been
1913          * touched so this is all in-place and cannot fail.
1914          */
1915         if (txn->mt_numdbs > 2) {
1916                 MDB_dbi i;
1917                 MDB_val data;
1918                 data.mv_size = sizeof(MDB_db);
1919
1920                 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
1921                 for (i = 2; i < txn->mt_numdbs; i++) {
1922                         if (txn->mt_dbflags[i] & DB_DIRTY) {
1923                                 data.mv_data = &txn->mt_dbs[i];
1924                                 mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
1925                         }
1926                 }
1927         }
1928
1929         mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
1930
1931         /* should only be one record now */
1932         if (env->me_pghead) {
1933                 /* make sure first page of freeDB is touched and on freelist */
1934                 mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
1935         }
1936
1937         /* Delete IDLs we used from the free list */
1938         if (env->me_pgfirst) {
1939                 txnid_t cur;
1940                 MDB_val key;
1941                 int exact = 0;
1942
1943                 key.mv_size = sizeof(cur);
1944                 for (cur = env->me_pgfirst; cur <= env->me_pglast; cur++) {
1945                         key.mv_data = &cur;
1946
1947                         mdb_cursor_set(&mc, &key, NULL, MDB_SET, &exact);
1948                         rc = mdb_cursor_del(&mc, 0);
1949                         if (rc) {
1950                                 mdb_txn_abort(txn);
1951                                 return rc;
1952                         }
1953                 }
1954                 env->me_pgfirst = 0;
1955                 env->me_pglast = 0;
1956         }
1957
1958         /* save to free list */
1959 free2:
1960         freecnt = txn->mt_free_pgs[0];
1961         if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
1962                 MDB_val key, data;
1963
1964                 /* make sure last page of freeDB is touched and on freelist */
1965                 key.mv_size = MAXKEYSIZE+1;
1966                 key.mv_data = NULL;
1967                 mdb_page_search(&mc, &key, MDB_PS_MODIFY);
1968
1969                 mdb_midl_sort(txn->mt_free_pgs);
1970 #if MDB_DEBUG > 1
1971                 {
1972                         unsigned int i;
1973                         MDB_IDL idl = txn->mt_free_pgs;
1974                         DPRINTF("IDL write txn %zu root %zu num %zu",
1975                                 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
1976                         for (i=0; i<idl[0]; i++) {
1977                                 DPRINTF("IDL %zu", idl[i+1]);
1978                         }
1979                 }
1980 #endif
1981                 /* write to last page of freeDB */
1982                 key.mv_size = sizeof(pgno_t);
1983                 key.mv_data = &txn->mt_txnid;
1984                 data.mv_data = txn->mt_free_pgs;
1985                 /* The free list can still grow during this call,
1986                  * despite the pre-emptive touches above. So check
1987                  * and make sure the entire thing got written.
1988                  */
1989                 do {
1990                         freecnt = txn->mt_free_pgs[0];
1991                         data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
1992                         rc = mdb_cursor_put(&mc, &key, &data, 0);
1993                         if (rc) {
1994                                 mdb_txn_abort(txn);
1995                                 return rc;
1996                         }
1997                 } while (freecnt != txn->mt_free_pgs[0]);
1998         }
1999         /* should only be one record now */
2000 again:
2001         if (env->me_pghead) {
2002                 MDB_val key, data;
2003                 MDB_oldpages *mop;
2004                 pgno_t orig;
2005                 txnid_t id;
2006
2007                 mop = env->me_pghead;
2008                 id = mop->mo_txnid;
2009                 key.mv_size = sizeof(id);
2010                 key.mv_data = &id;
2011                 data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
2012                 data.mv_data = mop->mo_pages;
2013                 orig = mop->mo_pages[0];
2014                 /* These steps may grow the freelist again
2015                  * due to freed overflow pages...
2016                  */
2017                 mdb_cursor_put(&mc, &key, &data, 0);
2018                 if (mop == env->me_pghead && env->me_pghead->mo_txnid == id) {
2019                         /* could have been used again here */
2020                         if (mop->mo_pages[0] != orig) {
2021                                 data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
2022                                 data.mv_data = mop->mo_pages;
2023                                 id = mop->mo_txnid;
2024                                 mdb_cursor_put(&mc, &key, &data, 0);
2025                         }
2026                         env->me_pghead = NULL;
2027                         free(mop);
2028                 } else {
2029                         /* was completely used up */
2030                         mdb_cursor_del(&mc, 0);
2031                         if (env->me_pghead)
2032                                 goto again;
2033                 }
2034                 env->me_pgfirst = 0;
2035                 env->me_pglast = 0;
2036         }
2037
2038         while (env->me_pgfree) {
2039                 MDB_oldpages *mop = env->me_pgfree;
2040                 env->me_pgfree = mop->mo_next;
2041                 free(mop);;
2042         }
2043
2044         /* Check for growth of freelist again */
2045         if (freecnt != txn->mt_free_pgs[0])
2046                 goto free2;
2047
2048         if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
2049                 if (mdb_midl_shrink(&txn->mt_free_pgs))
2050                         env->me_free_pgs = txn->mt_free_pgs;
2051         }
2052
2053 #if MDB_DEBUG > 2
2054         mdb_audit(txn);
2055 #endif
2056
2057         /* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
2058          */
2059         next = 0;
2060         i = 1;
2061         do {
2062 #ifdef _WIN32
2063                 /* Windows actually supports scatter/gather I/O, but only on
2064                  * unbuffered file handles. Since we're relying on the OS page
2065                  * cache for all our data, that's self-defeating. So we just
2066                  * write pages one at a time. We use the ov structure to set
2067                  * the write offset, to at least save the overhead of a Seek
2068                  * system call.
2069                  */
2070                 OVERLAPPED ov;
2071                 memset(&ov, 0, sizeof(ov));
2072                 for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
2073                         size_t wsize;
2074                         dp = txn->mt_u.dirty_list[i].mptr;
2075                         DPRINTF("committing page %zu", dp->mp_pgno);
2076                         size = dp->mp_pgno * env->me_psize;
2077                         ov.Offset = size & 0xffffffff;
2078                         ov.OffsetHigh = size >> 16;
2079                         ov.OffsetHigh >>= 16;
2080                         /* clear dirty flag */
2081                         dp->mp_flags &= ~P_DIRTY;
2082                         wsize = env->me_psize;
2083                         if (IS_OVERFLOW(dp)) wsize *= dp->mp_pages;
2084                         rc = WriteFile(env->me_fd, dp, wsize, NULL, &ov);
2085                         if (!rc) {
2086                                 n = ErrCode();
2087                                 DPRINTF("WriteFile: %d", n);
2088                                 mdb_txn_abort(txn);
2089                                 return n;
2090                         }
2091                 }
2092                 done = 1;
2093 #else
2094                 struct iovec     iov[MDB_COMMIT_PAGES];
2095                 n = 0;
2096                 done = 1;
2097                 size = 0;
2098                 for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
2099                         dp = txn->mt_u.dirty_list[i].mptr;
2100                         if (dp->mp_pgno != next) {
2101                                 if (n) {
2102                                         rc = writev(env->me_fd, iov, n);
2103                                         if (rc != size) {
2104                                                 n = ErrCode();
2105                                                 if (rc > 0)
2106                                                         DPUTS("short write, filesystem full?");
2107                                                 else
2108                                                         DPRINTF("writev: %s", strerror(n));
2109                                                 mdb_txn_abort(txn);
2110                                                 return n;
2111                                         }
2112                                         n = 0;
2113                                         size = 0;
2114                                 }
2115                                 lseek(env->me_fd, dp->mp_pgno * env->me_psize, SEEK_SET);
2116                                 next = dp->mp_pgno;
2117                         }
2118                         DPRINTF("committing page %zu", dp->mp_pgno);
2119                         iov[n].iov_len = env->me_psize;
2120                         if (IS_OVERFLOW(dp)) iov[n].iov_len *= dp->mp_pages;
2121                         iov[n].iov_base = (char *)dp;
2122                         size += iov[n].iov_len;
2123                         next = dp->mp_pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1);
2124                         /* clear dirty flag */
2125                         dp->mp_flags &= ~P_DIRTY;
2126                         if (++n >= MDB_COMMIT_PAGES) {
2127                                 done = 0;
2128                                 i++;
2129                                 break;
2130                         }
2131                 }
2132
2133                 if (n == 0)
2134                         break;
2135
2136                 rc = writev(env->me_fd, iov, n);
2137                 if (rc != size) {
2138                         n = ErrCode();
2139                         if (rc > 0)
2140                                 DPUTS("short write, filesystem full?");
2141                         else
2142                                 DPRINTF("writev: %s", strerror(n));
2143                         mdb_txn_abort(txn);
2144                         return n;
2145                 }
2146 #endif
2147         } while (!done);
2148
2149         /* Drop the dirty pages.
2150          */
2151         for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
2152                 dp = txn->mt_u.dirty_list[i].mptr;
2153                 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
2154                         dp->mp_next = txn->mt_env->me_dpages;
2155                         VGMEMP_FREE(txn->mt_env, dp);
2156                         txn->mt_env->me_dpages = dp;
2157                 } else {
2158                         VGMEMP_FREE(txn->mt_env, dp);
2159                         free(dp);
2160                 }
2161                 txn->mt_u.dirty_list[i].mid = 0;
2162         }
2163         txn->mt_u.dirty_list[0].mid = 0;
2164
2165         if ((n = mdb_env_sync(env, 0)) != 0 ||
2166             (n = mdb_env_write_meta(txn)) != MDB_SUCCESS) {
2167                 mdb_txn_abort(txn);
2168                 return n;
2169         }
2170
2171 done:
2172         env->me_txn = NULL;
2173         if (txn->mt_numdbs > env->me_numdbs) {
2174                 /* update the DB flags */
2175                 MDB_dbi i;
2176                 for (i = env->me_numdbs; i<txn->mt_numdbs; i++)
2177                         env->me_dbflags[i] = txn->mt_dbs[i].md_flags;
2178                 env->me_numdbs = i;
2179         }
2180
2181         UNLOCK_MUTEX_W(env);
2182         free(txn);
2183
2184         return MDB_SUCCESS;
2185 }
2186
2187 /** Read the environment parameters of a DB environment before
2188  * mapping it into memory.
2189  * @param[in] env the environment handle
2190  * @param[out] meta address of where to store the meta information
2191  * @return 0 on success, non-zero on failure.
2192  */
2193 static int
2194 mdb_env_read_header(MDB_env *env, MDB_meta *meta)
2195 {
2196         MDB_pagebuf     pbuf;
2197         MDB_page        *p;
2198         MDB_meta        *m;
2199         int              rc, err;
2200
2201         /* We don't know the page size yet, so use a minimum value.
2202          */
2203
2204 #ifdef _WIN32
2205         if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
2206 #else
2207         if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0)
2208 #endif
2209         {
2210                 return ENOENT;
2211         }
2212         else if (rc != MDB_PAGESIZE) {
2213                 err = ErrCode();
2214                 if (rc > 0)
2215                         err = EINVAL;
2216                 DPRINTF("read: %s", strerror(err));
2217                 return err;
2218         }
2219
2220         p = (MDB_page *)&pbuf;
2221
2222         if (!F_ISSET(p->mp_flags, P_META)) {
2223                 DPRINTF("page %zu not a meta page", p->mp_pgno);
2224                 return EINVAL;
2225         }
2226
2227         m = METADATA(p);
2228         if (m->mm_magic != MDB_MAGIC) {
2229                 DPUTS("meta has invalid magic");
2230                 return EINVAL;
2231         }
2232
2233         if (m->mm_version != MDB_VERSION) {
2234                 DPRINTF("database is version %u, expected version %u",
2235                     m->mm_version, MDB_VERSION);
2236                 return MDB_VERSION_MISMATCH;
2237         }
2238
2239         memcpy(meta, m, sizeof(*m));
2240         return 0;
2241 }
2242
2243 /** Write the environment parameters of a freshly created DB environment.
2244  * @param[in] env the environment handle
2245  * @param[out] meta address of where to store the meta information
2246  * @return 0 on success, non-zero on failure.
2247  */
2248 static int
2249 mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
2250 {
2251         MDB_page *p, *q;
2252         MDB_meta *m;
2253         int rc;
2254         unsigned int     psize;
2255
2256         DPUTS("writing new meta page");
2257
2258         GET_PAGESIZE(psize);
2259
2260         meta->mm_magic = MDB_MAGIC;
2261         meta->mm_version = MDB_VERSION;
2262         meta->mm_psize = psize;
2263         meta->mm_last_pg = 1;
2264         meta->mm_flags = env->me_flags & 0xffff;
2265         meta->mm_flags |= MDB_INTEGERKEY;
2266         meta->mm_dbs[0].md_root = P_INVALID;
2267         meta->mm_dbs[1].md_root = P_INVALID;
2268
2269         p = calloc(2, psize);
2270         p->mp_pgno = 0;
2271         p->mp_flags = P_META;
2272
2273         m = METADATA(p);
2274         memcpy(m, meta, sizeof(*meta));
2275
2276         q = (MDB_page *)((char *)p + psize);
2277
2278         q->mp_pgno = 1;
2279         q->mp_flags = P_META;
2280
2281         m = METADATA(q);
2282         memcpy(m, meta, sizeof(*meta));
2283
2284 #ifdef _WIN32
2285         {
2286                 DWORD len;
2287                 rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL);
2288                 rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode();
2289         }
2290 #else
2291         rc = write(env->me_fd, p, psize * 2);
2292         rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode();
2293 #endif
2294         free(p);
2295         return rc;
2296 }
2297
2298 /** Update the environment info to commit a transaction.
2299  * @param[in] txn the transaction that's being committed
2300  * @return 0 on success, non-zero on failure.
2301  */
2302 static int
2303 mdb_env_write_meta(MDB_txn *txn)
2304 {
2305         MDB_env *env;
2306         MDB_meta        meta, metab;
2307         off_t off;
2308         int rc, len, toggle;
2309         char *ptr;
2310 #ifdef _WIN32
2311         OVERLAPPED ov;
2312 #endif
2313
2314         assert(txn != NULL);
2315         assert(txn->mt_env != NULL);
2316
2317         toggle = !txn->mt_toggle;
2318         DPRINTF("writing meta page %d for root page %zu",
2319                 toggle, txn->mt_dbs[MAIN_DBI].md_root);
2320
2321         env = txn->mt_env;
2322
2323         metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
2324         metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
2325
2326         ptr = (char *)&meta;
2327         off = offsetof(MDB_meta, mm_dbs[0].md_depth);
2328         len = sizeof(MDB_meta) - off;
2329
2330         ptr += off;
2331         meta.mm_dbs[0] = txn->mt_dbs[0];
2332         meta.mm_dbs[1] = txn->mt_dbs[1];
2333         meta.mm_last_pg = txn->mt_next_pgno - 1;
2334         meta.mm_txnid = txn->mt_txnid;
2335
2336         if (toggle)
2337                 off += env->me_psize;
2338         off += PAGEHDRSZ;
2339
2340         /* Write to the SYNC fd */
2341 #ifdef _WIN32
2342         {
2343                 memset(&ov, 0, sizeof(ov));
2344                 ov.Offset = off;
2345                 WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov);
2346         }
2347 #else
2348         rc = pwrite(env->me_mfd, ptr, len, off);
2349 #endif
2350         if (rc != len) {
2351                 int r2;
2352                 rc = ErrCode();
2353                 DPUTS("write failed, disk error?");
2354                 /* On a failure, the pagecache still contains the new data.
2355                  * Write some old data back, to prevent it from being used.
2356                  * Use the non-SYNC fd; we know it will fail anyway.
2357                  */
2358                 meta.mm_last_pg = metab.mm_last_pg;
2359                 meta.mm_txnid = metab.mm_txnid;
2360 #ifdef _WIN32
2361                 WriteFile(env->me_fd, ptr, len, NULL, &ov);
2362 #else
2363                 r2 = pwrite(env->me_fd, ptr, len, off);
2364 #endif
2365                 env->me_flags |= MDB_FATAL_ERROR;
2366                 return rc;
2367         }
2368         /* Memory ordering issues are irrelevant; since the entire writer
2369          * is wrapped by wmutex, all of these changes will become visible
2370          * after the wmutex is unlocked. Since the DB is multi-version,
2371          * readers will get consistent data regardless of how fresh or
2372          * how stale their view of these values is.
2373          */
2374         txn->mt_env->me_txns->mti_txnid = txn->mt_txnid;
2375
2376         return MDB_SUCCESS;
2377 }
2378
2379 /** Check both meta pages to see which one is newer.
2380  * @param[in] env the environment handle
2381  * @return meta toggle (0 or 1).
2382  */
2383 static int
2384 mdb_env_pick_meta(const MDB_env *env)
2385 {
2386         return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid);
2387 }
2388
2389 int
2390 mdb_env_create(MDB_env **env)
2391 {
2392         MDB_env *e;
2393
2394         e = calloc(1, sizeof(MDB_env));
2395         if (!e)
2396                 return ENOMEM;
2397
2398         e->me_free_pgs = mdb_midl_alloc();
2399         if (!e->me_free_pgs) {
2400                 free(e);
2401                 return ENOMEM;
2402         }
2403         e->me_maxreaders = DEFAULT_READERS;
2404         e->me_maxdbs = 2;
2405         e->me_fd = INVALID_HANDLE_VALUE;
2406         e->me_lfd = INVALID_HANDLE_VALUE;
2407         e->me_mfd = INVALID_HANDLE_VALUE;
2408         VGMEMP_CREATE(e,0,0);
2409         *env = e;
2410         return MDB_SUCCESS;
2411 }
2412
2413 int
2414 mdb_env_set_mapsize(MDB_env *env, size_t size)
2415 {
2416         if (env->me_map)
2417                 return EINVAL;
2418         env->me_mapsize = size;
2419         if (env->me_psize)
2420                 env->me_maxpg = env->me_mapsize / env->me_psize;
2421         return MDB_SUCCESS;
2422 }
2423
2424 int
2425 mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
2426 {
2427         if (env->me_map)
2428                 return EINVAL;
2429         env->me_maxdbs = dbs;
2430         return MDB_SUCCESS;
2431 }
2432
2433 int
2434 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
2435 {
2436         if (env->me_map || readers < 1)
2437                 return EINVAL;
2438         env->me_maxreaders = readers;
2439         return MDB_SUCCESS;
2440 }
2441
2442 int
2443 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
2444 {
2445         if (!env || !readers)
2446                 return EINVAL;
2447         *readers = env->me_maxreaders;
2448         return MDB_SUCCESS;
2449 }
2450
2451 /** Further setup required for opening an MDB environment
2452  */
2453 static int
2454 mdb_env_open2(MDB_env *env, unsigned int flags)
2455 {
2456         int i, newenv = 0;
2457         MDB_meta meta;
2458         MDB_page *p;
2459
2460         env->me_flags = flags;
2461
2462         memset(&meta, 0, sizeof(meta));
2463
2464         if ((i = mdb_env_read_header(env, &meta)) != 0) {
2465                 if (i != ENOENT)
2466                         return i;
2467                 DPUTS("new mdbenv");
2468                 newenv = 1;
2469         }
2470
2471         if (!env->me_mapsize) {
2472                 env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
2473         }
2474
2475 #ifdef _WIN32
2476         {
2477                 HANDLE mh;
2478                 LONG sizelo, sizehi;
2479                 sizelo = env->me_mapsize & 0xffffffff;
2480                 sizehi = env->me_mapsize >> 16;         /* pointless on WIN32, only needed on W64 */
2481                 sizehi >>= 16;
2482                 /* Windows won't create mappings for zero length files.
2483                  * Just allocate the maxsize right now.
2484                  */
2485                 if (newenv) {
2486                         SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0);
2487                         if (!SetEndOfFile(env->me_fd))
2488                                 return ErrCode();
2489                         SetFilePointer(env->me_fd, 0, NULL, 0);
2490                 }
2491                 mh = CreateFileMapping(env->me_fd, NULL, PAGE_READONLY,
2492                         sizehi, sizelo, NULL);
2493                 if (!mh)
2494                         return ErrCode();
2495                 env->me_map = MapViewOfFileEx(mh, FILE_MAP_READ, 0, 0, env->me_mapsize,
2496                         meta.mm_address);
2497                 CloseHandle(mh);
2498                 if (!env->me_map)
2499                         return ErrCode();
2500         }
2501 #else
2502         i = MAP_SHARED;
2503         if (meta.mm_address && (flags & MDB_FIXEDMAP))
2504                 i |= MAP_FIXED;
2505         env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i,
2506                 env->me_fd, 0);
2507         if (env->me_map == MAP_FAILED) {
2508                 env->me_map = NULL;
2509                 return ErrCode();
2510         }
2511 #endif
2512
2513         if (newenv) {
2514                 meta.mm_mapsize = env->me_mapsize;
2515                 if (flags & MDB_FIXEDMAP)
2516                         meta.mm_address = env->me_map;
2517                 i = mdb_env_init_meta(env, &meta);
2518                 if (i != MDB_SUCCESS) {
2519                         munmap(env->me_map, env->me_mapsize);
2520                         return i;
2521                 }
2522         }
2523         env->me_psize = meta.mm_psize;
2524
2525         env->me_maxpg = env->me_mapsize / env->me_psize;
2526
2527         p = (MDB_page *)env->me_map;
2528         env->me_metas[0] = METADATA(p);
2529         env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize);
2530
2531 #if MDB_DEBUG
2532         {
2533                 int toggle = mdb_env_pick_meta(env);
2534                 MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI];
2535
2536                 DPRINTF("opened database version %u, pagesize %u",
2537                         env->me_metas[0]->mm_version, env->me_psize);
2538                 DPRINTF("using meta page %d",  toggle);
2539                 DPRINTF("depth: %u",           db->md_depth);
2540                 DPRINTF("entries: %zu",        db->md_entries);
2541                 DPRINTF("branch pages: %zu",   db->md_branch_pages);
2542                 DPRINTF("leaf pages: %zu",     db->md_leaf_pages);
2543                 DPRINTF("overflow pages: %zu", db->md_overflow_pages);
2544                 DPRINTF("root: %zu",           db->md_root);
2545         }
2546 #endif
2547
2548         return MDB_SUCCESS;
2549 }
2550
2551
2552 /** Release a reader thread's slot in the reader lock table.
2553  *      This function is called automatically when a thread exits.
2554  * @param[in] ptr This points to the slot in the reader lock table.
2555  */
2556 static void
2557 mdb_env_reader_dest(void *ptr)
2558 {
2559         MDB_reader *reader = ptr;
2560
2561         reader->mr_txnid = 0;
2562         reader->mr_pid = 0;
2563         reader->mr_tid = 0;
2564 }
2565
#ifdef _WIN32
/** Junk for arranging thread-specific callbacks on Windows. This is
 *	necessarily platform and compiler-specific. Windows supports up
 *	to 1088 keys. Let's assume nobody opens more than 64 environments
 *	in a single process, for now. They can override this if needed.
 */
#ifndef MAX_TLS_KEYS
#define MAX_TLS_KEYS	64
#endif
static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
static int mdb_tls_nkeys;

/** TLS callback: when a thread detaches, release the reader slot it
 *	holds for each registered key.
 * @param[in] module unused
 * @param[in] reason one of the DLL_* notification codes
 * @param[in] ptr unused
 */
static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
{
	int i;
	switch(reason) {
	case DLL_PROCESS_ATTACH: break;
	case DLL_THREAD_ATTACH: break;
	case DLL_THREAD_DETACH:
		for (i=0; i<mdb_tls_nkeys; i++) {
			MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
			/* A thread that never stored a reader slot under this
			 * key has nothing to release; skip it instead of
			 * dereferencing a null pointer.
			 */
			if (r) {
				mdb_env_reader_dest(r);
			}
		}
		break;
	case DLL_PROCESS_DETACH: break;
	}
}
#ifdef __GNUC__
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#else
PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#endif
#else
#ifdef _WIN64
/* Force some symbol references.
 *	_tls_used forces the linker to create the TLS directory if not already done
 *	mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
 */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
#pragma const_seg(".CRT$XLB")
extern const PIMAGE_TLS_CALLBACK mdb_tls_callback;
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma const_seg()
#else	/* WIN32 */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma data_seg()
#endif	/* WIN 32/64 */
#endif	/* !__GNUC__ */
#endif
2620
2621 /** Downgrade the exclusive lock on the region back to shared */
2622 static void
2623 mdb_env_share_locks(MDB_env *env)
2624 {
2625         int toggle = mdb_env_pick_meta(env);
2626
2627         env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
2628
2629 #ifdef _WIN32
2630         {
2631                 OVERLAPPED ov;
2632                 /* First acquire a shared lock. The Unlock will
2633                  * then release the existing exclusive lock.
2634                  */
2635                 memset(&ov, 0, sizeof(ov));
2636                 LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov);
2637                 UnlockFile(env->me_lfd, 0, 0, 1, 0);
2638         }
2639 #else
2640         {
2641                 struct flock lock_info;
2642                 /* The shared lock replaces the existing lock */
2643                 memset((void *)&lock_info, 0, sizeof(lock_info));
2644                 lock_info.l_type = F_RDLCK;
2645                 lock_info.l_whence = SEEK_SET;
2646                 lock_info.l_start = 0;
2647                 lock_info.l_len = 1;
2648                 fcntl(env->me_lfd, F_SETLK, &lock_info);
2649         }
2650 #endif
2651 }
2652
2653 static int
2654 mdb_env_excl_lock(MDB_env *env, int *excl)
2655 {
2656 #ifdef _WIN32
2657         if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
2658                 *excl = 1;
2659         } else {
2660                 OVERLAPPED ov;
2661                 memset(&ov, 0, sizeof(ov));
2662                 if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
2663                         return ErrCode();
2664                 }
2665         }
2666 #else
2667         struct flock lock_info;
2668         memset((void *)&lock_info, 0, sizeof(lock_info));
2669         lock_info.l_type = F_WRLCK;
2670         lock_info.l_whence = SEEK_SET;
2671         lock_info.l_start = 0;
2672         lock_info.l_len = 1;
2673         if (!fcntl(env->me_lfd, F_SETLK, &lock_info)) {
2674                 *excl = 1;
2675         } else {
2676                 lock_info.l_type = F_RDLCK;
2677                 if (fcntl(env->me_lfd, F_SETLKW, &lock_info)) {
2678                         return ErrCode();
2679                 }
2680         }
2681 #endif
2682         return 0;
2683 }
2684
2685 #if defined(_WIN32) || defined(USE_POSIX_SEM)
2686 /*
2687  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
2688  *
2689  * @(#) $Revision: 5.1 $
2690  * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
2691  * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
2692  *
2693  *        http://www.isthe.com/chongo/tech/comp/fnv/index.html
2694  *
2695  ***
2696  *
2697  * Please do not copyright this code.  This code is in the public domain.
2698  *
2699  * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
2700  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
2701  * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
2702  * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
2703  * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
2704  * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
2705  * PERFORMANCE OF THIS SOFTWARE.
2706  *
2707  * By:
2708  *      chongo <Landon Curt Noll> /\oo/\
2709  *        http://www.isthe.com/chongo/
2710  *
2711  * Share and Enjoy!     :-)
2712  */
2713
2714 typedef unsigned long long      mdb_hash_t;
2715 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
2716
2717 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
2718  * @param[in] str string to hash
2719  * @param[in] hval      initial value for hash
2720  * @return 64 bit hash
2721  *
2722  * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
2723  *       hval arg on the first call.
2724  */
2725 static mdb_hash_t
2726 mdb_hash_val(MDB_val *val, mdb_hash_t hval)
2727 {
2728         unsigned char *s = (unsigned char *)val->mv_data;       /* unsigned string */
2729         unsigned char *end = s + val->mv_size;
2730         /*
2731          * FNV-1a hash each octet of the string
2732          */
2733         while (s < end) {
2734                 /* xor the bottom with the current octet */
2735                 hval ^= (mdb_hash_t)*s++;
2736
2737                 /* multiply by the 64 bit FNV magic prime mod 2^64 */
2738                 hval += (hval << 1) + (hval << 4) + (hval << 5) +
2739                         (hval << 7) + (hval << 8) + (hval << 40);
2740         }
2741         /* return our new hash value */
2742         return hval;
2743 }
2744
2745 /** Hash the string and output the hash in hex.
2746  * @param[in] str string to hash
2747  * @param[out] hexbuf an array of 17 chars to hold the hash
2748  */
2749 static void
2750 mdb_hash_hex(MDB_val *val, char *hexbuf)
2751 {
2752         int i;
2753         mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
2754         for (i=0; i<8; i++) {
2755                 hexbuf += sprintf(hexbuf, "%02x", (unsigned int)h & 0xff);
2756                 h >>= 8;
2757         }
2758 }
2759 #endif
2760
2761 /** Open and/or initialize the lock region for the environment.
2762  * @param[in] env The MDB environment.
2763  * @param[in] lpath The pathname of the file used for the lock region.
2764  * @param[in] mode The Unix permissions for the file, if we create it.
2765  * @param[out] excl Set to true if we got an exclusive lock on the region.
2766  * @return 0 on success, non-zero on failure.
2767  */
2768 static int
2769 mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
2770 {
2771         int rc;
2772         off_t size, rsize;
2773
2774         *excl = 0;
2775
2776 #ifdef _WIN32
2777         if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
2778                 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
2779                 FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
2780                 rc = ErrCode();
2781                 return rc;
2782         }
2783         /* Try to get exclusive lock. If we succeed, then
2784          * nobody is using the lock region and we should initialize it.
2785          */
2786         if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
2787         size = GetFileSize(env->me_lfd, NULL);
2788
2789 #else
2790 #if !(O_CLOEXEC)
2791         {
2792                 int fdflags;
2793                 if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1)
2794                         return ErrCode();
2795                 /* Lose record locks when exec*() */
2796                 if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
2797                         fcntl(env->me_lfd, F_SETFD, fdflags);
2798         }
2799 #else /* O_CLOEXEC on Linux: Open file and set FD_CLOEXEC atomically */
2800         if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode)) == -1)
2801                 return ErrCode();
2802 #endif
2803
2804         /* Try to get exclusive lock. If we succeed, then
2805          * nobody is using the lock region and we should initialize it.
2806          */
2807         if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
2808
2809         size = lseek(env->me_lfd, 0, SEEK_END);
2810 #endif
2811         rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
2812         if (size < rsize && *excl) {
2813 #ifdef _WIN32
2814                 SetFilePointer(env->me_lfd, rsize, NULL, 0);
2815                 if (!SetEndOfFile(env->me_lfd)) {
2816                         rc = ErrCode();
2817                         goto fail;
2818                 }
2819 #else
2820                 if (ftruncate(env->me_lfd, rsize) != 0) {
2821                         rc = ErrCode();
2822                         goto fail;
2823                 }
2824 #endif
2825         } else {
2826                 rsize = size;
2827                 size = rsize - sizeof(MDB_txninfo);
2828                 env->me_maxreaders = size/sizeof(MDB_reader) + 1;
2829         }
2830         {
2831 #ifdef _WIN32
2832                 HANDLE mh;
2833                 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
2834                         0, 0, NULL);
2835                 if (!mh) {
2836                         rc = ErrCode();
2837                         goto fail;
2838                 }
2839                 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
2840                 CloseHandle(mh);
2841                 if (!env->me_txns) {
2842                         rc = ErrCode();
2843                         goto fail;
2844                 }
2845 #else
2846                 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
2847                         env->me_lfd, 0);
2848                 if (m == MAP_FAILED) {
2849                         env->me_txns = NULL;
2850                         rc = ErrCode();
2851                         goto fail;
2852                 }
2853                 env->me_txns = m;
2854 #endif
2855         }
2856         if (*excl) {
2857 #ifdef _WIN32
2858                 BY_HANDLE_FILE_INFORMATION stbuf;
2859                 struct {
2860                         DWORD volume;
2861                         DWORD nhigh;
2862                         DWORD nlow;
2863                 } idbuf;
2864                 MDB_val val;
2865                 char hexbuf[17];
2866
2867                 if (!mdb_sec_inited) {
2868                         InitializeSecurityDescriptor(&mdb_null_sd,
2869                                 SECURITY_DESCRIPTOR_REVISION);
2870                         SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
2871                         mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
2872                         mdb_all_sa.bInheritHandle = FALSE;
2873                         mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
2874                         mdb_sec_inited = 1;
2875                 }
2876                 GetFileInformationByHandle(env->me_lfd, &stbuf);
2877                 idbuf.volume = stbuf.dwVolumeSerialNumber;
2878                 idbuf.nhigh  = stbuf.nFileIndexHigh;
2879                 idbuf.nlow   = stbuf.nFileIndexLow;
2880                 val.mv_data = &idbuf;
2881                 val.mv_size = sizeof(idbuf);
2882                 mdb_hash_hex(&val, hexbuf);
2883                 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", hexbuf);
2884                 env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
2885                 if (!env->me_rmutex) {
2886                         rc = ErrCode();
2887                         goto fail;
2888                 }
2889                 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", hexbuf);
2890                 env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
2891                 if (!env->me_wmutex) {
2892                         rc = ErrCode();
2893                         goto fail;
2894                 }
2895 #else   /* _WIN32 */
2896 #ifdef USE_POSIX_SEM
2897                 struct stat stbuf;
2898                 struct {
2899                         dev_t dev;
2900                         ino_t ino;
2901                 } idbuf;
2902                 MDB_val val;
2903                 char hexbuf[17];
2904
2905                 fstat(env->me_lfd, &stbuf);
2906                 idbuf.dev = stbuf.st_dev;
2907                 idbuf.ino = stbuf.st_ino;
2908                 val.mv_data = &idbuf;
2909                 val.mv_size = sizeof(idbuf);
2910                 mdb_hash_hex(&val, hexbuf);
2911                 sprintf(env->me_txns->mti_rmname, "/MDBr%s", hexbuf);
2912                 if (sem_unlink(env->me_txns->mti_rmname)) {
2913                         rc = ErrCode();
2914                         if (rc != ENOENT && rc != EINVAL)
2915                                 goto fail;
2916                 }
2917                 env->me_rmutex = sem_open(env->me_txns->mti_rmname, O_CREAT, mode, 1);
2918                 if (!env->me_rmutex) {
2919                         rc = ErrCode();
2920                         goto fail;
2921                 }
2922                 sprintf(env->me_txns->mti_wmname, "/MDBw%s", hexbuf);
2923                 if (sem_unlink(env->me_txns->mti_wmname)) {
2924                         rc = ErrCode();
2925                         if (rc != ENOENT && rc != EINVAL)
2926                                 goto fail;
2927                 }
2928                 env->me_wmutex = sem_open(env->me_txns->mti_wmname, O_CREAT, mode, 1);
2929                 if (!env->me_wmutex) {
2930                         rc = ErrCode();
2931                         goto fail;
2932                 }
2933 #else   /* USE_POSIX_SEM */
2934                 pthread_mutexattr_t mattr;
2935
2936                 pthread_mutexattr_init(&mattr);
2937                 rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
2938                 if (rc) {
2939                         goto fail;
2940                 }
2941                 pthread_mutex_init(&env->me_txns->mti_mutex, &mattr);
2942                 pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr);
2943 #endif  /* USE_POSIX_SEM */
2944 #endif  /* _WIN32 */
2945                 env->me_txns->mti_version = MDB_VERSION;
2946                 env->me_txns->mti_magic = MDB_MAGIC;
2947                 env->me_txns->mti_txnid = 0;
2948                 env->me_txns->mti_numreaders = 0;
2949
2950         } else {
2951                 if (env->me_txns->mti_magic != MDB_MAGIC) {
2952                         DPUTS("lock region has invalid magic");
2953                         rc = EINVAL;
2954                         goto fail;
2955                 }
2956                 if (env->me_txns->mti_version != MDB_VERSION) {
2957                         DPRINTF("lock region is version %u, expected version %u",
2958                                 env->me_txns->mti_version, MDB_VERSION);
2959                         rc = MDB_VERSION_MISMATCH;
2960                         goto fail;
2961                 }
2962                 rc = ErrCode();
2963                 if (rc != EACCES && rc != EAGAIN) {
2964                         goto fail;
2965                 }
2966 #ifdef _WIN32
2967                 env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
2968                 if (!env->me_rmutex) {
2969                         rc = ErrCode();
2970                         goto fail;
2971                 }
2972                 env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
2973                 if (!env->me_wmutex) {
2974                         rc = ErrCode();
2975                         goto fail;
2976                 }
2977 #endif
2978 #ifdef USE_POSIX_SEM
2979                 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
2980                 if (!env->me_rmutex) {
2981                         rc = ErrCode();
2982                         goto fail;
2983                 }
2984                 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
2985                 if (!env->me_wmutex) {
2986                         rc = ErrCode();
2987                         goto fail;
2988                 }
2989 #endif
2990         }
2991         return MDB_SUCCESS;
2992
2993 fail:
2994         close(env->me_lfd);
2995         env->me_lfd = INVALID_HANDLE_VALUE;
2996         return rc;
2997
2998 }
2999
3000         /** The name of the lock file in the DB environment */
3001 #define LOCKNAME        "/lock.mdb"
3002         /** The name of the data file in the DB environment */
3003 #define DATANAME        "/data.mdb"
3004         /** The suffix of the lock file when no subdir is used */
3005 #define LOCKSUFF        "-lock"
3006
3007 int
3008 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
3009 {
3010         int             oflags, rc, len, excl;
3011         char *lpath, *dpath;
3012
3013         len = strlen(path);
3014         if (flags & MDB_NOSUBDIR) {
3015                 rc = len + sizeof(LOCKSUFF) + len + 1;
3016         } else {
3017                 rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
3018         }
3019         lpath = malloc(rc);
3020         if (!lpath)
3021                 return ENOMEM;
3022         if (flags & MDB_NOSUBDIR) {
3023                 dpath = lpath + len + sizeof(LOCKSUFF);
3024                 sprintf(lpath, "%s" LOCKSUFF, path);
3025                 strcpy(dpath, path);
3026         } else {
3027                 dpath = lpath + len + sizeof(LOCKNAME);
3028                 sprintf(lpath, "%s" LOCKNAME, path);
3029                 sprintf(dpath, "%s" DATANAME, path);
3030         }
3031
3032         rc = mdb_env_setup_locks(env, lpath, mode, &excl);
3033         if (rc)
3034                 goto leave;
3035
3036 #ifdef _WIN32
3037         if (F_ISSET(flags, MDB_RDONLY)) {
3038                 oflags = GENERIC_READ;
3039                 len = OPEN_EXISTING;
3040         } else {
3041                 oflags = GENERIC_READ|GENERIC_WRITE;
3042                 len = OPEN_ALWAYS;
3043         }
3044         mode = FILE_ATTRIBUTE_NORMAL;
3045         env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
3046                 NULL, len, mode, NULL);
3047 #else
3048         if (F_ISSET(flags, MDB_RDONLY))
3049                 oflags = O_RDONLY;
3050         else
3051                 oflags = O_RDWR | O_CREAT;
3052
3053         env->me_fd = open(dpath, oflags, mode);
3054 #endif
3055         if (env->me_fd == INVALID_HANDLE_VALUE) {
3056                 rc = ErrCode();
3057                 goto leave;
3058         }
3059
3060         if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) {
3061                 if (flags & (MDB_RDONLY|MDB_NOSYNC|MDB_NOMETASYNC)) {
3062                         env->me_mfd = env->me_fd;
3063                 } else {
3064                         /* synchronous fd for meta writes */
3065 #ifdef _WIN32
3066                         env->me_mfd = CreateFile(dpath, oflags,
3067                                 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len,
3068                                 mode | FILE_FLAG_WRITE_THROUGH, NULL);
3069 #else
3070                         env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode);
3071 #endif
3072                         if (env->me_mfd == INVALID_HANDLE_VALUE) {
3073                                 rc = ErrCode();
3074                                 goto leave;
3075                         }
3076                 }
3077                 env->me_path = strdup(path);
3078                 DPRINTF("opened dbenv %p", (void *) env);
3079                 pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
3080 #ifdef _WIN32
3081                 /* Windows TLS callbacks need help finding their TLS info. */
3082                 if (mdb_tls_nkeys < MAX_TLS_KEYS)
3083                         mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
3084                 else {
3085                         rc = ENOMEM;
3086                         goto leave;
3087                 }
3088 #endif
3089                 if (excl)
3090                         mdb_env_share_locks(env);
3091                 env->me_numdbs = 2;
3092                 env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
3093                 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
3094                 if (!env->me_dbxs || !env->me_dbflags)
3095                         rc = ENOMEM;
3096         }
3097
3098 leave:
3099         if (rc) {
3100                 if (env->me_fd != INVALID_HANDLE_VALUE) {
3101                         close(env->me_fd);
3102                         env->me_fd = INVALID_HANDLE_VALUE;
3103                 }
3104                 if (env->me_lfd != INVALID_HANDLE_VALUE) {
3105                         close(env->me_lfd);
3106                         env->me_lfd = INVALID_HANDLE_VALUE;
3107                 }
3108         }
3109         free(lpath);
3110         return rc;
3111 }
3112
3113 void
3114 mdb_env_close(MDB_env *env)
3115 {
3116         MDB_page *dp;
3117
3118         if (env == NULL)
3119                 return;
3120
3121         VGMEMP_DESTROY(env);
3122         while (env->me_dpages) {
3123                 dp = env->me_dpages;
3124                 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
3125                 env->me_dpages = dp->mp_next;
3126                 free(dp);
3127         }
3128
3129         free(env->me_dbflags);
3130         free(env->me_dbxs);
3131         free(env->me_path);
3132
3133         pthread_key_delete(env->me_txkey);
3134 #ifdef _WIN32
3135         /* Delete our key from the global list */
3136         { int i;
3137                 for (i=0; i<mdb_tls_nkeys; i++)
3138                         if (mdb_tls_keys[i] == env->me_txkey) {
3139                                 mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
3140                                 mdb_tls_nkeys--;
3141                                 break;
3142                         }
3143         }
3144 #endif
3145
3146         if (env->me_map) {
3147                 munmap(env->me_map, env->me_mapsize);
3148         }
3149         if (env->me_mfd != env->me_fd)
3150                 close(env->me_mfd);
3151         close(env->me_fd);
3152         if (env->me_txns) {
3153                 pid_t pid = getpid();
3154                 unsigned int i;
3155                 for (i=0; i<env->me_txns->mti_numreaders; i++)
3156                         if (env->me_txns->mti_readers[i].mr_pid == pid)
3157                                 env->me_txns->mti_readers[i].mr_pid = 0;
3158 #ifdef _WIN32
3159                 CloseHandle(env->me_rmutex);
3160                 CloseHandle(env->me_wmutex);
3161                 /* Windows automatically destroys the mutexes when
3162                  * the last handle closes.
3163                  */
3164 #else
3165 #ifdef USE_POSIX_SEM
3166                 sem_close(env->me_rmutex);
3167                 sem_close(env->me_wmutex);
3168                 { int excl = 0;
3169                         if (!mdb_env_excl_lock(env, &excl) && excl) {
3170                                 /* we are the only remaining user of the environment.
3171                                    clean up semaphores. */
3172                                 sem_unlink(env->me_txns->mti_rmname);
3173                                 sem_unlink(env->me_txns->mti_wmname);
3174                         }
3175                 }
3176 #endif
3177 #endif
3178                 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
3179         }
3180         close(env->me_lfd);
3181         mdb_midl_free(env->me_free_pgs);
3182         free(env);
3183 }
3184
3185 /** Compare two items pointing at aligned size_t's */
3186 static int
3187 mdb_cmp_long(const MDB_val *a, const MDB_val *b)
3188 {
3189         return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
3190                 *(size_t *)a->mv_data > *(size_t *)b->mv_data;
3191 }
3192
3193 /** Compare two items pointing at aligned int's */
3194 static int
3195 mdb_cmp_int(const MDB_val *a, const MDB_val *b)
3196 {
3197         return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
3198                 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
3199 }
3200
3201 /** Compare two items pointing at ints of unknown alignment.
3202  *      Nodes and keys are guaranteed to be 2-byte aligned.
3203  */
3204 static int
3205 mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
3206 {
3207 #if BYTE_ORDER == LITTLE_ENDIAN
3208         unsigned short *u, *c;
3209         int x;
3210
3211         u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
3212         c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
3213         do {
3214                 x = *--u - *--c;
3215         } while(!x && u > (unsigned short *)a->mv_data);
3216         return x;
3217 #else
3218         return memcmp(a->mv_data, b->mv_data, a->mv_size);
3219 #endif
3220 }
3221
3222 /** Compare two items lexically */
3223 static int
3224 mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
3225 {
3226         int diff;
3227         ssize_t len_diff;
3228         unsigned int len;
3229
3230         len = a->mv_size;
3231         len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
3232         if (len_diff > 0) {
3233                 len = b->mv_size;
3234                 len_diff = 1;
3235         }
3236
3237         diff = memcmp(a->mv_data, b->mv_data, len);
3238         return diff ? diff : len_diff<0 ? -1 : len_diff;
3239 }
3240
3241 /** Compare two items in reverse byte order */
3242 static int
3243 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b)
3244 {
3245         const unsigned char     *p1, *p2, *p1_lim;
3246         ssize_t len_diff;
3247         int diff;
3248
3249         p1_lim = (const unsigned char *)a->mv_data;
3250         p1 = (const unsigned char *)a->mv_data + a->mv_size;
3251         p2 = (const unsigned char *)b->mv_data + b->mv_size;
3252
3253         len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
3254         if (len_diff > 0) {
3255                 p1_lim += len_diff;
3256                 len_diff = 1;
3257         }
3258
3259         while (p1 > p1_lim) {
3260                 diff = *--p1 - *--p2;
3261                 if (diff)
3262                         return diff;
3263         }
3264         return len_diff<0 ? -1 : len_diff;
3265 }
3266
3267 /** Search for key within a page, using binary search.
3268  * Returns the smallest entry larger or equal to the key.
3269  * If exactp is non-null, stores whether the found entry was an exact match
3270  * in *exactp (1 or 0).
3271  * Updates the cursor index with the index of the found entry.
3272  * If no entry larger or equal to the key is found, returns NULL.
3273  */
3274 static MDB_node *
3275 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
3276 {
3277         unsigned int     i = 0, nkeys;
3278         int              low, high;
3279         int              rc = 0;
3280         MDB_page *mp = mc->mc_pg[mc->mc_top];
3281         MDB_node        *node = NULL;
3282         MDB_val  nodekey;
3283         MDB_cmp_func *cmp;
3284         DKBUF;
3285
3286         nkeys = NUMKEYS(mp);
3287
3288 #if MDB_DEBUG
3289         {
3290         pgno_t pgno;
3291         COPY_PGNO(pgno, mp->mp_pgno);
3292         DPRINTF("searching %u keys in %s %spage %zu",
3293             nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
3294             pgno);
3295         }
3296 #endif
3297
3298         assert(nkeys > 0);
3299
3300         low = IS_LEAF(mp) ? 0 : 1;
3301         high = nkeys - 1;
3302         cmp = mc->mc_dbx->md_cmp;
3303
3304         /* Branch pages have no data, so if using integer keys,
3305          * alignment is guaranteed. Use faster mdb_cmp_int.
3306          */
3307         if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
3308                 if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
3309                         cmp = mdb_cmp_long;
3310                 else
3311                         cmp = mdb_cmp_int;
3312         }
3313
3314         if (IS_LEAF2(mp)) {
3315                 nodekey.mv_size = mc->mc_db->md_pad;
3316                 node = NODEPTR(mp, 0);  /* fake */
3317                 while (low <= high) {
3318                         i = (low + high) >> 1;
3319                         nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
3320                         rc = cmp(key, &nodekey);
3321                         DPRINTF("found leaf index %u [%s], rc = %i",
3322                             i, DKEY(&nodekey), rc);
3323                         if (rc == 0)
3324                                 break;
3325                         if (rc > 0)
3326                                 low = i + 1;
3327                         else
3328                                 high = i - 1;
3329                 }
3330         } else {
3331                 while (low <= high) {
3332                         i = (low + high) >> 1;
3333
3334                         node = NODEPTR(mp, i);
3335                         nodekey.mv_size = NODEKSZ(node);
3336                         nodekey.mv_data = NODEKEY(node);
3337
3338                         rc = cmp(key, &nodekey);
3339 #if MDB_DEBUG
3340                         if (IS_LEAF(mp))
3341                                 DPRINTF("found leaf index %u [%s], rc = %i",
3342                                     i, DKEY(&nodekey), rc);
3343                         else
3344                                 DPRINTF("found branch index %u [%s -> %zu], rc = %i",
3345                                     i, DKEY(&nodekey), NODEPGNO(node), rc);
3346 #endif
3347                         if (rc == 0)
3348                                 break;
3349                         if (rc > 0)
3350                                 low = i + 1;
3351                         else
3352                                 high = i - 1;
3353                 }
3354         }
3355
3356         if (rc > 0) {   /* Found entry is less than the key. */
3357                 i++;    /* Skip to get the smallest entry larger than key. */
3358                 if (!IS_LEAF2(mp))
3359                         node = NODEPTR(mp, i);
3360         }
3361         if (exactp)
3362                 *exactp = (rc == 0);
3363         /* store the key index */
3364         mc->mc_ki[mc->mc_top] = i;
3365         if (i >= nkeys)
3366                 /* There is no entry larger or equal to the key. */
3367                 return NULL;
3368
3369         /* nodeptr is fake for LEAF2 */
3370         return node;
3371 }
3372
3373 #if 0
3374 static void
3375 mdb_cursor_adjust(MDB_cursor *mc, func)
3376 {
3377         MDB_cursor *m2;
3378
3379         for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
3380                 if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
3381                         func(mc, m2);
3382                 }
3383         }
3384 }
3385 #endif
3386
3387 /** Pop a page off the top of the cursor's stack. */
3388 static void
3389 mdb_cursor_pop(MDB_cursor *mc)
3390 {
3391         if (mc->mc_snum) {
3392 #if MDB_DEBUG
3393                 MDB_page        *top = mc->mc_pg[mc->mc_top];
3394 #endif
3395                 mc->mc_snum--;
3396                 if (mc->mc_snum)
3397                         mc->mc_top--;
3398
3399                 DPRINTF("popped page %zu off db %u cursor %p", top->mp_pgno,
3400                         mc->mc_dbi, (void *) mc);
3401         }
3402 }
3403
3404 /** Push a page onto the top of the cursor's stack. */
3405 static int
3406 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
3407 {
3408         DPRINTF("pushing page %zu on db %u cursor %p", mp->mp_pgno,
3409                 mc->mc_dbi, (void *) mc);
3410
3411         if (mc->mc_snum >= CURSOR_STACK) {
3412                 assert(mc->mc_snum < CURSOR_STACK);
3413                 return ENOMEM;
3414         }
3415
3416         mc->mc_top = mc->mc_snum++;
3417         mc->mc_pg[mc->mc_top] = mp;
3418         mc->mc_ki[mc->mc_top] = 0;
3419
3420         return MDB_SUCCESS;
3421 }
3422
3423 /** Find the address of the page corresponding to a given page number.
3424  * @param[in] txn the transaction for this access.
3425  * @param[in] pgno the page number for the page to retrieve.
3426  * @param[out] ret address of a pointer where the page's address will be stored.
3427  * @return 0 on success, non-zero on failure.
3428  */
3429 static int
3430 mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret)
3431 {
3432         MDB_page *p = NULL;
3433
3434         if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY) && txn->mt_u.dirty_list[0].mid) {
3435                 unsigned x;
3436                 x = mdb_mid2l_search(txn->mt_u.dirty_list, pgno);
3437                 if (x <= txn->mt_u.dirty_list[0].mid && txn->mt_u.dirty_list[x].mid == pgno) {
3438                         p = txn->mt_u.dirty_list[x].mptr;
3439                 }
3440         }
3441         if (!p) {
3442                 if (pgno < txn->mt_next_pgno)
3443                         p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno);
3444         }
3445         *ret = p;
3446         if (!p) {
3447                 DPRINTF("page %zu not found", pgno);
3448                 assert(p != NULL);
3449         }
3450         return (p != NULL) ? MDB_SUCCESS : MDB_PAGE_NOTFOUND;
3451 }
3452
3453 /** Search for the page a given key should be in.
3454  * Pushes parent pages on the cursor stack. This function continues a
3455  * search on a cursor that has already been initialized. (Usually by
3456  * #mdb_page_search() but also by #mdb_node_move().)
3457  * @param[in,out] mc the cursor for this operation.
3458  * @param[in] key the key to search for. If NULL, search for the lowest
3459  * page. (This is used by #mdb_cursor_first().)
3460  * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers.
3461  *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
3462  * @return 0 on success, non-zero on failure.
3463  */
3464 static int
3465 mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
3466 {
3467         MDB_page        *mp = mc->mc_pg[mc->mc_top];
3468         DKBUF;
3469         int rc;
3470
3471
3472         while (IS_BRANCH(mp)) {
3473                 MDB_node        *node;
3474                 indx_t          i;
3475
3476                 DPRINTF("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp));
3477                 assert(NUMKEYS(mp) > 1);
3478                 DPRINTF("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0)));
3479
3480                 if (key == NULL)        /* Initialize cursor to first page. */
3481                         i = 0;
3482                 else if (key->mv_size > MAXKEYSIZE && key->mv_data == NULL) {
3483                                                         /* cursor to last page */
3484                         i = NUMKEYS(mp)-1;
3485                 } else {
3486                         int      exact;
3487                         node = mdb_node_search(mc, key, &exact);
3488                         if (node == NULL)
3489                                 i = NUMKEYS(mp) - 1;
3490                         else {
3491                                 i = mc->mc_ki[mc->mc_top];
3492                                 if (!exact) {
3493                                         assert(i > 0);
3494                                         i--;
3495                                 }
3496                         }
3497                 }
3498
3499                 if (key)
3500                         DPRINTF("following index %u for key [%s]",
3501                             i, DKEY(key));
3502                 assert(i < NUMKEYS(mp));
3503                 node = NODEPTR(mp, i);
3504
3505                 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp)))
3506                         return rc;
3507
3508                 mc->mc_ki[mc->mc_top] = i;
3509                 if ((rc = mdb_cursor_push(mc, mp)))
3510                         return rc;
3511
3512                 if (modify) {
3513                         if ((rc = mdb_page_touch(mc)) != 0)
3514                                 return rc;
3515                         mp = mc->mc_pg[mc->mc_top];
3516                 }
3517         }
3518
3519         if (!IS_LEAF(mp)) {
3520                 DPRINTF("internal error, index points to a %02X page!?",
3521                     mp->mp_flags);
3522                 return MDB_CORRUPTED;
3523         }
3524
3525         DPRINTF("found leaf page %zu for key [%s]", mp->mp_pgno,
3526             key ? DKEY(key) : NULL);
3527
3528         return MDB_SUCCESS;
3529 }
3530
3531 /** Search for the page a given key should be in.
3532  * Pushes parent pages on the cursor stack. This function just sets up
3533  * the search; it finds the root page for \b mc's database and sets this
3534  * as the root of the cursor's stack. Then #mdb_page_search_root() is
3535  * called to complete the search.
3536  * @param[in,out] mc the cursor for this operation.
3537  * @param[in] key the key to search for. If NULL, search for the lowest
3538  * page. (This is used by #mdb_cursor_first().)
3539  * @param[in] modify If true, visited pages are updated with new page numbers.
3540  * @return 0 on success, non-zero on failure.
3541  */
3542 static int
3543 mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
3544 {
3545         int              rc;
3546         pgno_t           root;
3547
3548         /* Make sure the txn is still viable, then find the root from
3549          * the txn's db table.
3550          */
3551         if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
3552                 DPUTS("transaction has failed, must abort");
3553                 return EINVAL;
3554         } else {
3555                 /* Make sure we're using an up-to-date root */
3556                 if (mc->mc_dbi > MAIN_DBI) {
3557                         if ((*mc->mc_dbflag & DB_STALE) ||
3558                         ((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) {
3559                                 MDB_cursor mc2;
3560                                 unsigned char dbflag = 0;
3561                                 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
3562                                 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, flags & MDB_PS_MODIFY);
3563                                 if (rc)
3564                                         return rc;
3565                                 if (*mc->mc_dbflag & DB_STALE) {
3566                                         MDB_val data;
3567                                         int exact = 0;
3568                                         MDB_node *leaf = mdb_node_search(&mc2,
3569                                                 &mc->mc_dbx->md_name, &exact);
3570                                         if (!exact)
3571                                                 return MDB_NOTFOUND;
3572                                         mdb_node_read(mc->mc_txn, leaf, &data);
3573                                         memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
3574                                 }
3575                                 if (flags & MDB_PS_MODIFY)
3576                                         dbflag = DB_DIRTY;
3577                                 *mc->mc_dbflag = dbflag;
3578                         }
3579                 }
3580                 root = mc->mc_db->md_root;
3581
3582                 if (root == P_INVALID) {                /* Tree is empty. */
3583                         DPUTS("tree is empty");
3584                         return MDB_NOTFOUND;
3585                 }
3586         }
3587
3588         assert(root > 1);
3589         if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
3590                 if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0])))
3591                         return rc;
3592
3593         mc->mc_snum = 1;
3594         mc->mc_top = 0;
3595
3596         DPRINTF("db %u root page %zu has flags 0x%X",
3597                 mc->mc_dbi, root, mc->mc_pg[0]->mp_flags);
3598
3599         if (flags & MDB_PS_MODIFY) {
3600                 if ((rc = mdb_page_touch(mc)))
3601                         return rc;
3602         }
3603
3604         if (flags & MDB_PS_ROOTONLY)
3605                 return MDB_SUCCESS;
3606
3607         return mdb_page_search_root(mc, key, flags);
3608 }
3609
3610 /** Return the data associated with a given node.
3611  * @param[in] txn The transaction for this operation.
3612  * @param[in] leaf The node being read.
3613  * @param[out] data Updated to point to the node's data.
3614  * @return 0 on success, non-zero on failure.
3615  */
3616 static int
3617 mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data)
3618 {
3619         MDB_page        *omp;           /* overflow page */
3620         pgno_t           pgno;
3621         int rc;
3622
3623         if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
3624                 data->mv_size = NODEDSZ(leaf);
3625                 data->mv_data = NODEDATA(leaf);
3626                 return MDB_SUCCESS;
3627         }
3628
3629         /* Read overflow data.
3630          */
3631         data->mv_size = NODEDSZ(leaf);
3632         memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
3633         if ((rc = mdb_page_get(txn, pgno, &omp))) {
3634                 DPRINTF("read overflow page %zu failed", pgno);
3635                 return rc;
3636         }
3637         data->mv_data = METADATA(omp);
3638
3639         return MDB_SUCCESS;
3640 }
3641
3642 int
3643 mdb_get(MDB_txn *txn, MDB_dbi dbi,
3644     MDB_val *key, MDB_val *data)
3645 {
3646         MDB_cursor      mc;
3647         MDB_xcursor     mx;
3648         int exact = 0;
3649         DKBUF;
3650
3651         assert(key);
3652         assert(data);
3653         DPRINTF("===> get db %u key [%s]", dbi, DKEY(key));
3654
3655         if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
3656                 return EINVAL;
3657
3658         if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
3659                 return EINVAL;
3660         }
3661
3662         mdb_cursor_init(&mc, txn, dbi, &mx);
3663         return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
3664 }
3665
3666 /** Find a sibling for a page.
3667  * Replaces the page at the top of the cursor's stack with the
3668  * specified sibling, if one exists.
3669  * @param[in] mc The cursor for this operation.
3670  * @param[in] move_right Non-zero if the right sibling is requested,
3671  * otherwise the left sibling.
3672  * @return 0 on success, non-zero on failure.
3673  */
3674 static int
3675 mdb_cursor_sibling(MDB_cursor *mc, int move_right)
3676 {
3677         int              rc;
3678         MDB_node        *indx;
3679         MDB_page        *mp;
3680
3681         if (mc->mc_snum < 2) {
3682                 return MDB_NOTFOUND;            /* root has no siblings */
3683         }
3684
3685         mdb_cursor_pop(mc);
3686         DPRINTF("parent page is page %zu, index %u",
3687                 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]);
3688
3689         if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
3690                        : (mc->mc_ki[mc->mc_top] == 0)) {
3691                 DPRINTF("no more keys left, moving to %s sibling",
3692                     move_right ? "right" : "left");
3693                 if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS)
3694                         return rc;
3695         } else {
3696                 if (move_right)
3697                         mc->mc_ki[mc->mc_top]++;
3698                 else
3699                         mc->mc_ki[mc->mc_top]--;
3700                 DPRINTF("just moving to %s index key %u",
3701                     move_right ? "right" : "left", mc->mc_ki[mc->mc_top]);
3702         }
3703         assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
3704
3705         indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
3706         if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp)))
3707                 return rc;;
3708
3709         mdb_cursor_push(mc, mp);
3710
3711         return MDB_SUCCESS;
3712 }
3713
3714 /** Move the cursor to the next data item. */
3715 static int
3716 mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
3717 {
3718         MDB_page        *mp;
3719         MDB_node        *leaf;
3720         int rc;
3721
3722         if (mc->mc_flags & C_EOF) {
3723                 return MDB_NOTFOUND;
3724         }
3725
3726         assert(mc->mc_flags & C_INITIALIZED);
3727
3728         mp = mc->mc_pg[mc->mc_top];
3729
3730         if (mc->mc_db->md_flags & MDB_DUPSORT) {
3731                 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
3732                 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
3733                         if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
3734                                 rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
3735                                 if (op != MDB_NEXT || rc == MDB_SUCCESS)
3736                                         return rc;
3737                         }
3738                 } else {
3739                         mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
3740                         if (op == MDB_NEXT_DUP)
3741                                 return MDB_NOTFOUND;
3742                 }
3743         }
3744
3745         DPRINTF("cursor_next: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
3746
3747         if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
3748                 DPUTS("=====> move to next sibling page");
3749                 if (mdb_cursor_sibling(mc, 1) != MDB_SUCCESS) {
3750                         mc->mc_flags |= C_EOF;
3751                         mc->mc_flags &= ~C_INITIALIZED;
3752                         return MDB_NOTFOUND;
3753                 }
3754                 mp = mc->mc_pg[mc->mc_top];
3755                 DPRINTF("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
3756         } else
3757                 mc->mc_ki[mc->mc_top]++;
3758
3759         DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
3760             mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
3761
3762         if (IS_LEAF2(mp)) {
3763                 key->mv_size = mc->mc_db->md_pad;
3764                 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
3765                 return MDB_SUCCESS;
3766         }
3767
3768         assert(IS_LEAF(mp));
3769         leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
3770
3771         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
3772                 mdb_xcursor_init1(mc, leaf);
3773         }
3774         if (data) {
3775                 if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS))
3776                         return rc;
3777
3778                 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
3779                         rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
3780                         if (rc != MDB_SUCCESS)
3781                                 return rc;
3782                 }
3783         }
3784
3785         MDB_SET_KEY(leaf, key);
3786         return MDB_SUCCESS;
3787 }
3788
3789 /** Move the cursor to the previous data item. */
3790 static int
3791 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
3792 {
3793         MDB_page        *mp;
3794         MDB_node        *leaf;
3795         int rc;
3796
3797         assert(mc->mc_flags & C_INITIALIZED);
3798
3799         mp = mc->mc_pg[mc->mc_top];
3800
3801         if (mc->mc_db->md_flags & MDB_DUPSORT) {
3802                 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
3803                 if (op == MDB_PREV || op == MDB_PREV_DUP) {
3804                         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
3805                                 rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
3806                                 if (op != MDB_PREV || rc == MDB_SUCCESS)
3807                                         return rc;
3808                         } else {
3809                                 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
3810                                 if (op == MDB_PREV_DUP)
3811                                         return MDB_NOTFOUND;
3812                         }
3813                 }
3814         }
3815
3816         DPRINTF("cursor_prev: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
3817
3818         if (mc->mc_ki[mc->mc_top] == 0)  {
3819                 DPUTS("=====> move to prev sibling page");
3820                 if (mdb_cursor_sibling(mc, 0) != MDB_SUCCESS) {
3821                         mc->mc_flags &= ~C_INITIALIZED;
3822                         return MDB_NOTFOUND;
3823                 }
3824                 mp = mc->mc_pg[mc->mc_top];
3825                 mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
3826                 DPRINTF("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
3827         } else
3828                 mc->mc_ki[mc->mc_top]--;
3829
3830         mc->mc_flags &= ~C_EOF;
3831
3832         DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
3833             mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
3834
3835         if (IS_LEAF2(mp)) {
3836                 key->mv_size = mc->mc_db->md_pad;
3837                 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
3838                 return MDB_SUCCESS;
3839         }
3840
3841         assert(IS_LEAF(mp));
3842         leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
3843
3844         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
3845                 mdb_xcursor_init1(mc, leaf);
3846         }
3847         if (data) {
3848                 if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS))
3849                         return rc;
3850
3851                 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
3852                         rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
3853                         if (rc != MDB_SUCCESS)
3854                                 return rc;
3855                 }
3856         }
3857
3858         MDB_SET_KEY(leaf, key);
3859         return MDB_SUCCESS;
3860 }
3861
3862 /** Set the cursor on a specific data item. */
3863 static int
3864 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
3865     MDB_cursor_op op, int *exactp)
3866 {
3867         int              rc;
3868         MDB_page        *mp;
3869         MDB_node        *leaf = NULL;
3870         DKBUF;
3871
3872         assert(mc);
3873         assert(key);
3874         assert(key->mv_size > 0);
3875
3876         /* See if we're already on the right page */
3877         if (mc->mc_flags & C_INITIALIZED) {
3878                 MDB_val nodekey;
3879
3880                 mp = mc->mc_pg[mc->mc_top];
3881                 if (!NUMKEYS(mp)) {
3882                         mc->mc_ki[mc->mc_top] = 0;
3883                         return MDB_NOTFOUND;
3884                 }
3885                 if (mp->mp_flags & P_LEAF2) {
3886                         nodekey.mv_size = mc->mc_db->md_pad;
3887                         nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
3888                 } else {
3889                         leaf = NODEPTR(mp, 0);
3890                         MDB_SET_KEY(leaf, &nodekey);
3891                 }
3892                 rc = mc->mc_dbx->md_cmp(key, &nodekey);
3893                 if (rc == 0) {
3894                         /* Probably happens rarely, but first node on the page
3895                          * was the one we wanted.
3896                          */
3897                         mc->mc_ki[mc->mc_top] = 0;
3898                         if (exactp)
3899                                 *exactp = 1;
3900                         goto set1;
3901                 }
3902                 if (rc > 0) {
3903                         unsigned int i;
3904                         unsigned int nkeys = NUMKEYS(mp);
3905                         if (nkeys > 1) {
3906                                 if (mp->mp_flags & P_LEAF2) {
3907                                         nodekey.mv_data = LEAF2KEY(mp,
3908                                                  nkeys-1, nodekey.mv_size);
3909                                 } else {
3910                                         leaf = NODEPTR(mp, nkeys-1);
3911                                         MDB_SET_KEY(leaf, &nodekey);
3912                                 }
3913                                 rc = mc->mc_dbx->md_cmp(key, &nodekey);
3914                                 if (rc == 0) {
3915                                         /* last node was the one we wanted */
3916                                         mc->mc_ki[mc->mc_top] = nkeys-1;
3917                                         if (exactp)
3918                                                 *exactp = 1;
3919                                         goto set1;
3920                                 }
3921                                 if (rc < 0) {
3922                                         if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
3923                                                 /* This is definitely the right page, skip search_page */
3924                                                 if (mp->mp_flags & P_LEAF2) {
3925                                                         nodekey.mv_data = LEAF2KEY(mp,
3926                                                                  mc->mc_ki[mc->mc_top], nodekey.mv_size);
3927                                                 } else {
3928                                                         leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
3929                                                         MDB_SET_KEY(leaf, &nodekey);
3930                                                 }
3931                                                 rc = mc->mc_dbx->md_cmp(key, &nodekey);
3932                                                 if (rc == 0) {
3933                                                         /* current node was the one we wanted */
3934                                                         if (exactp)
3935                                                                 *exactp = 1;
3936                                                         goto set1;
3937                                                 }
3938                                         }
3939                                         rc = 0;
3940                                         goto set2;
3941                                 }
3942                         }
3943                         /* If any parents have right-sibs, search.
3944                          * Otherwise, there's nothing further.
3945                          */
3946                         for (i=0; i<mc->mc_top; i++)
3947                                 if (mc->mc_ki[i] <
3948                                         NUMKEYS(mc->mc_pg[i])-1)
3949                                         break;
3950                         if (i == mc->mc_top) {
3951                                 /* There are no other pages */
3952                                 mc->mc_ki[mc->mc_top] = nkeys;
3953                                 return MDB_NOTFOUND;
3954                         }
3955                 }
3956                 if (!mc->mc_top) {
3957                         /* There are no other pages */
3958                         mc->mc_ki[mc->mc_top] = 0;
3959                         return MDB_NOTFOUND;
3960                 }
3961         }
3962
3963         rc = mdb_page_search(mc, key, 0);
3964         if (rc != MDB_SUCCESS)
3965                 return rc;
3966
3967         mp = mc->mc_pg[mc->mc_top];
3968         assert(IS_LEAF(mp));
3969
3970 set2:
3971         leaf = mdb_node_search(mc, key, exactp);
3972         if (exactp != NULL && !*exactp) {
3973                 /* MDB_SET specified and not an exact match. */
3974                 return MDB_NOTFOUND;
3975         }
3976
3977         if (leaf == NULL) {
3978                 DPUTS("===> inexact leaf not found, goto sibling");
3979                 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)
3980                         return rc;              /* no entries matched */
3981                 mp = mc->mc_pg[mc->mc_top];
3982                 assert(IS_LEAF(mp));
3983                 leaf = NODEPTR(mp, 0);
3984         }
3985
3986 set1:
3987         mc->mc_flags |= C_INITIALIZED;
3988         mc->mc_flags &= ~C_EOF;
3989
3990         if (IS_LEAF2(mp)) {
3991                 key->mv_size = mc->mc_db->md_pad;
3992                 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
3993                 return MDB_SUCCESS;
3994         }
3995
3996         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
3997                 mdb_xcursor_init1(mc, leaf);
3998         }
3999         if (data) {
4000                 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
4001                         if (op == MDB_SET || op == MDB_SET_RANGE) {
4002                                 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
4003                         } else {
4004                                 int ex2, *ex2p;
4005                                 if (op == MDB_GET_BOTH) {
4006                                         ex2p = &ex2;
4007                                         ex2 = 0;
4008                                 } else {
4009                                         ex2p = NULL;
4010                                 }
4011                                 rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
4012                                 if (rc != MDB_SUCCESS)
4013                                         return rc;
4014                         }
4015                 } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
4016                         MDB_val d2;
4017                         if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS)
4018                                 return rc;
4019                         rc = mc->mc_dbx->md_dcmp(data, &d2);
4020                         if (rc) {
4021                                 if (op == MDB_GET_BOTH || rc > 0)
4022                                         return MDB_NOTFOUND;
4023                         }
4024
4025                 } else {
4026                         if (mc->mc_xcursor)
4027                                 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
4028                         if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
4029                                 return rc;
4030                 }
4031         }
4032
4033         /* The key already matches in all other cases */
4034         if (op == MDB_SET_RANGE)
4035                 MDB_SET_KEY(leaf, key);
4036         DPRINTF("==> cursor placed on key [%s]", DKEY(key));
4037
4038         return rc;
4039 }
4040
4041 /** Move the cursor to the first item in the database. */
4042 static int
4043 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
4044 {
4045         int              rc;
4046         MDB_node        *leaf;
4047
4048         if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
4049                 rc = mdb_page_search(mc, NULL, 0);
4050                 if (rc != MDB_SUCCESS)
4051                         return rc;
4052         }
4053         assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
4054
4055         leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
4056         mc->mc_flags |= C_INITIALIZED;
4057         mc->mc_flags &= ~C_EOF;
4058
4059         mc->mc_ki[mc->mc_top] = 0;
4060
4061         if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
4062                 key->mv_size = mc->mc_db->md_pad;
4063                 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
4064                 return MDB_SUCCESS;
4065         }
4066
4067         if (data) {
4068                 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
4069                         mdb_xcursor_init1(mc, leaf);
4070                         rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
4071                         if (rc)
4072                                 return rc;
4073                 } else {
4074                         if (mc->mc_xcursor)
4075                                 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
4076                         if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
4077                                 return rc;
4078                 }
4079         }
4080         MDB_SET_KEY(leaf, key);
4081         return MDB_SUCCESS;
4082 }
4083
4084 /** Move the cursor to the last item in the database. */
4085 static int
4086 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
4087 {
4088         int              rc;
4089         MDB_node        *leaf;
4090
4091         if (!(mc->mc_flags & C_EOF)) {
4092
4093         if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
4094                 MDB_val lkey;
4095
4096                 lkey.mv_size = MAXKEYSIZE+1;
4097                 lkey.mv_data = NULL;
4098                 rc = mdb_page_search(mc, &lkey, 0);
4099                 if (rc != MDB_SUCCESS)
4100                         return rc;
4101         }
4102         assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
4103
4104         mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
4105         mc->mc_flags |= C_INITIALIZED|C_EOF;
4106         }
4107         leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4108
4109         if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
4110                 key->mv_size = mc->mc_db->md_pad;
4111                 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
4112                 return MDB_SUCCESS;
4113         }
4114
4115         if (data) {
4116                 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
4117                         mdb_xcursor_init1(mc, leaf);
4118                         rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
4119                         if (rc)
4120                                 return rc;
4121                 } else {
4122                         if (mc->mc_xcursor)
4123                                 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
4124                         if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
4125                                 return rc;
4126                 }
4127         }
4128
4129         MDB_SET_KEY(leaf, key);
4130         return MDB_SUCCESS;
4131 }
4132
4133 int
4134 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
4135     MDB_cursor_op op)
4136 {
4137         int              rc;
4138         int              exact = 0;
4139
4140         assert(mc);
4141
4142         switch (op) {
4143         case MDB_GET_BOTH:
4144         case MDB_GET_BOTH_RANGE:
4145                 if (data == NULL || mc->mc_xcursor == NULL) {
4146                         rc = EINVAL;
4147                         break;
4148                 }
4149                 /* FALLTHRU */
4150         case MDB_SET:
4151         case MDB_SET_RANGE:
4152                 if (key == NULL || key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
4153                         rc = EINVAL;
4154                 } else if (op == MDB_SET_RANGE)
4155                         rc = mdb_cursor_set(mc, key, data, op, NULL);
4156                 else
4157                         rc = mdb_cursor_set(mc, key, data, op, &exact);
4158                 break;
4159         case MDB_GET_MULTIPLE:
4160                 if (data == NULL ||
4161                         !(mc->mc_db->md_flags & MDB_DUPFIXED) ||
4162                         !(mc->mc_flags & C_INITIALIZED)) {
4163                         rc = EINVAL;
4164                         break;
4165                 }
4166                 rc = MDB_SUCCESS;
4167                 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
4168                         (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
4169                         break;
4170                 goto fetchm;
4171         case MDB_NEXT_MULTIPLE:
4172                 if (data == NULL ||
4173                         !(mc->mc_db->md_flags & MDB_DUPFIXED)) {
4174                         rc = EINVAL;
4175                         break;
4176                 }
4177                 if (!(mc->mc_flags & C_INITIALIZED))
4178                         rc = mdb_cursor_first(mc, key, data);
4179                 else
4180                         rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
4181                 if (rc == MDB_SUCCESS) {
4182                         if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
4183                                 MDB_cursor *mx;
4184 fetchm:
4185                                 mx = &mc->mc_xcursor->mx_cursor;
4186                                 data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
4187                                         mx->mc_db->md_pad;
4188                                 data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
4189                                 mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
4190                         } else {
4191                                 rc = MDB_NOTFOUND;
4192                         }
4193                 }
4194                 break;
4195         case MDB_NEXT:
4196         case MDB_NEXT_DUP:
4197         case MDB_NEXT_NODUP:
4198                 if (!(mc->mc_flags & C_INITIALIZED))
4199                         rc = mdb_cursor_first(mc, key, data);
4200                 else
4201                         rc = mdb_cursor_next(mc, key, data, op);
4202                 break;
4203         case MDB_PREV:
4204         case MDB_PREV_DUP:
4205         case MDB_PREV_NODUP:
4206                 if (!(mc->mc_flags & C_INITIALIZED) || (mc->mc_flags & C_EOF)) {
4207                         rc = mdb_cursor_last(mc, key, data);
4208                         mc->mc_flags &= ~C_EOF;
4209                 } else
4210                         rc = mdb_cursor_prev(mc, key, data, op);
4211                 break;
4212         case MDB_FIRST:
4213                 rc = mdb_cursor_first(mc, key, data);
4214                 break;
4215         case MDB_FIRST_DUP:
4216                 if (data == NULL ||
4217                         !(mc->mc_db->md_flags & MDB_DUPSORT) ||
4218                         !(mc->mc_flags & C_INITIALIZED) ||
4219                         !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
4220                         rc = EINVAL;
4221                         break;
4222                 }
4223                 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
4224                 break;
4225         case MDB_LAST:
4226                 rc = mdb_cursor_last(mc, key, data);
4227                 break;
4228         case MDB_LAST_DUP:
4229                 if (data == NULL ||
4230                         !(mc->mc_db->md_flags & MDB_DUPSORT) ||
4231                         !(mc->mc_flags & C_INITIALIZED) ||
4232                         !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
4233                         rc = EINVAL;
4234                         break;
4235                 }
4236                 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
4237                 break;
4238         default:
4239                 DPRINTF("unhandled/unimplemented cursor operation %u", op);
4240                 rc = EINVAL;
4241                 break;
4242         }
4243
4244         return rc;
4245 }
4246
4247 /** Touch all the pages in the cursor stack.
4248  *      Makes sure all the pages are writable, before attempting a write operation.
4249  * @param[in] mc The cursor to operate on.
4250  */
4251 static int
4252 mdb_cursor_touch(MDB_cursor *mc)
4253 {
4254         int rc;
4255
4256         if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
4257                 MDB_cursor mc2;
4258                 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
4259                 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
4260                 if (rc)
4261                          return rc;
4262                 *mc->mc_dbflag = DB_DIRTY;
4263         }
4264         for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
4265                 rc = mdb_page_touch(mc);
4266                 if (rc)
4267                         return rc;
4268         }
4269         mc->mc_top = mc->mc_snum-1;
4270         return MDB_SUCCESS;
4271 }
4272
4273 int
4274 mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
4275     unsigned int flags)
4276 {
4277         MDB_node        *leaf = NULL;
4278         MDB_val xdata, *rdata, dkey;
4279         MDB_page        *fp;
4280         MDB_db dummy;
4281         int do_sub = 0, insert = 0;
4282         unsigned int mcount = 0;
4283         size_t nsize;
4284         int rc, rc2;
4285         MDB_pagebuf pbuf;
4286         char dbuf[MAXKEYSIZE+1];
4287         unsigned int nflags;
4288         DKBUF;
4289
4290         if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
4291                 return EACCES;
4292
4293         DPRINTF("==> put db %u key [%s], size %zu, data size %zu",
4294                 mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size);
4295
4296         dkey.mv_size = 0;
4297
4298         if (flags == MDB_CURRENT) {
4299                 if (!(mc->mc_flags & C_INITIALIZED))
4300                         return EINVAL;
4301                 rc = MDB_SUCCESS;
4302         } else if (mc->mc_db->md_root == P_INVALID) {
4303                 MDB_page *np;
4304                 /* new database, write a root leaf page */
4305                 DPUTS("allocating new root leaf page");
4306                 if ((np = mdb_page_new(mc, P_LEAF, 1)) == NULL) {
4307                         return ENOMEM;
4308                 }
4309                 mc->mc_snum = 0;
4310                 mdb_cursor_push(mc, np);
4311                 mc->mc_db->md_root = np->mp_pgno;
4312                 mc->mc_db->md_depth++;
4313                 *mc->mc_dbflag = DB_DIRTY;
4314                 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
4315                         == MDB_DUPFIXED)
4316                         np->mp_flags |= P_LEAF2;
4317                 mc->mc_flags |= C_INITIALIZED;
4318                 rc = MDB_NOTFOUND;
4319                 goto top;
4320         } else {
4321                 int exact = 0;
4322                 MDB_val d2;
4323                 if (flags & MDB_APPEND) {
4324                         MDB_val k2;
4325                         rc = mdb_cursor_last(mc, &k2, &d2);
4326                         if (rc == 0) {
4327                                 rc = mc->mc_dbx->md_cmp(key, &k2);
4328                                 if (rc > 0) {
4329                                         rc = MDB_NOTFOUND;
4330                                         mc->mc_ki[mc->mc_top]++;
4331                                 } else {
4332                                         rc = 0;
4333                                 }
4334                         }
4335                 } else {
4336                 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
4337                 }
4338                 if ((flags & MDB_NOOVERWRITE) && rc == 0) {
4339                         DPRINTF("duplicate key [%s]", DKEY(key));
4340                         *data = d2;
4341                         return MDB_KEYEXIST;
4342                 }
4343                 if (rc && rc != MDB_NOTFOUND)
4344                         return rc;
4345         }
4346
4347         /* Cursor is positioned, now make sure all pages are writable */
4348         rc2 = mdb_cursor_touch(mc);
4349         if (rc2)
4350                 return rc2;
4351
4352 top:
4353         /* The key already exists */
4354         if (rc == MDB_SUCCESS) {
4355                 /* there's only a key anyway, so this is a no-op */
4356                 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
4357                         unsigned int ksize = mc->mc_db->md_pad;
4358                         if (key->mv_size != ksize)
4359                                 return EINVAL;
4360                         if (flags == MDB_CURRENT) {
4361                                 char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
4362                                 memcpy(ptr, key->mv_data, ksize);
4363                         }
4364                         return MDB_SUCCESS;
4365                 }
4366
4367                 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4368
4369                 /* DB has dups? */
4370                 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
4371                         /* Was a single item before, must convert now */
4372 more:
4373                         if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
4374                                 /* Just overwrite the current item */
4375                                 if (flags == MDB_CURRENT)
4376                                         goto current;
4377
4378                                 dkey.mv_size = NODEDSZ(leaf);
4379                                 dkey.mv_data = NODEDATA(leaf);
4380 #if UINT_MAX < SIZE_MAX
4381                                 if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
4382 #ifdef MISALIGNED_OK
4383                                         mc->mc_dbx->md_dcmp = mdb_cmp_long;
4384 #else
4385                                         mc->mc_dbx->md_dcmp = mdb_cmp_cint;
4386 #endif
4387 #endif
4388                                 /* if data matches, ignore it */
4389                                 if (!mc->mc_dbx->md_dcmp(data, &dkey))
4390                                         return (flags == MDB_NODUPDATA) ? MDB_KEYEXIST : MDB_SUCCESS;
4391
4392                                 /* create a fake page for the dup items */
4393                                 memcpy(dbuf, dkey.mv_data, dkey.mv_size);
4394                                 dkey.mv_data = dbuf;
4395                                 fp = (MDB_page *)&pbuf;
4396                                 fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
4397                                 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
4398                                 fp->mp_lower = PAGEHDRSZ;
4399                                 fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
4400                                 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
4401                                         fp->mp_flags |= P_LEAF2;
4402                                         fp->mp_pad = data->mv_size;
4403                                         fp->mp_upper += 2 * data->mv_size;      /* leave space for 2 more */
4404                                 } else {
4405                                         fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
4406                                                 (dkey.mv_size & 1) + (data->mv_size & 1);
4407                                 }
4408                                 mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
4409                                 do_sub = 1;
4410                                 rdata = &xdata;
4411                                 xdata.mv_size = fp->mp_upper;
4412                                 xdata.mv_data = fp;
4413                                 flags |= F_DUPDATA;
4414                                 goto new_sub;
4415                         }
4416                         if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
4417                                 /* See if we need to convert from fake page to subDB */
4418                                 MDB_page *mp;
4419                                 unsigned int offset;
4420                                 unsigned int i;
4421
4422                                 fp = NODEDATA(leaf);
4423                                 if (flags == MDB_CURRENT) {
4424 reuse:
4425                                         fp->mp_flags |= P_DIRTY;
4426                                         COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
4427                                         mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
4428                                         flags |= F_DUPDATA;
4429                                         goto put_sub;
4430                                 }
4431                                 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
4432                                         offset = fp->mp_pad;
4433                                         if (SIZELEFT(fp) >= offset)
4434                                                 goto reuse;
4435                                         offset *= 4;    /* space for 4 more */
4436                                 } else {
4437                                         offset = NODESIZE + sizeof(indx_t) + data->mv_size;
4438                                 }
4439                                 offset += offset & 1;
4440                                 if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
4441                                         offset >= (mc->mc_txn->mt_env->me_psize - PAGEHDRSZ) /
4442                                                 MDB_MINKEYS) {
4443                                         /* yes, convert it */
4444                                         dummy.md_flags = 0;
4445                                         if (mc->mc_db->md_flags & MDB_DUPFIXED) {
4446                                                 dummy.md_pad = fp->mp_pad;
4447                                                 dummy.md_flags = MDB_DUPFIXED;
4448                                                 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
4449                                                         dummy.md_flags |= MDB_INTEGERKEY;
4450                                         }
4451                                         dummy.md_depth = 1;
4452                                         dummy.md_branch_pages = 0;
4453                                         dummy.md_leaf_pages = 1;
4454                                         dummy.md_overflow_pages = 0;
4455                                         dummy.md_entries = NUMKEYS(fp);
4456                                         rdata = &xdata;
4457                                         xdata.mv_size = sizeof(MDB_db);
4458                                         xdata.mv_data = &dummy;
4459                                         mp = mdb_page_alloc(mc, 1);
4460                                         if (!mp)
4461                                                 return ENOMEM;
4462                                         offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
4463                                         flags |= F_DUPDATA|F_SUBDATA;
4464                                         dummy.md_root = mp->mp_pgno;
4465                                 } else {
4466                                         /* no, just grow it */
4467                                         rdata = &xdata;
4468                                         xdata.mv_size = NODEDSZ(leaf) + offset;
4469                                         xdata.mv_data = &pbuf;
4470                                         mp = (MDB_page *)&pbuf;
4471                                         mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
4472                                         flags |= F_DUPDATA;
4473                                 }
4474                                 mp->mp_flags = fp->mp_flags | P_DIRTY;
4475                                 mp->mp_pad   = fp->mp_pad;
4476                                 mp->mp_lower = fp->mp_lower;
4477                                 mp->mp_upper = fp->mp_upper + offset;
4478                                 if (IS_LEAF2(fp)) {
4479                                         memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
4480                                 } else {
4481                                         nsize = NODEDSZ(leaf) - fp->mp_upper;
4482                                         memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
4483                                         for (i=0; i<NUMKEYS(fp); i++)
4484                                                 mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
4485                                 }
4486                                 mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
4487                                 do_sub = 1;
4488                                 goto new_sub;
4489                         }
4490                         /* data is on sub-DB, just store it */
4491                         flags |= F_DUPDATA|F_SUBDATA;
4492                         goto put_sub;
4493                 }
4494 current:
4495                 /* overflow page overwrites need special handling */
4496                 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
4497                         MDB_page *omp;
4498                         pgno_t pg;
4499                         int ovpages, dpages;
4500
4501                         ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
4502                         dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
4503                         memcpy(&pg, NODEDATA(leaf), sizeof(pg));
4504                         mdb_page_get(mc->mc_txn, pg, &omp);
4505                         /* Is the ov page writable and large enough? */
4506                         if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) {
4507                                 /* yes, overwrite it. Note in this case we don't
4508                                  * bother to try shrinking the node if the new data
4509                                  * is smaller than the overflow threshold.
4510                                  */
4511                                 if (F_ISSET(flags, MDB_RESERVE))
4512                                         data->mv_data = METADATA(omp);
4513                                 else
4514                                         memcpy(METADATA(omp), data->mv_data, data->mv_size);
4515                                 goto done;
4516                         } else {
4517                                 /* no, free ovpages */
4518                                 int i;
4519                                 mc->mc_db->md_overflow_pages -= ovpages;
4520                                 for (i=0; i<ovpages; i++) {
4521                                         DPRINTF("freed ov page %zu", pg);
4522                                         mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
4523                                         pg++;
4524                                 }
4525                         }
4526                 } else if (NODEDSZ(leaf) == data->mv_size) {
4527                         /* same size, just replace it. Note that we could
4528                          * also reuse this node if the new data is smaller,
4529                          * but instead we opt to shrink the node in that case.
4530                          */
4531                         if (F_ISSET(flags, MDB_RESERVE))
4532                                 data->mv_data = NODEDATA(leaf);
4533                         else
4534                                 memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
4535                         goto done;
4536                 }
4537                 mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
4538                 mc->mc_db->md_entries--;
4539         } else {
4540                 DPRINTF("inserting key at index %i", mc->mc_ki[mc->mc_top]);
4541                 insert = 1;
4542         }
4543
4544         rdata = data;
4545
4546 new_sub:
4547         nflags = flags & NODE_ADD_FLAGS;
4548         nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
4549         if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
4550                 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
4551                         nflags &= ~MDB_APPEND;
4552                 if (!insert)
4553                         nflags |= MDB_SPLIT_REPLACE;
4554                 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
4555         } else {
4556                 /* There is room already in this leaf page. */
4557                 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
4558                 if (rc == 0 && !do_sub && insert) {
4559                         /* Adjust other cursors pointing to mp */
4560                         MDB_cursor *m2, *m3;
4561                         MDB_dbi dbi = mc->mc_dbi;
4562                         unsigned i = mc->mc_top;
4563                         MDB_page *mp = mc->mc_pg[i];
4564
4565                         if (mc->mc_flags & C_SUB)
4566                                 dbi--;
4567
4568                         for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
4569                                 if (mc->mc_flags & C_SUB)
4570                                         m3 = &m2->mc_xcursor->mx_cursor;
4571                                 else
4572                                         m3 = m2;
4573                                 if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
4574                                 if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
4575                                         m3->mc_ki[i]++;
4576                                 }
4577                         }
4578                 }
4579         }
4580
4581         if (rc != MDB_SUCCESS)
4582                 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
4583         else {
4584                 /* Now store the actual data in the child DB. Note that we're
4585                  * storing the user data in the keys field, so there are strict
4586                  * size limits on dupdata. The actual data fields of the child
4587                  * DB are all zero size.
4588                  */
4589                 if (do_sub) {
4590                         int xflags;
4591 put_sub:
4592                         xdata.mv_size = 0;
4593                         xdata.mv_data = "";
4594                         leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4595                         if (flags & MDB_CURRENT) {
4596                                 xflags = MDB_CURRENT;
4597                         } else {
4598                                 mdb_xcursor_init1(mc, leaf);
4599                                 xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE : 0;
4600                         }
4601                         /* converted, write the original data first */
4602                         if (dkey.mv_size) {
4603                                 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
4604                                 if (rc)
4605                                         return rc;
4606                                 {
4607                                         /* Adjust other cursors pointing to mp */
4608                                         MDB_cursor *m2;
4609                                         unsigned i = mc->mc_top;
4610                                         MDB_page *mp = mc->mc_pg[i];
4611
4612                                         for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
4613                                                 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
4614                                                 if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
4615                                                         mdb_xcursor_init1(m2, leaf);
4616                                                 }
4617                                         }
4618                                 }
4619                         }
4620                         if (flags & MDB_APPENDDUP)
4621                                 xflags |= MDB_APPEND;
4622                         rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
4623                         if (flags & F_SUBDATA) {
4624                                 void *db = NODEDATA(leaf);
4625                                 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
4626                         }
4627                 }
4628                 /* sub-writes might have failed so check rc again.
4629                  * Don't increment count if we just replaced an existing item.
4630                  */
4631                 if (!rc && !(flags & MDB_CURRENT))
4632                         mc->mc_db->md_entries++;
4633                 if (flags & MDB_MULTIPLE) {
4634                         mcount++;
4635                         if (mcount < data[1].mv_size) {
4636                                 data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
4637                                 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4638                                 goto more;
4639                         }
4640                 }
4641         }
4642 done:
4643         return rc;
4644 }
4645
4646 int
4647 mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
4648 {
4649         MDB_node        *leaf;
4650         int rc;
4651
4652         if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
4653                 return EACCES;
4654
4655         if (!mc->mc_flags & C_INITIALIZED)
4656                 return EINVAL;
4657
4658         rc = mdb_cursor_touch(mc);
4659         if (rc)
4660                 return rc;
4661
4662         leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4663
4664         if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
4665                 if (flags != MDB_NODUPDATA) {
4666                         if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
4667                                 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
4668                         }
4669                         rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, 0);
4670                         /* If sub-DB still has entries, we're done */
4671                         if (mc->mc_xcursor->mx_db.md_entries) {
4672                                 if (leaf->mn_flags & F_SUBDATA) {
4673                                         /* update subDB info */
4674                                         void *db = NODEDATA(leaf);
4675                                         memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
4676                                 } else {
4677                                         /* shrink fake page */
4678                                         mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4679                                 }
4680                                 mc->mc_db->md_entries--;
4681                                 return rc;
4682                         }
4683                         /* otherwise fall thru and delete the sub-DB */
4684                 }
4685
4686                 if (leaf->mn_flags & F_SUBDATA) {
4687                         /* add all the child DB's pages to the free list */
4688                         rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
4689                         if (rc == MDB_SUCCESS) {
4690                                 mc->mc_db->md_entries -=
4691                                         mc->mc_xcursor->mx_db.md_entries;
4692                         }
4693                 }
4694         }
4695
4696         return mdb_cursor_del0(mc, leaf);
4697 }
4698
4699 /** Allocate and initialize new pages for a database.
4700  * @param[in] mc a cursor on the database being added to.
4701  * @param[in] flags flags defining what type of page is being allocated.
4702  * @param[in] num the number of pages to allocate. This is usually 1,
4703  * unless allocating overflow pages for a large record.
4704  * @return Address of a page, or NULL on failure.
4705  */
4706 static MDB_page *
4707 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num)
4708 {
4709         MDB_page        *np;
4710
4711         if ((np = mdb_page_alloc(mc, num)) == NULL)
4712                 return NULL;
4713         DPRINTF("allocated new mpage %zu, page size %u",
4714             np->mp_pgno, mc->mc_txn->mt_env->me_psize);
4715         np->mp_flags = flags | P_DIRTY;
4716         np->mp_lower = PAGEHDRSZ;
4717         np->mp_upper = mc->mc_txn->mt_env->me_psize;
4718
4719         if (IS_BRANCH(np))
4720                 mc->mc_db->md_branch_pages++;
4721         else if (IS_LEAF(np))
4722                 mc->mc_db->md_leaf_pages++;
4723         else if (IS_OVERFLOW(np)) {
4724                 mc->mc_db->md_overflow_pages += num;
4725                 np->mp_pages = num;
4726         }
4727
4728         return np;
4729 }
4730
4731 /** Calculate the size of a leaf node.
4732  * The size depends on the environment's page size; if a data item
4733  * is too large it will be put onto an overflow page and the node
4734  * size will only include the key and not the data. Sizes are always
4735  * rounded up to an even number of bytes, to guarantee 2-byte alignment
4736  * of the #MDB_node headers.
4737  * @param[in] env The environment handle.
4738  * @param[in] key The key for the node.
4739  * @param[in] data The data for the node.
4740  * @return The number of bytes needed to store the node.
4741  */
4742 static size_t
4743 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data)
4744 {
4745         size_t           sz;
4746
4747         sz = LEAFSIZE(key, data);
4748         if (sz >= env->me_psize / MDB_MINKEYS) {
4749                 /* put on overflow page */
4750                 sz -= data->mv_size - sizeof(pgno_t);
4751         }
4752         sz += sz & 1;
4753
4754         return sz + sizeof(indx_t);
4755 }
4756
4757 /** Calculate the size of a branch node.
4758  * The size should depend on the environment's page size but since
4759  * we currently don't support spilling large keys onto overflow
4760  * pages, it's simply the size of the #MDB_node header plus the
4761  * size of the key. Sizes are always rounded up to an even number
4762  * of bytes, to guarantee 2-byte alignment of the #MDB_node headers.
4763  * @param[in] env The environment handle.
4764  * @param[in] key The key for the node.
4765  * @return The number of bytes needed to store the node.
4766  */
4767 static size_t
4768 mdb_branch_size(MDB_env *env, MDB_val *key)
4769 {
4770         size_t           sz;
4771
4772         sz = INDXSIZE(key);
4773         if (sz >= env->me_psize / MDB_MINKEYS) {
4774                 /* put on overflow page */
4775                 /* not implemented */
4776                 /* sz -= key->size - sizeof(pgno_t); */
4777         }
4778
4779         return sz + sizeof(indx_t);
4780 }
4781
4782 /** Add a node to the page pointed to by the cursor.
4783  * @param[in] mc The cursor for this operation.
4784  * @param[in] indx The index on the page where the new node should be added.
4785  * @param[in] key The key for the new node.
4786  * @param[in] data The data for the new node, if any.
4787  * @param[in] pgno The page number, if adding a branch node.
4788  * @param[in] flags Flags for the node.
4789  * @return 0 on success, non-zero on failure. Possible errors are:
4790  * <ul>
4791  *      <li>ENOMEM - failed to allocate overflow pages for the node.
4792  *      <li>ENOSPC - there is insufficient room in the page. This error
4793  *      should never happen since all callers already calculate the
4794  *      page's free space before calling this function.
4795  * </ul>
4796  */
4797 static int
4798 mdb_node_add(MDB_cursor *mc, indx_t indx,
4799     MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
4800 {
4801         unsigned int     i;
4802         size_t           node_size = NODESIZE;
4803         indx_t           ofs;
4804         MDB_node        *node;
4805         MDB_page        *mp = mc->mc_pg[mc->mc_top];
4806         MDB_page        *ofp = NULL;            /* overflow page */
4807         DKBUF;
4808
4809         assert(mp->mp_upper >= mp->mp_lower);
4810
4811         DPRINTF("add to %s %spage %zu index %i, data size %zu key size %zu [%s]",
4812             IS_LEAF(mp) ? "leaf" : "branch",
4813                 IS_SUBP(mp) ? "sub-" : "",
4814             mp->mp_pgno, indx, data ? data->mv_size : 0,
4815                 key ? key->mv_size : 0, key ? DKEY(key) : NULL);
4816
4817         if (IS_LEAF2(mp)) {
4818                 /* Move higher keys up one slot. */
4819                 int ksize = mc->mc_db->md_pad, dif;
4820                 char *ptr = LEAF2KEY(mp, indx, ksize);
4821                 dif = NUMKEYS(mp) - indx;
4822                 if (dif > 0)
4823                         memmove(ptr+ksize, ptr, dif*ksize);
4824                 /* insert new key */
4825                 memcpy(ptr, key->mv_data, ksize);
4826
4827                 /* Just using these for counting */
4828                 mp->mp_lower += sizeof(indx_t);
4829                 mp->mp_upper -= ksize - sizeof(indx_t);
4830                 return MDB_SUCCESS;
4831         }
4832
4833         if (key != NULL)
4834                 node_size += key->mv_size;
4835
4836         if (IS_LEAF(mp)) {
4837                 assert(data);
4838                 if (F_ISSET(flags, F_BIGDATA)) {
4839                         /* Data already on overflow page. */
4840                         node_size += sizeof(pgno_t);
4841                 } else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_psize / MDB_MINKEYS) {
4842                         int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
4843                         /* Put data on overflow page. */
4844                         DPRINTF("data size is %zu, node would be %zu, put data on overflow page",
4845                             data->mv_size, node_size+data->mv_size);
4846                         node_size += sizeof(pgno_t);
4847                         if ((ofp = mdb_page_new(mc, P_OVERFLOW, ovpages)) == NULL)
4848                                 return ENOMEM;
4849                         DPRINTF("allocated overflow page %zu", ofp->mp_pgno);
4850                         flags |= F_BIGDATA;
4851                 } else {
4852                         node_size += data->mv_size;
4853                 }
4854         }
4855         node_size += node_size & 1;
4856
4857         if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
4858                 DPRINTF("not enough room in page %zu, got %u ptrs",
4859                     mp->mp_pgno, NUMKEYS(mp));
4860                 DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
4861                     mp->mp_upper - mp->mp_lower);
4862                 DPRINTF("node size = %zu", node_size);
4863                 return ENOSPC;
4864         }
4865
4866         /* Move higher pointers up one slot. */
4867         for (i = NUMKEYS(mp); i > indx; i--)
4868                 mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
4869
4870         /* Adjust free space offsets. */
4871         ofs = mp->mp_upper - node_size;
4872         assert(ofs >= mp->mp_lower + sizeof(indx_t));
4873         mp->mp_ptrs[indx] = ofs;
4874         mp->mp_upper = ofs;
4875         mp->mp_lower += sizeof(indx_t);
4876
4877         /* Write the node data. */
4878         node = NODEPTR(mp, indx);
4879         node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
4880         node->mn_flags = flags;
4881         if (IS_LEAF(mp))
4882                 SETDSZ(node,data->mv_size);
4883         else
4884                 SETPGNO(node,pgno);
4885
4886         if (key)
4887                 memcpy(NODEKEY(node), key->mv_data, key->mv_size);
4888
4889         if (IS_LEAF(mp)) {
4890                 assert(key);
4891                 if (ofp == NULL) {
4892                         if (F_ISSET(flags, F_BIGDATA))
4893                                 memcpy(node->mn_data + key->mv_size, data->mv_data,
4894                                     sizeof(pgno_t));
4895                         else if (F_ISSET(flags, MDB_RESERVE))
4896                                 data->mv_data = node->mn_data + key->mv_size;
4897                         else
4898                                 memcpy(node->mn_data + key->mv_size, data->mv_data,
4899                                     data->mv_size);
4900                 } else {
4901                         memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno,
4902                             sizeof(pgno_t));
4903                         if (F_ISSET(flags, MDB_RESERVE))
4904                                 data->mv_data = METADATA(ofp);
4905                         else
4906                                 memcpy(METADATA(ofp), data->mv_data, data->mv_size);
4907                 }
4908         }
4909
4910         return MDB_SUCCESS;
4911 }
4912
4913 /** Delete the specified node from a page.
4914  * @param[in] mp The page to operate on.
4915  * @param[in] indx The index of the node to delete.
4916  * @param[in] ksize The size of a node. Only used if the page is
4917  * part of a #MDB_DUPFIXED database.
4918  */
4919 static void
4920 mdb_node_del(MDB_page *mp, indx_t indx, int ksize)
4921 {
4922         unsigned int     sz;
4923         indx_t           i, j, numkeys, ptr;
4924         MDB_node        *node;
4925         char            *base;
4926
4927 #if MDB_DEBUG
4928         {
4929         pgno_t pgno;
4930         COPY_PGNO(pgno, mp->mp_pgno);
4931         DPRINTF("delete node %u on %s page %zu", indx,
4932             IS_LEAF(mp) ? "leaf" : "branch", pgno);
4933         }
4934 #endif
4935         assert(indx < NUMKEYS(mp));
4936
4937         if (IS_LEAF2(mp)) {
4938                 int x = NUMKEYS(mp) - 1 - indx;
4939                 base = LEAF2KEY(mp, indx, ksize);
4940                 if (x)
4941                         memmove(base, base + ksize, x * ksize);
4942                 mp->mp_lower -= sizeof(indx_t);
4943                 mp->mp_upper += ksize - sizeof(indx_t);
4944                 return;
4945         }
4946
4947         node = NODEPTR(mp, indx);
4948         sz = NODESIZE + node->mn_ksize;
4949         if (IS_LEAF(mp)) {
4950                 if (F_ISSET(node->mn_flags, F_BIGDATA))
4951                         sz += sizeof(pgno_t);
4952                 else
4953                         sz += NODEDSZ(node);
4954         }
4955         sz += sz & 1;
4956
4957         ptr = mp->mp_ptrs[indx];
4958         numkeys = NUMKEYS(mp);
4959         for (i = j = 0; i < numkeys; i++) {
4960                 if (i != indx) {
4961                         mp->mp_ptrs[j] = mp->mp_ptrs[i];
4962                         if (mp->mp_ptrs[i] < ptr)
4963                                 mp->mp_ptrs[j] += sz;
4964                         j++;
4965                 }
4966         }
4967
4968         base = (char *)mp + mp->mp_upper;
4969         memmove(base + sz, base, ptr - mp->mp_upper);
4970
4971         mp->mp_lower -= sizeof(indx_t);
4972         mp->mp_upper += sz;
4973 }
4974
4975 /** Compact the main page after deleting a node on a subpage.
4976  * @param[in] mp The main page to operate on.
4977  * @param[in] indx The index of the subpage on the main page.
4978  */
4979 static void
4980 mdb_node_shrink(MDB_page *mp, indx_t indx)
4981 {
4982         MDB_node *node;
4983         MDB_page *sp, *xp;
4984         char *base;
4985         int osize, nsize;
4986         int delta;
4987         indx_t           i, numkeys, ptr;
4988
4989         node = NODEPTR(mp, indx);
4990         sp = (MDB_page *)NODEDATA(node);
4991         osize = NODEDSZ(node);
4992
4993         delta = sp->mp_upper - sp->mp_lower;
4994         SETDSZ(node, osize - delta);
4995         xp = (MDB_page *)((char *)sp + delta);
4996
4997         /* shift subpage upward */
4998         if (IS_LEAF2(sp)) {
4999                 nsize = NUMKEYS(sp) * sp->mp_pad;
5000                 memmove(METADATA(xp), METADATA(sp), nsize);
5001         } else {
5002                 int i;
5003                 nsize = osize - sp->mp_upper;
5004                 numkeys = NUMKEYS(sp);
5005                 for (i=numkeys-1; i>=0; i--)
5006                         xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
5007         }
5008         xp->mp_upper = sp->mp_lower;
5009         xp->mp_lower = sp->mp_lower;
5010         xp->mp_flags = sp->mp_flags;
5011         xp->mp_pad = sp->mp_pad;
5012         COPY_PGNO(xp->mp_pgno, mp->mp_pgno);
5013
5014         /* shift lower nodes upward */
5015         ptr = mp->mp_ptrs[indx];
5016         numkeys = NUMKEYS(mp);
5017         for (i = 0; i < numkeys; i++) {
5018                 if (mp->mp_ptrs[i] <= ptr)
5019                         mp->mp_ptrs[i] += delta;
5020         }
5021
5022         base = (char *)mp + mp->mp_upper;
5023         memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
5024         mp->mp_upper += delta;
5025 }
5026
5027 /** Initial setup of a sorted-dups cursor.
5028  * Sorted duplicates are implemented as a sub-database for the given key.
5029  * The duplicate data items are actually keys of the sub-database.
5030  * Operations on the duplicate data items are performed using a sub-cursor
5031  * initialized when the sub-database is first accessed. This function does
5032  * the preliminary setup of the sub-cursor, filling in the fields that
5033  * depend only on the parent DB.
5034  * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
5035  */
5036 static void
5037 mdb_xcursor_init0(MDB_cursor *mc)
5038 {
5039         MDB_xcursor *mx = mc->mc_xcursor;
5040
5041         mx->mx_cursor.mc_xcursor = NULL;
5042         mx->mx_cursor.mc_txn = mc->mc_txn;
5043         mx->mx_cursor.mc_db = &mx->mx_db;
5044         mx->mx_cursor.mc_dbx = &mx->mx_dbx;
5045         mx->mx_cursor.mc_dbi = mc->mc_dbi+1;
5046         mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
5047         mx->mx_cursor.mc_snum = 0;
5048         mx->mx_cursor.mc_top = 0;
5049         mx->mx_cursor.mc_flags = C_SUB;
5050         mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
5051         mx->mx_dbx.md_dcmp = NULL;
5052         mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
5053 }
5054
5055 /** Final setup of a sorted-dups cursor.
5056  *      Sets up the fields that depend on the data from the main cursor.
5057  * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
5058  * @param[in] node The data containing the #MDB_db record for the
5059  * sorted-dup database.
5060  */
5061 static void
5062 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
5063 {
5064         MDB_xcursor *mx = mc->mc_xcursor;
5065
5066         if (node->mn_flags & F_SUBDATA) {
5067                 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
5068                 mx->mx_cursor.mc_pg[0] = 0;
5069                 mx->mx_cursor.mc_snum = 0;
5070                 mx->mx_cursor.mc_flags = C_SUB;
5071         } else {
5072                 MDB_page *fp = NODEDATA(node);
5073                 mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad;
5074                 mx->mx_db.md_flags = 0;
5075                 mx->mx_db.md_depth = 1;
5076                 mx->mx_db.md_branch_pages = 0;
5077                 mx->mx_db.md_leaf_pages = 1;
5078                 mx->mx_db.md_overflow_pages = 0;
5079                 mx->mx_db.md_entries = NUMKEYS(fp);
5080                 COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
5081                 mx->mx_cursor.mc_snum = 1;
5082                 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
5083                 mx->mx_cursor.mc_top = 0;
5084                 mx->mx_cursor.mc_pg[0] = fp;
5085                 mx->mx_cursor.mc_ki[0] = 0;
5086                 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5087                         mx->mx_db.md_flags = MDB_DUPFIXED;
5088                         mx->mx_db.md_pad = fp->mp_pad;
5089                         if (mc->mc_db->md_flags & MDB_INTEGERDUP)
5090                                 mx->mx_db.md_flags |= MDB_INTEGERKEY;
5091                 }
5092         }
5093         DPRINTF("Sub-db %u for db %u root page %zu", mx->mx_cursor.mc_dbi, mc->mc_dbi,
5094                 mx->mx_db.md_root);
5095         mx->mx_dbflag = (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY)) ?
5096                 DB_DIRTY : 0;
5097         mx->mx_dbx.md_name.mv_data = NODEKEY(node);
5098         mx->mx_dbx.md_name.mv_size = node->mn_ksize;
5099 #if UINT_MAX < SIZE_MAX
5100         if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
5101 #ifdef MISALIGNED_OK
5102                 mx->mx_dbx.md_cmp = mdb_cmp_long;
5103 #else
5104                 mx->mx_dbx.md_cmp = mdb_cmp_cint;
5105 #endif
5106 #endif
5107 }
5108
5109 /** Initialize a cursor for a given transaction and database. */
5110 static void
5111 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
5112 {
5113         mc->mc_orig = NULL;
5114         mc->mc_dbi = dbi;
5115         mc->mc_txn = txn;
5116         mc->mc_db = &txn->mt_dbs[dbi];
5117         mc->mc_dbx = &txn->mt_dbxs[dbi];
5118         mc->mc_dbflag = &txn->mt_dbflags[dbi];
5119         mc->mc_snum = 0;
5120         mc->mc_top = 0;
5121         mc->mc_pg[0] = 0;
5122         mc->mc_flags = 0;
5123         if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
5124                 assert(mx != NULL);
5125                 mc->mc_xcursor = mx;
5126                 mdb_xcursor_init0(mc);
5127         } else {
5128                 mc->mc_xcursor = NULL;
5129         }
5130         if (*mc->mc_dbflag & DB_STALE) {
5131                 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY);
5132         }
5133 }
5134
5135 int
5136 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
5137 {
5138         MDB_cursor      *mc;
5139         MDB_xcursor     *mx = NULL;
5140         size_t size = sizeof(MDB_cursor);
5141
5142         if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs)
5143                 return EINVAL;
5144
5145         /* Allow read access to the freelist */
5146         if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
5147                 return EINVAL;
5148
5149         if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)
5150                 size += sizeof(MDB_xcursor);
5151
5152         if ((mc = malloc(size)) != NULL) {
5153                 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
5154                         mx = (MDB_xcursor *)(mc + 1);
5155                 }
5156                 mdb_cursor_init(mc, txn, dbi, mx);
5157                 if (txn->mt_cursors) {
5158                         mc->mc_next = txn->mt_cursors[dbi];
5159                         txn->mt_cursors[dbi] = mc;
5160                 }
5161                 mc->mc_flags |= C_ALLOCD;
5162         } else {
5163                 return ENOMEM;
5164         }
5165
5166         *ret = mc;
5167
5168         return MDB_SUCCESS;
5169 }
5170
5171 /* Return the count of duplicate data items for the current key */
5172 int
5173 mdb_cursor_count(MDB_cursor *mc, size_t *countp)
5174 {
5175         MDB_node        *leaf;
5176
5177         if (mc == NULL || countp == NULL)
5178                 return EINVAL;
5179
5180         if (!(mc->mc_db->md_flags & MDB_DUPSORT))
5181                 return EINVAL;
5182
5183         leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5184         if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5185                 *countp = 1;
5186         } else {
5187                 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
5188                         return EINVAL;
5189
5190                 *countp = mc->mc_xcursor->mx_db.md_entries;
5191         }
5192         return MDB_SUCCESS;
5193 }
5194
5195 void
5196 mdb_cursor_close(MDB_cursor *mc)
5197 {
5198         if (mc != NULL) {
5199                 /* remove from txn, if tracked */
5200                 if (mc->mc_txn->mt_cursors) {
5201                         MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
5202                         while (*prev && *prev != mc) prev = &(*prev)->mc_next;
5203                         if (*prev == mc)
5204                                 *prev = mc->mc_next;
5205                 }
5206                 if (mc->mc_flags & C_ALLOCD)
5207                         free(mc);
5208         }
5209 }
5210
5211 MDB_txn *
5212 mdb_cursor_txn(MDB_cursor *mc)
5213 {
5214         if (!mc) return NULL;
5215         return mc->mc_txn;
5216 }
5217
5218 MDB_dbi
5219 mdb_cursor_dbi(MDB_cursor *mc)
5220 {
5221         if (!mc) return 0;
5222         return mc->mc_dbi;
5223 }
5224
5225 /** Replace the key for a node with a new key.
5226  * @param[in] mp The page containing the node to operate on.
5227  * @param[in] indx The index of the node to operate on.
5228  * @param[in] key The new key to use.
5229  * @return 0 on success, non-zero on failure.
5230  */
5231 static int
5232 mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key)
5233 {
5234         MDB_node                *node;
5235         char                    *base;
5236         size_t                   len;
5237         int                      delta, delta0;
5238         indx_t                   ptr, i, numkeys;
5239         DKBUF;
5240
5241         node = NODEPTR(mp, indx);
5242         ptr = mp->mp_ptrs[indx];
5243 #if MDB_DEBUG
5244         {
5245                 MDB_val k2;
5246                 char kbuf2[(MAXKEYSIZE*2+1)];
5247                 k2.mv_data = NODEKEY(node);
5248                 k2.mv_size = node->mn_ksize;
5249                 DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu",
5250                         indx, ptr,
5251                         mdb_dkey(&k2, kbuf2),
5252                         DKEY(key),
5253                         mp->mp_pgno);
5254         }
5255 #endif
5256
5257         delta0 = delta = key->mv_size - node->mn_ksize;
5258
5259         /* Must be 2-byte aligned. If new key is
5260          * shorter by 1, the shift will be skipped.
5261          */
5262         delta += (delta & 1);
5263         if (delta) {
5264                 if (delta > 0 && SIZELEFT(mp) < delta) {
5265                         DPRINTF("OUCH! Not enough room, delta = %d", delta);
5266                         return ENOSPC;
5267                 }
5268
5269                 numkeys = NUMKEYS(mp);
5270                 for (i = 0; i < numkeys; i++) {
5271                         if (mp->mp_ptrs[i] <= ptr)
5272                                 mp->mp_ptrs[i] -= delta;
5273                 }
5274
5275                 base = (char *)mp + mp->mp_upper;
5276                 len = ptr - mp->mp_upper + NODESIZE;
5277                 memmove(base - delta, base, len);
5278                 mp->mp_upper -= delta;
5279
5280                 node = NODEPTR(mp, indx);
5281         }
5282
5283         /* But even if no shift was needed, update ksize */
5284         if (delta0)
5285                 node->mn_ksize = key->mv_size;
5286
5287         if (key->mv_size)
5288                 memcpy(NODEKEY(node), key->mv_data, key->mv_size);
5289
5290         return MDB_SUCCESS;
5291 }
5292
5293 /** Move a node from csrc to cdst.
5294  */
5295 static int
5296 mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
5297 {
5298         int                      rc;
5299         MDB_node                *srcnode;
5300         MDB_val          key, data;
5301         pgno_t  srcpg;
5302         unsigned short flags;
5303
5304         DKBUF;
5305
5306         /* Mark src and dst as dirty. */
5307         if ((rc = mdb_page_touch(csrc)) ||
5308             (rc = mdb_page_touch(cdst)))
5309                 return rc;
5310
5311         if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
5312                 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);        /* fake */
5313                 key.mv_size = csrc->mc_db->md_pad;
5314                 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
5315                 data.mv_size = 0;
5316                 data.mv_data = NULL;
5317                 srcpg = 0;
5318                 flags = 0;
5319         } else {
5320                 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
5321                 assert(!((long)srcnode&1));
5322                 srcpg = NODEPGNO(srcnode);
5323                 flags = srcnode->mn_flags;
5324                 if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
5325                         unsigned int snum = csrc->mc_snum;
5326                         MDB_node *s2;
5327                         /* must find the lowest key below src */
5328                         mdb_page_search_root(csrc, NULL, 0);
5329                         if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
5330                                 key.mv_size = csrc->mc_db->md_pad;
5331                                 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
5332                         } else {
5333                                 s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
5334                                 key.mv_size = NODEKSZ(s2);
5335                                 key.mv_data = NODEKEY(s2);
5336                         }
5337                         csrc->mc_snum = snum--;
5338                         csrc->mc_top = snum;
5339                 } else {
5340                         key.mv_size = NODEKSZ(srcnode);
5341                         key.mv_data = NODEKEY(srcnode);
5342                 }
5343                 data.mv_size = NODEDSZ(srcnode);
5344                 data.mv_data = NODEDATA(srcnode);
5345         }
5346         if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
5347                 unsigned int snum = cdst->mc_snum;
5348                 MDB_node *s2;
5349                 MDB_val bkey;
5350                 /* must find the lowest key below dst */
5351                 mdb_page_search_root(cdst, NULL, 0);
5352                 if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) {
5353                         bkey.mv_size = cdst->mc_db->md_pad;
5354                         bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size);
5355                 } else {
5356                         s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
5357                         bkey.mv_size = NODEKSZ(s2);
5358                         bkey.mv_data = NODEKEY(s2);
5359                 }
5360                 cdst->mc_snum = snum--;
5361                 cdst->mc_top = snum;
5362                 rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &bkey);
5363         }
5364
5365         DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu",
5366             IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
5367             csrc->mc_ki[csrc->mc_top],
5368                 DKEY(&key),
5369             csrc->mc_pg[csrc->mc_top]->mp_pgno,
5370             cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno);
5371
5372         /* Add the node to the destination page.
5373          */
5374         rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
5375         if (rc != MDB_SUCCESS)
5376                 return rc;
5377
5378         /* Delete the node from the source page.
5379          */
5380         mdb_node_del(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
5381
5382         {
5383                 /* Adjust other cursors pointing to mp */
5384                 MDB_cursor *m2, *m3;
5385                 MDB_dbi dbi = csrc->mc_dbi;
5386                 MDB_page *mp = csrc->mc_pg[csrc->mc_top];
5387
5388                 if (csrc->mc_flags & C_SUB)
5389                         dbi--;
5390
5391                 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
5392                         if (m2 == csrc) continue;
5393                         if (csrc->mc_flags & C_SUB)
5394                                 m3 = &m2->mc_xcursor->mx_cursor;
5395                         else
5396                                 m3 = m2;
5397                         if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
5398                                 csrc->mc_ki[csrc->mc_top]) {
5399                                 m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
5400                                 m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
5401                         }
5402                 }
5403         }
5404
5405         /* Update the parent separators.
5406          */
5407         if (csrc->mc_ki[csrc->mc_top] == 0) {
5408                 if (csrc->mc_ki[csrc->mc_top-1] != 0) {
5409                         if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
5410                                 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
5411                         } else {
5412                                 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
5413                                 key.mv_size = NODEKSZ(srcnode);
5414                                 key.mv_data = NODEKEY(srcnode);
5415                         }
5416                         DPRINTF("update separator for source page %zu to [%s]",
5417                                 csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key));
5418                         if ((rc = mdb_update_key(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1],
5419                                 &key)) != MDB_SUCCESS)
5420                                 return rc;
5421                 }
5422                 if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
5423                         MDB_val  nullkey;
5424                         nullkey.mv_size = 0;
5425                         rc = mdb_update_key(csrc->mc_pg[csrc->mc_top], 0, &nullkey);
5426                         assert(rc == MDB_SUCCESS);
5427                 }
5428         }
5429
5430         if (cdst->mc_ki[cdst->mc_top] == 0) {
5431                 if (cdst->mc_ki[cdst->mc_top-1] != 0) {
5432                         if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
5433                                 key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
5434                         } else {
5435                                 srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
5436                                 key.mv_size = NODEKSZ(srcnode);
5437                                 key.mv_data = NODEKEY(srcnode);
5438                         }
5439                         DPRINTF("update separator for destination page %zu to [%s]",
5440                                 cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key));
5441                         if ((rc = mdb_update_key(cdst->mc_pg[cdst->mc_top-1], cdst->mc_ki[cdst->mc_top-1],
5442                                 &key)) != MDB_SUCCESS)
5443                                 return rc;
5444                 }
5445                 if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
5446                         MDB_val  nullkey;
5447                         nullkey.mv_size = 0;
5448                         rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &nullkey);
5449                         assert(rc == MDB_SUCCESS);
5450                 }
5451         }
5452
5453         return MDB_SUCCESS;
5454 }
5455
5456 /** Merge one page into another.
5457  *  The nodes from the page pointed to by \b csrc will
5458  *      be copied to the page pointed to by \b cdst and then
5459  *      the \b csrc page will be freed.
5460  * @param[in] csrc Cursor pointing to the source page.
5461  * @param[in] cdst Cursor pointing to the destination page.
5462  */
5463 static int
5464 mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
5465 {
5466         int                      rc;
5467         indx_t                   i, j;
5468         MDB_node                *srcnode;
5469         MDB_val          key, data;
5470         unsigned        nkeys;
5471
5472         DPRINTF("merging page %zu into %zu", csrc->mc_pg[csrc->mc_top]->mp_pgno,
5473                 cdst->mc_pg[cdst->mc_top]->mp_pgno);
5474
5475         assert(csrc->mc_snum > 1);      /* can't merge root page */
5476         assert(cdst->mc_snum > 1);
5477
5478         /* Mark dst as dirty. */
5479         if ((rc = mdb_page_touch(cdst)))
5480                 return rc;
5481
5482         /* Move all nodes from src to dst.
5483          */
5484         j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]);
5485         if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
5486                 key.mv_size = csrc->mc_db->md_pad;
5487                 key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]);
5488                 for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
5489                         rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
5490                         if (rc != MDB_SUCCESS)
5491                                 return rc;
5492                         key.mv_data = (char *)key.mv_data + key.mv_size;
5493                 }
5494         } else {
5495                 for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
5496                         srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
5497                         if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
5498                                 unsigned int snum = csrc->mc_snum;
5499                                 MDB_node *s2;
5500                                 /* must find the lowest key below src */
5501                                 mdb_page_search_root(csrc, NULL, 0);
5502                                 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
5503                                         key.mv_size = csrc->mc_db->md_pad;
5504                                         key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
5505                                 } else {
5506                                         s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
5507                                         key.mv_size = NODEKSZ(s2);
5508                                         key.mv_data = NODEKEY(s2);
5509                                 }
5510                                 csrc->mc_snum = snum--;
5511                                 csrc->mc_top = snum;
5512                         } else {
5513                                 key.mv_size = srcnode->mn_ksize;
5514                                 key.mv_data = NODEKEY(srcnode);
5515                         }
5516
5517                         data.mv_size = NODEDSZ(srcnode);
5518                         data.mv_data = NODEDATA(srcnode);
5519                         rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
5520                         if (rc != MDB_SUCCESS)
5521                                 return rc;
5522                 }
5523         }
5524
5525         DPRINTF("dst page %zu now has %u keys (%.1f%% filled)",
5526             cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10);
5527
5528         /* Unlink the src page from parent and add to free list.
5529          */
5530         mdb_node_del(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1], 0);
5531         if (csrc->mc_ki[csrc->mc_top-1] == 0) {
5532                 key.mv_size = 0;
5533                 if ((rc = mdb_update_key(csrc->mc_pg[csrc->mc_top-1], 0, &key)) != MDB_SUCCESS)
5534                         return rc;
5535         }
5536
5537         mdb_midl_append(&csrc->mc_txn->mt_free_pgs, csrc->mc_pg[csrc->mc_top]->mp_pgno);
5538         if (IS_LEAF(csrc->mc_pg[csrc->mc_top]))
5539                 csrc->mc_db->md_leaf_pages--;
5540         else
5541                 csrc->mc_db->md_branch_pages--;
5542         {
5543                 /* Adjust other cursors pointing to mp */
5544                 MDB_cursor *m2, *m3;
5545                 MDB_dbi dbi = csrc->mc_dbi;
5546                 MDB_page *mp = cdst->mc_pg[cdst->mc_top];
5547
5548                 if (csrc->mc_flags & C_SUB)
5549                         dbi--;
5550
5551                 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
5552                         if (csrc->mc_flags & C_SUB)
5553                                 m3 = &m2->mc_xcursor->mx_cursor;
5554                         else
5555                                 m3 = m2;
5556                         if (m3 == csrc) continue;
5557                         if (m3->mc_snum < csrc->mc_snum) continue;
5558                         if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
5559                                 m3->mc_pg[csrc->mc_top] = mp;
5560                                 m3->mc_ki[csrc->mc_top] += nkeys;
5561                         }
5562                 }
5563         }
5564         mdb_cursor_pop(csrc);
5565
5566         return mdb_rebalance(csrc);
5567 }
5568
5569 /** Copy the contents of a cursor.
5570  * @param[in] csrc The cursor to copy from.
5571  * @param[out] cdst The cursor to copy to.
5572  */
5573 static void
5574 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
5575 {
5576         unsigned int i;
5577
5578         cdst->mc_txn = csrc->mc_txn;
5579         cdst->mc_dbi = csrc->mc_dbi;
5580         cdst->mc_db  = csrc->mc_db;
5581         cdst->mc_dbx = csrc->mc_dbx;
5582         cdst->mc_snum = csrc->mc_snum;
5583         cdst->mc_top = csrc->mc_top;
5584         cdst->mc_flags = csrc->mc_flags;
5585
5586         for (i=0; i<csrc->mc_snum; i++) {
5587                 cdst->mc_pg[i] = csrc->mc_pg[i];
5588                 cdst->mc_ki[i] = csrc->mc_ki[i];
5589         }
5590 }
5591
5592 /** Rebalance the tree after a delete operation.
5593  * @param[in] mc Cursor pointing to the page where rebalancing
5594  * should begin.
5595  * @return 0 on success, non-zero on failure.
5596  */
5597 static int
5598 mdb_rebalance(MDB_cursor *mc)
5599 {
5600         MDB_node        *node;
5601         int rc;
5602         unsigned int ptop;
5603         MDB_cursor      mn;
5604
5605 #if MDB_DEBUG
5606         {
5607         pgno_t pgno;
5608         COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
5609         DPRINTF("rebalancing %s page %zu (has %u keys, %.1f%% full)",
5610             IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
5611             pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10);
5612         }
5613 #endif
5614
5615         if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD) {
5616 #if MDB_DEBUG
5617                 pgno_t pgno;
5618                 COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
5619                 DPRINTF("no need to rebalance page %zu, above fill threshold",
5620                     pgno);
5621 #endif
5622                 return MDB_SUCCESS;
5623         }
5624
5625         if (mc->mc_snum < 2) {
5626                 MDB_page *mp = mc->mc_pg[0];
5627                 if (NUMKEYS(mp) == 0) {
5628                         DPUTS("tree is completely empty");
5629                         mc->mc_db->md_root = P_INVALID;
5630                         mc->mc_db->md_depth = 0;
5631                         mc->mc_db->md_leaf_pages = 0;
5632                         mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
5633                         mc->mc_snum = 0;
5634                         mc->mc_top = 0;
5635                         {
5636                                 /* Adjust other cursors pointing to mp */
5637                                 MDB_cursor *m2, *m3;
5638                                 MDB_dbi dbi = mc->mc_dbi;
5639
5640                                 if (mc->mc_flags & C_SUB)
5641                                         dbi--;
5642
5643                                 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
5644                                         if (m2 == mc) continue;
5645                                         if (mc->mc_flags & C_SUB)
5646                                                 m3 = &m2->mc_xcursor->mx_cursor;
5647                                         else
5648                                                 m3 = m2;
5649                                         if (m3->mc_snum < mc->mc_snum) continue;
5650                                         if (m3->mc_pg[0] == mp) {
5651                                                 m3->mc_snum = 0;
5652                                                 m3->mc_top = 0;
5653                                         }
5654                                 }
5655                         }
5656                 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
5657                         DPUTS("collapsing root page!");
5658                         mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
5659                         mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
5660                         if ((rc = mdb_page_get(mc->mc_txn, mc->mc_db->md_root,
5661                                 &mc->mc_pg[0])))
5662                                 return rc;
5663                         mc->mc_db->md_depth--;
5664                         mc->mc_db->md_branch_pages--;
5665                         {
5666                                 /* Adjust other cursors pointing to mp */
5667                                 MDB_cursor *m2, *m3;
5668                                 MDB_dbi dbi = mc->mc_dbi;
5669
5670                                 if (mc->mc_flags & C_SUB)
5671                                         dbi--;
5672
5673                                 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
5674                                         if (m2 == mc) continue;
5675                                         if (mc->mc_flags & C_SUB)
5676                                                 m3 = &m2->mc_xcursor->mx_cursor;
5677                                         else
5678                                                 m3 = m2;
5679                                         if (m3->mc_snum < mc->mc_snum) continue;
5680                                         if (m3->mc_pg[0] == mp) {
5681                                                 m3->mc_pg[0] = mc->mc_pg[0];
5682                                         }
5683                                 }
5684                         }
5685                 } else
5686                         DPUTS("root page doesn't need rebalancing");
5687                 return MDB_SUCCESS;
5688         }
5689
5690         /* The parent (branch page) must have at least 2 pointers,
5691          * otherwise the tree is invalid.
5692          */
5693         ptop = mc->mc_top-1;
5694         assert(NUMKEYS(mc->mc_pg[ptop]) > 1);
5695
5696         /* Leaf page fill factor is below the threshold.
5697          * Try to move keys from left or right neighbor, or
5698          * merge with a neighbor page.
5699          */
5700
5701         /* Find neighbors.
5702          */
5703         mdb_cursor_copy(mc, &mn);
5704         mn.mc_xcursor = NULL;
5705
5706         if (mc->mc_ki[ptop] == 0) {
5707                 /* We're the leftmost leaf in our parent.
5708                  */
5709                 DPUTS("reading right neighbor");
5710                 mn.mc_ki[ptop]++;
5711                 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
5712                 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top])))
5713                         return rc;
5714                 mn.mc_ki[mn.mc_top] = 0;
5715                 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
5716         } else {
5717                 /* There is at least one neighbor to the left.
5718                  */
5719                 DPUTS("reading left neighbor");
5720                 mn.mc_ki[ptop]--;
5721                 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
5722                 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top])))
5723                         return rc;
5724                 mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
5725                 mc->mc_ki[mc->mc_top] = 0;
5726         }
5727
5728         DPRINTF("found neighbor page %zu (%u keys, %.1f%% full)",
5729             mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10);
5730
5731         /* If the neighbor page is above threshold and has at least two
5732          * keys, move one key from it.
5733          *
5734          * Otherwise we should try to merge them.
5735          */
5736         if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) >= 2)
5737                 return mdb_node_move(&mn, mc);
5738         else { /* FIXME: if (has_enough_room()) */
5739                 mc->mc_flags &= ~C_INITIALIZED;
5740                 if (mc->mc_ki[ptop] == 0)
5741                         return mdb_page_merge(&mn, mc);
5742                 else
5743                         return mdb_page_merge(mc, &mn);
5744         }
5745 }
5746
5747 /** Complete a delete operation started by #mdb_cursor_del(). */
5748 static int
5749 mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
5750 {
5751         int rc;
5752
5753         /* add overflow pages to free list */
5754         if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_BIGDATA)) {
5755                 int i, ovpages;
5756                 pgno_t pg;
5757
5758                 memcpy(&pg, NODEDATA(leaf), sizeof(pg));
5759                 ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
5760                 mc->mc_db->md_overflow_pages -= ovpages;
5761                 for (i=0; i<ovpages; i++) {
5762                         DPRINTF("freed ov page %zu", pg);
5763                         mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
5764                         pg++;
5765                 }
5766         }
5767         mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], mc->mc_db->md_pad);
5768         mc->mc_db->md_entries--;
5769         rc = mdb_rebalance(mc);
5770         if (rc != MDB_SUCCESS)
5771                 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
5772
5773         return rc;
5774 }
5775
5776 int
5777 mdb_del(MDB_txn *txn, MDB_dbi dbi,
5778     MDB_val *key, MDB_val *data)
5779 {
5780         MDB_cursor mc;
5781         MDB_xcursor mx;
5782         MDB_cursor_op op;
5783         MDB_val rdata, *xdata;
5784         int              rc, exact;
5785         DKBUF;
5786
5787         assert(key != NULL);
5788
5789         DPRINTF("====> delete db %u key [%s]", dbi, DKEY(key));
5790
5791         if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
5792                 return EINVAL;
5793
5794         if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
5795                 return EACCES;
5796         }
5797
5798         if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
5799                 return EINVAL;
5800         }
5801
5802         mdb_cursor_init(&mc, txn, dbi, &mx);
5803
5804         exact = 0;
5805         if (data) {
5806                 op = MDB_GET_BOTH;
5807                 rdata = *data;
5808                 xdata = &rdata;
5809         } else {
5810                 op = MDB_SET;
5811                 xdata = NULL;
5812         }
5813         rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
5814         if (rc == 0)
5815                 rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA);
5816         return rc;
5817 }
5818
/** Split a page and insert a new node.
 *
 * Outline of what the body does:
 *  - allocates a right sibling page with the same flags as the page being split;
 *  - if the cursor stack holds fewer than two pages, allocates a new branch
 *    page, makes it the root and bumps the DB depth;
 *  - picks a split index (adjusted for leaf pages with large items), or uses
 *    the fast paths for #MDB_APPEND and LEAF2 pages;
 *  - adds the separator key to the parent, recursively splitting the parent
 *    when it has no room;
 *  - redistributes the nodes between the original page and the sibling by
 *    way of a temporary page;
 *  - adjusts the other cursors of the transaction that reference the page.
 *
 * @param[in,out] mc Cursor pointing to the page and desired insertion index.
 * The cursor will be updated to point to the actual page and index where
 * the node got inserted after the split.
 * @param[in] newkey The key for the newly inserted node.
 * @param[in] newdata The data for the newly inserted node.
 * @param[in] newpgno The page number, if the new node is a branch node.
 * @param[in] nflags The #NODE_ADD_FLAGS for the new node.
 * @return 0 on success, non-zero on failure (ENOMEM when a page cannot be
 * allocated, otherwise the error from mdb_node_add() or the nested split).
 */
static int
mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno,
	unsigned int nflags)
{
	unsigned int flags;
	int		 rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1, did_split = 0;
	indx_t		 newindx;
	pgno_t		 pgno = 0;
	unsigned int	 i, j, split_indx, nkeys, pmax;
	MDB_node	*node;
	MDB_val	 sepkey, rkey, xdata, *rdata = &xdata;
	MDB_page	*copy;
	MDB_page	*mp, *rp, *pp;
	unsigned int ptop;
	MDB_cursor	mn;
	DKBUF;

	/* mp is the page being split, newindx the slot the new node wants. */
	mp = mc->mc_pg[mc->mc_top];
	newindx = mc->mc_ki[mc->mc_top];

	DPRINTF("-----> splitting %s page %zu and adding [%s] at index %i",
	    IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
	    DKEY(newkey), mc->mc_ki[mc->mc_top]);

	/* Create a right sibling. */
	if ((rp = mdb_page_new(mc, mp->mp_flags, 1)) == NULL)
		return ENOMEM;
	DPRINTF("new right sibling: page %zu", rp->mp_pgno);

	/* Fewer than two pages on the cursor stack: the page being split has
	 * no parent, so a new branch page becomes the root.
	 */
	if (mc->mc_snum < 2) {
		if ((pp = mdb_page_new(mc, P_BRANCH, 1)) == NULL)
			return ENOMEM;
		/* shift current top to make room for new parent */
		mc->mc_pg[1] = mc->mc_pg[0];
		mc->mc_ki[1] = mc->mc_ki[0];
		mc->mc_pg[0] = pp;
		mc->mc_ki[0] = 0;
		mc->mc_db->md_root = pp->mp_pgno;
		DPRINTF("root split! new root = %zu", pp->mp_pgno);
		mc->mc_db->md_depth++;
		new_root = 1;

		/* Add left (implicit) pointer. */
		/* mc_top/mc_snum are only raised after this call; presumably
		 * mdb_node_add() therefore still targets mc_pg[0], i.e. pp.
		 */
		if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) {
			/* undo the pre-push */
			mc->mc_pg[0] = mc->mc_pg[1];
			mc->mc_ki[0] = mc->mc_ki[1];
			mc->mc_db->md_root = mp->mp_pgno;
			mc->mc_db->md_depth--;
			return rc;
		}
		mc->mc_snum = 2;
		mc->mc_top = 1;
		ptop = 0;
	} else {
		/* ptop is the stack level of the parent branch page. */
		ptop = mc->mc_top-1;
		DPRINTF("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno);
	}

	/* Flag this cursor as mid-split; the cursor fixup loop at the end of
	 * this function skips cursors carrying the flag.
	 */
	mc->mc_flags |= C_SPLITTING;
	/* mn is a working copy of the cursor whose top page is the new right
	 * sibling and whose parent index is one past the left page's.
	 */
	mdb_cursor_copy(mc, &mn);
	mn.mc_pg[mn.mc_top] = rp;
	mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;

	/* Append mode: no existing node is moved. The new key itself is the
	 * separator and is added to the right sibling further below.
	 */
	if (nflags & MDB_APPEND) {
		mn.mc_ki[mn.mc_top] = 0;
		sepkey = *newkey;
		split_indx = newindx;
		nkeys = 0;
		goto newsep;
	}

	/* Default split point is the middle; newpos == 0 means the new node
	 * stays on the original (left) page.
	 */
	nkeys = NUMKEYS(mp);
	split_indx = (nkeys + 1) / 2;
	if (newindx < split_indx)
		newpos = 0;

	/* LEAF2 pages: keys of md_pad bytes each are moved in bulk with
	 * memcpy/memmove and the new key is inserted right here.
	 */
	if (IS_LEAF2(rp)) {
		char *split, *ins;
		int x;
		unsigned int lsize, rsize, ksize;
		/* Move half of the keys to the right sibling */
		copy = NULL;
		/* x: insertion slot relative to the split point; negative
		 * means the new key belongs on the left page.
		 */
		x = mc->mc_ki[mc->mc_top] - split_indx;
		ksize = mc->mc_db->md_pad;
		split = LEAF2KEY(mp, split_indx, ksize);
		/* rsize: bytes of key data moving right; lsize: the matching
		 * amount of index space, accounted in mp_lower/mp_upper.
		 */
		rsize = (nkeys - split_indx) * ksize;
		lsize = (nkeys - split_indx) * sizeof(indx_t);
		mp->mp_lower -= lsize;
		rp->mp_lower += lsize;
		mp->mp_upper += rsize - lsize;
		rp->mp_upper -= rsize - lsize;
		sepkey.mv_size = ksize;
		if (newindx == split_indx) {
			sepkey.mv_data = newkey->mv_data;
		} else {
			sepkey.mv_data = split;
		}
		if (x<0) {
			/* New key goes on the left page: move the upper half
			 * out, then open a gap at the insertion slot.
			 */
			ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
			memcpy(rp->mp_ptrs, split, rsize);
			sepkey.mv_data = rp->mp_ptrs;
			memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
			memcpy(ins, newkey->mv_data, ksize);
			mp->mp_lower += sizeof(indx_t);
			mp->mp_upper -= ksize - sizeof(indx_t);
		} else {
			/* New key goes on the right page: copy the keys before
			 * it, the key itself, then the remaining keys.
			 */
			if (x)
				memcpy(rp->mp_ptrs, split, x * ksize);
			ins = LEAF2KEY(rp, x, ksize);
			memcpy(ins, newkey->mv_data, ksize);
			memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
			rp->mp_lower += sizeof(indx_t);
			rp->mp_upper -= ksize - sizeof(indx_t);
			mc->mc_ki[mc->mc_top] = x;
			mc->mc_pg[mc->mc_top] = rp;
		}
		goto newsep;
	}

	/* For leaf pages, check the split point based on what
	 * fits where, since otherwise mdb_node_add can fail.
	 *
	 * This check is only needed when the data items are
	 * relatively large, such that being off by one will
	 * make the difference between success or failure.
	 * When the size of the data items is much smaller than
	 * one-half of a page, this check is irrelevant.
	 */
	if (IS_LEAF(mp)) {
		unsigned int psize, nsize;
		/* Maximum free space in an empty page */
		pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
		nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata);
		if ((nkeys < 20) || (nsize > pmax/4)) {
			if (newindx <= split_indx) {
				/* New node is a candidate for the left page: add up
				 * the new node plus nodes 0..split_indx-1 and pull
				 * the split point in if they overflow a page.
				 */
				psize = nsize;
				newpos = 0;
				for (i=0; i<split_indx; i++) {
					node = NODEPTR(mp, i);
					psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
					if (F_ISSET(node->mn_flags, F_BIGDATA))
						psize += sizeof(pgno_t);
					else
						psize += NODEDSZ(node);
					/* round up to an even size */
					psize += psize & 1;
					if (psize > pmax) {
						if (i == split_indx - 1 && newindx == split_indx)
							newpos = 1;
						else
							split_indx = i;
						break;
					}
				}
			} else {
				/* New node goes to the right page: add up the new node
				 * plus nodes nkeys-1 down to split_indx and push the
				 * split point out if they overflow a page.
				 * NOTE(review): i is unsigned, so this loop only
				 * terminates on its own while split_indx >= 1.
				 */
				psize = nsize;
				for (i=nkeys-1; i>=split_indx; i--) {
					node = NODEPTR(mp, i);
					psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
					if (F_ISSET(node->mn_flags, F_BIGDATA))
						psize += sizeof(pgno_t);
					else
						psize += NODEDSZ(node);
					/* round up to an even size */
					psize += psize & 1;
					if (psize > pmax) {
						split_indx = i+1;
						break;
					}
				}
			}
		}
	}

	/* First find the separating key between the split pages.
	 * The case where newindx == split_indx is ambiguous; the
	 * new item could go to the new page or stay on the original
	 * page. If newpos == 1 it goes to the new page.
	 */
	if (newindx == split_indx && newpos) {
		sepkey.mv_size = newkey->mv_size;
		sepkey.mv_data = newkey->mv_data;
	} else {
		node = NODEPTR(mp, split_indx);
		sepkey.mv_size = node->mn_ksize;
		sepkey.mv_data = NODEKEY(node);
	}

newsep:
	DPRINTF("separator is [%s]", DKEY(&sepkey));

	/* Copy separator key to the parent.
	 */
	if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) {
		/* Parent is full: drop mn one level so it addresses the parent
		 * and split the parent while inserting the separator.
		 */
		mn.mc_snum--;
		mn.mc_top--;
		did_split = 1;
		rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);

		/* root split? */
		/* mn was one level shorter than mc; equal depth now means the
		 * nested split added a root level, so mc's stack is deepened
		 * by one to match.
		 */
		if (mn.mc_snum == mc->mc_snum) {
			mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top];
			mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top];
			mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop];
			mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop];
			mc->mc_snum++;
			mc->mc_top++;
			ptop++;
		}
		/* Right page might now have changed parent.
		 * Check if left page also changed parent.
		 */
		if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
		    mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
			/* mc's parent index is past the end of its parent page:
			 * adopt mn's ancestor path, one slot left of the
			 * right sibling.
			 */
			for (i=0; i<ptop; i++) {
				mc->mc_pg[i] = mn.mc_pg[i];
				mc->mc_ki[i] = mn.mc_ki[i];
			}
			mc->mc_pg[ptop] = mn.mc_pg[ptop];
			mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
		}
	} else {
		/* Parent has room: lower mn.mc_top around the call so the
		 * separator node is added at the parent level.
		 */
		mn.mc_top--;
		rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
		mn.mc_top++;
	}
	/* Toggle off the flag that was set before the split began. */
	mc->mc_flags ^= C_SPLITTING;
	if (rc != MDB_SUCCESS) {
		return rc;
	}
	if (nflags & MDB_APPEND) {
		/* Append: the new node is the first entry of the right sibling;
		 * ancestor indexes are taken from mn.
		 */
		mc->mc_pg[mc->mc_top] = rp;
		mc->mc_ki[mc->mc_top] = 0;
		rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
		if (rc)
			return rc;
		for (i=0; i<mc->mc_top; i++)
			mc->mc_ki[i] = mn.mc_ki[i];
		goto done;
	}
	/* LEAF2 keys were already moved and inserted above. */
	if (IS_LEAF2(rp)) {
		goto done;
	}

	/* Move half of the keys to the right sibling. */

	/* grab a page to hold a temporary copy */
	copy = mdb_page_malloc(mc);
	if (copy == NULL)
		return ENOMEM;

	/* The scratch page starts out empty and stands in for mp while the
	 * left half is rebuilt; its contents are copied back into mp below.
	 */
	copy->mp_pgno  = mp->mp_pgno;
	copy->mp_flags = mp->mp_flags;
	copy->mp_lower = PAGEHDRSZ;
	copy->mp_upper = mc->mc_txn->mt_env->me_psize;
	mc->mc_pg[mc->mc_top] = copy;
	/* i walks the source nodes of mp, j is the destination index on the
	 * page currently being filled (copy first, then rp).
	 */
	for (i = j = 0; i <= nkeys; j++) {
		if (i == split_indx) {
		/* Insert in right sibling. */
		/* Reset insert index for right sibling. */
			if (i != newindx || (newpos ^ ins_new)) {
				j = 0;
				mc->mc_pg[mc->mc_top] = rp;
			}
		}

		if (i == newindx && !ins_new) {
			/* Insert the original entry that caused the split. */
			rkey.mv_data = newkey->mv_data;
			rkey.mv_size = newkey->mv_size;
			if (IS_LEAF(mp)) {
				rdata = newdata;
			} else
				pgno = newpgno;
			flags = nflags;

			ins_new = 1;

			/* Update index for the new key. */
			mc->mc_ki[mc->mc_top] = j;
		} else if (i == nkeys) {
			break;
		} else {
			/* Re-add existing node i of the original page. */
			node = NODEPTR(mp, i);
			rkey.mv_data = NODEKEY(node);
			rkey.mv_size = node->mn_ksize;
			if (IS_LEAF(mp)) {
				xdata.mv_data = NODEDATA(node);
				xdata.mv_size = NODEDSZ(node);
				rdata = &xdata;
			} else
				pgno = NODEPGNO(node);
			flags = node->mn_flags;

			i++;
		}

		if (!IS_LEAF(mp) && j == 0) {
			/* First branch index doesn't need key data. */
			rkey.mv_size = 0;
		}

		/* NOTE(review): on failure the loop stops, but the copy-back
		 * and cursor fixups below still run before rc is returned.
		 */
		rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
		if (rc) break;
	}

	/* Copy the rebuilt left half from the scratch page back into mp:
	 * the index array, the bounds, then the node area. Presumably the
	 * last-added node sits lowest in the page, so the memcpy starting at
	 * node nkeys-1 covers everything from mp_upper to the end of the page.
	 */
	nkeys = NUMKEYS(copy);
	for (i=0; i<nkeys; i++)
		mp->mp_ptrs[i] = copy->mp_ptrs[i];
	mp->mp_lower = copy->mp_lower;
	mp->mp_upper = copy->mp_upper;
	memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
		mc->mc_txn->mt_env->me_psize - copy->mp_upper);

	/* reset back to original page */
	if (newindx < split_indx || (!newpos && newindx == split_indx)) {
		mc->mc_pg[mc->mc_top] = mp;
		if (nflags & MDB_RESERVE) {
			/* The node was written into the scratch page; point the
			 * caller's data at its final location in mp.
			 */
			node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
			if (!(node->mn_flags & F_BIGDATA))
				newdata->mv_data = NODEDATA(node);
		}
	} else {
		/* New node landed on the right sibling: step the parent index. */
		mc->mc_ki[ptop]++;
	}

	/* return tmp page to freelist */
	copy->mp_next = mc->mc_txn->mt_env->me_dpages;
	VGMEMP_FREE(mc->mc_txn->mt_env, copy);
	mc->mc_txn->mt_env->me_dpages = copy;
done:
	{
		/* Adjust other cursors pointing to mp */
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = mc->mc_dbi;
		/* fixup: number of keys left on mp; indexes at or beyond it now
		 * live on the right sibling.
		 */
		int fixup = NUMKEYS(mp);

		/* For a sub-cursor, walk the cursor list of dbi-1 and adjust
		 * each tracked cursor's embedded sub-cursor instead.
		 */
		if (mc->mc_flags & C_SUB)
			dbi--;

		for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
			if (m2 == mc) continue;
			if (mc->mc_flags & C_SUB)
				m3 = &m2->mc_xcursor->mx_cursor;
			else
				m3 = m2;
			if (!(m3->mc_flags & C_INITIALIZED))
				continue;
			if (m3->mc_flags & C_SPLITTING)
				continue;
			if (new_root) {
				int k;
				/* root split */
				/* Push every level of m3 down by one to make room
				 * for the new root at level 0.
				 */
				for (k=m3->mc_top; k>=0; k--) {
					m3->mc_ki[k+1] = m3->mc_ki[k];
					m3->mc_pg[k+1] = m3->mc_pg[k];
				}
				/* mc_ki[0] still holds the old root-page index here;
				 * it selects the left (0) or right (1) child.
				 */
				if (m3->mc_ki[0] >= split_indx) {
					m3->mc_ki[0] = 1;
				} else {
					m3->mc_ki[0] = 0;
				}
				m3->mc_pg[0] = mc->mc_pg[0];
				m3->mc_snum++;
				m3->mc_top++;
			}
			if (m3->mc_pg[mc->mc_top] == mp) {
				/* Same page: account for the inserted node, then move
				 * the cursor to the right sibling if its index fell
				 * off the end of mp.
				 */
				if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
					m3->mc_ki[mc->mc_top]++;
				if (m3->mc_ki[mc->mc_top] >= fixup) {
					m3->mc_pg[mc->mc_top] = rp;
					m3->mc_ki[mc->mc_top] -= fixup;
					m3->mc_ki[ptop] = mn.mc_ki[ptop];
				}
			} else if (!did_split && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
				m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
				/* Different page under the same, unsplit parent: the
				 * separator added to the parent shifts its index.
				 */
				m3->mc_ki[ptop]++;
			}
		}
	}
	return rc;
}
6210
6211 int
6212 mdb_put(MDB_txn *txn, MDB_dbi dbi,
6213     MDB_val *key, MDB_val *data, unsigned int flags)
6214 {
6215         MDB_cursor mc;
6216         MDB_xcursor mx;
6217
6218         assert(key != NULL);
6219         assert(data != NULL);
6220
6221         if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
6222                 return EINVAL;
6223
6224         if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
6225                 return EACCES;
6226         }
6227
6228         if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
6229                 return EINVAL;
6230         }
6231
6232         if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND)) != flags)
6233                 return EINVAL;
6234
6235         mdb_cursor_init(&mc, txn, dbi, &mx);
6236         return mdb_cursor_put(&mc, key, data, flags);
6237 }
6238
6239 /** Only a subset of the @ref mdb_env flags can be changed
6240  *      at runtime. Changing other flags requires closing the environment
6241  *      and re-opening it with the new flags.
6242  */
6243 #define CHANGEABLE      (MDB_NOSYNC|MDB_NOMETASYNC)
6244 int
6245 mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
6246 {
6247         if ((flag & CHANGEABLE) != flag)
6248                 return EINVAL;
6249         if (onoff)
6250                 env->me_flags |= flag;
6251         else
6252                 env->me_flags &= ~flag;
6253         return MDB_SUCCESS;
6254 }
6255
6256 int
6257 mdb_env_get_flags(MDB_env *env, unsigned int *arg)
6258 {
6259         if (!env || !arg)
6260                 return EINVAL;
6261
6262         *arg = env->me_flags;
6263         return MDB_SUCCESS;
6264 }
6265
6266 int
6267 mdb_env_get_path(MDB_env *env, const char **arg)
6268 {
6269         if (!env || !arg)
6270                 return EINVAL;
6271
6272         *arg = env->me_path;
6273         return MDB_SUCCESS;
6274 }
6275
6276 /** Common code for #mdb_stat() and #mdb_env_stat().
6277  * @param[in] env the environment to operate in.
6278  * @param[in] db the #MDB_db record containing the stats to return.
6279  * @param[out] arg the address of an #MDB_stat structure to receive the stats.
6280  * @return 0, this function always succeeds.
6281  */
6282 static int
6283 mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
6284 {
6285         arg->ms_psize = env->me_psize;
6286         arg->ms_depth = db->md_depth;
6287         arg->ms_branch_pages = db->md_branch_pages;
6288         arg->ms_leaf_pages = db->md_leaf_pages;
6289         arg->ms_overflow_pages = db->md_overflow_pages;
6290         arg->ms_entries = db->md_entries;
6291
6292         return MDB_SUCCESS;
6293 }
6294 int
6295 mdb_env_stat(MDB_env *env, MDB_stat *arg)
6296 {
6297         int toggle;
6298
6299         if (env == NULL || arg == NULL)
6300                 return EINVAL;
6301
6302         toggle = mdb_env_pick_meta(env);
6303
6304         return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
6305 }
6306
6307 /** Set the default comparison functions for a database.
6308  * Called immediately after a database is opened to set the defaults.
6309  * The user can then override them with #mdb_set_compare() or
6310  * #mdb_set_dupsort().
6311  * @param[in] txn A transaction handle returned by #mdb_txn_begin()
6312  * @param[in] dbi A database handle returned by #mdb_open()
6313  */
6314 static void
6315 mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
6316 {
6317         uint16_t f = txn->mt_dbs[dbi].md_flags;
6318
6319         txn->mt_dbxs[dbi].md_cmp =
6320                 (f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
6321                 (f & MDB_INTEGERKEY) ? mdb_cmp_cint  : mdb_cmp_memn;
6322
6323         txn->mt_dbxs[dbi].md_dcmp =
6324                 !(f & MDB_DUPSORT) ? 0 :
6325                 ((f & MDB_INTEGERDUP)
6326                  ? ((f & MDB_DUPFIXED)   ? mdb_cmp_int   : mdb_cmp_cint)
6327                  : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
6328 }
6329
6330 int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
6331 {
6332         MDB_val key, data;
6333         MDB_dbi i;
6334         MDB_cursor mc;
6335         int rc, dbflag, exact;
6336         unsigned int unused = 0;
6337         size_t len;
6338
6339         if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
6340                 mdb_default_cmp(txn, FREE_DBI);
6341         }
6342
6343         /* main DB? */
6344         if (!name) {
6345                 *dbi = MAIN_DBI;
6346                 if (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY))
6347                         txn->mt_dbs[MAIN_DBI].md_flags |= (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY));
6348                 mdb_default_cmp(txn, MAIN_DBI);
6349                 return MDB_SUCCESS;
6350         }
6351
6352         if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
6353                 mdb_default_cmp(txn, MAIN_DBI);
6354         }
6355
6356         /* Is the DB already open? */
6357         len = strlen(name);
6358         for (i=2; i<txn->mt_numdbs; i++) {
6359                 if (!txn->mt_dbxs[i].md_name.mv_size) {
6360                         /* Remember this free slot */
6361                         if (!unused) unused = i;
6362                         continue;
6363                 }
6364                 if (len == txn->mt_dbxs[i].md_name.mv_size &&
6365                         !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
6366                         *dbi = i;
6367                         return MDB_SUCCESS;
6368                 }
6369         }
6370
6371         /* If no free slot and max hit, fail */
6372         if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs - 1)
6373                 return ENFILE;
6374
6375         /* Find the DB info */
6376         dbflag = 0;
6377         exact = 0;
6378         key.mv_size = len;
6379         key.mv_data = (void *)name;
6380         mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
6381         rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
6382         if (rc == MDB_SUCCESS) {
6383                 /* make sure this is actually a DB */
6384                 MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
6385                 if (!(node->mn_flags & F_SUBDATA))
6386                         return EINVAL;
6387         } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
6388                 /* Create if requested */
6389                 MDB_db dummy;
6390                 data.mv_size = sizeof(MDB_db);
6391                 data.mv_data = &dummy;
6392                 memset(&dummy, 0, sizeof(dummy));
6393                 dummy.md_root = P_INVALID;
6394                 dummy.md_flags = flags & 0xffff;
6395                 rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
6396                 dbflag = DB_DIRTY;
6397         }
6398
6399         /* OK, got info, add to table */
6400         if (rc == MDB_SUCCESS) {
6401                 unsigned int slot = unused ? unused : txn->mt_numdbs;
6402                 txn->mt_dbxs[slot].md_name.mv_data = strdup(name);
6403                 txn->mt_dbxs[slot].md_name.mv_size = len;
6404                 txn->mt_dbxs[slot].md_rel = NULL;
6405                 txn->mt_dbflags[slot] = dbflag;
6406                 memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
6407                 *dbi = slot;
6408                 txn->mt_env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags;
6409                 mdb_default_cmp(txn, slot);
6410                 if (!unused) {
6411                         txn->mt_numdbs++;
6412                         txn->mt_env->me_numdbs++;
6413                 }
6414         }
6415
6416         return rc;
6417 }
6418
6419 int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
6420 {
6421         if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs)
6422                 return EINVAL;
6423
6424         return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
6425 }
6426
6427 void mdb_close(MDB_env *env, MDB_dbi dbi)
6428 {
6429         char *ptr;
6430         if (dbi <= MAIN_DBI || dbi >= env->me_numdbs)
6431                 return;
6432         ptr = env->me_dbxs[dbi].md_name.mv_data;
6433         env->me_dbxs[dbi].md_name.mv_data = NULL;
6434         env->me_dbxs[dbi].md_name.mv_size = 0;
6435         free(ptr);
6436 }
6437
6438 /** Add all the DB's pages to the free list.
6439  * @param[in] mc Cursor on the DB to free.
6440  * @param[in] subs non-Zero to check for sub-DBs in this DB.
6441  * @return 0 on success, non-zero on failure.
6442  */
6443 static int
6444 mdb_drop0(MDB_cursor *mc, int subs)
6445 {
6446         int rc;
6447
6448         rc = mdb_page_search(mc, NULL, 0);
6449         if (rc == MDB_SUCCESS) {
6450                 MDB_node *ni;
6451                 MDB_cursor mx;
6452                 unsigned int i;
6453
6454                 /* LEAF2 pages have no nodes, cannot have sub-DBs */
6455                 if (!subs || IS_LEAF2(mc->mc_pg[mc->mc_top]))
6456                         mdb_cursor_pop(mc);
6457
6458                 mdb_cursor_copy(mc, &mx);
6459                 while (mc->mc_snum > 0) {
6460                         if (IS_LEAF(mc->mc_pg[mc->mc_top])) {
6461                                 for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) {
6462                                         ni = NODEPTR(mc->mc_pg[mc->mc_top], i);
6463                                         if (ni->mn_flags & F_SUBDATA) {
6464                                                 mdb_xcursor_init1(mc, ni);
6465                                                 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
6466                                                 if (rc)
6467                                                         return rc;
6468                                         }
6469                                 }
6470                         } else {
6471                                 for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) {
6472                                         pgno_t pg;
6473                                         ni = NODEPTR(mc->mc_pg[mc->mc_top], i);
6474                                         pg = NODEPGNO(ni);
6475                                         /* free it */
6476                                         mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
6477                                 }
6478                         }
6479                         if (!mc->mc_top)
6480                                 break;
6481                         rc = mdb_cursor_sibling(mc, 1);
6482                         if (rc) {
6483                                 /* no more siblings, go back to beginning
6484                                  * of previous level. (stack was already popped
6485                                  * by mdb_cursor_sibling)
6486                                  */
6487                                 for (i=1; i<mc->mc_top; i++)
6488                                         mc->mc_pg[i] = mx.mc_pg[i];
6489                         }
6490                 }
6491                 /* free it */
6492                 mdb_midl_append(&mc->mc_txn->mt_free_pgs,
6493                         mc->mc_db->md_root);
6494         }
6495         return 0;
6496 }
6497
6498 int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
6499 {
6500         MDB_cursor *mc;
6501         int rc;
6502
6503         if (!txn || !dbi || dbi >= txn->mt_numdbs)
6504                 return EINVAL;
6505
6506         if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
6507                 return EACCES;
6508
6509         rc = mdb_cursor_open(txn, dbi, &mc);
6510         if (rc)
6511                 return rc;
6512
6513         rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
6514         if (rc)
6515                 goto leave;
6516
6517         /* Can't delete the main DB */
6518         if (del && dbi > MAIN_DBI) {
6519                 rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL);
6520                 if (!rc)
6521                         mdb_close(txn->mt_env, dbi);
6522         } else {
6523                 txn->mt_dbflags[dbi] |= DB_DIRTY;
6524                 txn->mt_dbs[dbi].md_depth = 0;
6525                 txn->mt_dbs[dbi].md_branch_pages = 0;
6526                 txn->mt_dbs[dbi].md_leaf_pages = 0;
6527                 txn->mt_dbs[dbi].md_overflow_pages = 0;
6528                 txn->mt_dbs[dbi].md_entries = 0;
6529                 txn->mt_dbs[dbi].md_root = P_INVALID;
6530         }
6531 leave:
6532         mdb_cursor_close(mc);
6533         return rc;
6534 }
6535
6536 int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
6537 {
6538         if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
6539                 return EINVAL;
6540
6541         txn->mt_dbxs[dbi].md_cmp = cmp;
6542         return MDB_SUCCESS;
6543 }
6544
6545 int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
6546 {
6547         if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
6548                 return EINVAL;
6549
6550         txn->mt_dbxs[dbi].md_dcmp = cmp;
6551         return MDB_SUCCESS;
6552 }
6553
6554 int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
6555 {
6556         if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
6557                 return EINVAL;
6558
6559         txn->mt_dbxs[dbi].md_rel = rel;
6560         return MDB_SUCCESS;
6561 }
6562
6563 int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
6564 {
6565         if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
6566                 return EINVAL;
6567
6568         txn->mt_dbxs[dbi].md_relctx = ctx;
6569         return MDB_SUCCESS;
6570 }
6571
6572 /** @} */