2 * @brief Lightning memory-mapped database library
4 * A Btree-based database management library modeled loosely on the
5 * BerkeleyDB API, but much simplified.
8 * Copyright 2011-2014 Howard Chu, Symas Corp.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted only as authorized by the OpenLDAP
15 * A copy of this license is available in the file LICENSE in the
16 * top-level directory of the distribution or, alternatively, at
17 * <http://www.OpenLDAP.org/license.html>.
19 * This code is derived from btree.c written by Martin Hedenfalk.
21 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
23 * Permission to use, copy, modify, and distribute this software for any
24 * purpose with or without fee is hereby granted, provided that the above
25 * copyright notice and this permission notice appear in all copies.
27 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
28 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
30 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
31 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
32 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
33 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
/* ---- Platform type/include selection ----
 * NOTE(review): this excerpt is missing many of the original
 * #ifdef/#else/#endif framing lines, so apparently-duplicate
 * definitions below (MDB_THR_T, CACHEFLUSH, VGMEMP_*) belong to
 * alternative branches of elided conditionals. Code kept verbatim.
 */
/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
 * as int64 which is wrong. MSVC doesn't define it at all, so just
 * (rest of comment elided in this excerpt) */
#define MDB_THR_T DWORD
#include <sys/types.h>
# include <sys/param.h>
/* Fallback byte-order constants for headers that don't provide them. */
# define LITTLE_ENDIAN 1234
# define BIG_ENDIAN 4321
# define BYTE_ORDER LITTLE_ENDIAN
# define SSIZE_MAX INT_MAX
#include <sys/types.h>
/* POSIX branch: process/thread id types used in the reader lock table. */
#define MDB_PID_T pid_t
#define MDB_THR_T pthread_t
#include <sys/param.h>
#ifdef HAVE_SYS_FILE_H
#if defined(__mips) && defined(__linux)
/* MIPS has cache coherency issues, requires explicit cache control */
#include <asm/cachectl.h>
extern int cacheflush(char *addr, int nbytes, int cache);
#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
/* No-op on cache-coherent architectures (elided #else branch). */
#define CACHEFLUSH(addr, bytes, cache)
#if defined(__sun) || defined(ANDROID)
/* Most platforms have posix_memalign, older may only have memalign */
#define HAVE_MEMALIGN 1
#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
#include <netinet/in.h>
#include <resolv.h>	/* defines BYTE_ORDER on HPUX and Solaris */
/* BSD family: use POSIX semaphores for the lockfile mutexes, and fsync
 * in place of fdatasync. */
#if defined(__APPLE__) || defined (BSD)
# define MDB_USE_POSIX_SEM 1
# define MDB_FDATASYNC fsync
#elif defined(ANDROID)
# define MDB_FDATASYNC fsync
#ifdef MDB_USE_POSIX_SEM
/* Named semaphores need a name hashed from the environment path. */
# define MDB_USE_HASH 1
#include <semaphore.h>
/* Valgrind memory-pool annotations; the empty definitions below are the
 * elided non-valgrind branch. */
#include <valgrind/memcheck.h>
#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z)
#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
#define VGMEMP_CREATE(h,r,z)
#define VGMEMP_ALLOC(h,a,s)
#define VGMEMP_FREE(h,a)
#define VGMEMP_DESTROY(h)
#define VGMEMP_DEFINED(a,s)
/* ---- Byte-order resolution and compile-time sanity checks ----
 * NOTE(review): #else/#endif framing lines are elided in this excerpt;
 * the multiple BYTE_ORDER definitions are alternative branches. */
# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
/* Solaris just defines one or the other */
# define LITTLE_ENDIAN	1234
# define BIG_ENDIAN	4321
# ifdef _LITTLE_ENDIAN
# define BYTE_ORDER	LITTLE_ENDIAN
# define BYTE_ORDER	BIG_ENDIAN
# define BYTE_ORDER	__BYTE_ORDER
#ifndef LITTLE_ENDIAN
#define LITTLE_ENDIAN	__LITTLE_ENDIAN
#define BIG_ENDIAN	__BIG_ENDIAN
/* x86 tolerates unaligned word access, so no memcpy fix-ups needed. */
#if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
#define MISALIGNED_OK	1
/* Reject builds where BYTE_ORDER is unset/both, or the integer model is
 * not two's complement with 8-bit char and >=32-bit unsigned int.
 * ((-6 & 5) is 0 only on two's-complement machines.) */
#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
# error "Unknown or unsupported endianness (BYTE_ORDER)"
#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
# error "Two's complement, reasonably sized integer types, please"
/** Put infrequently used env functions in separate section */
/* Mach-O (Apple) section attributes take a "segment,section" pair. */
# define ESECT __attribute__ ((section("__TEXT,text_env")))
# define ESECT __attribute__ ((section("text_env")))
/** @defgroup internal LMDB Internals
 */
/** @defgroup compat Compatibility Macros
 * A bunch of macros to minimize the amount of platform-specific ifdefs
 * needed throughout the rest of the code. When the features this library
 * needs are similar enough to POSIX to be hidden in a one-or-two line
 * replacement, this macro approach is used.
 */
/** Features under development */
/** Wrapper around __func__, which is a C99 feature */
#if __STDC_VERSION__ >= 199901L
# define mdb_func_	__func__
#elif __GNUC__ >= 2 || _MSC_VER >= 1300
# define mdb_func_	__FUNCTION__
/* If a debug message says <mdb_unknown>(), update the #if statements above */
# define mdb_func_	"<mdb_unknown>"
/* ---- Threading/OS compatibility shims ----
 * Maps the subset of the pthreads API that LMDB uses onto Win32
 * primitives, so the rest of the code has a single spelling.
 * NOTE(review): the enclosing #ifdef _WIN32 / #else lines are elided;
 * the trailing THREAD_RET/THREAD_CREATE/Z definitions are the POSIX
 * branch of that elided conditional. */
#define MDB_USE_HASH	1
#define MDB_PIDLOCK	0
#define THREAD_RET	DWORD
#define pthread_t	HANDLE
#define pthread_mutex_t	HANDLE
#define pthread_cond_t	HANDLE
#define pthread_key_t	DWORD
#define pthread_self()	GetCurrentThreadId()
/* TLS key creation; the destructor argument y is ignored (Win32 TLS has
 * no per-key destructor) — NOTE(review): confirm callers don't rely on it. */
#define pthread_key_create(x,y)	\
	((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
#define pthread_key_delete(x)	TlsFree(x)
#define pthread_getspecific(x)	TlsGetValue(x)
#define pthread_setspecific(x,y)	(TlsSetValue(x,y) ? 0 : ErrCode())
#define pthread_mutex_unlock(x)	ReleaseMutex(*x)
#define pthread_mutex_lock(x)	WaitForSingleObject(*x, INFINITE)
#define pthread_cond_signal(x)	SetEvent(*x)
/* Release the mutex and wait on the event, then re-acquire the mutex. */
#define pthread_cond_wait(cond,mutex)	do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
#define THREAD_CREATE(thr,start,arg)	thr=CreateThread(NULL,0,start,arg,0,NULL)
#define THREAD_FINISH(thr)	WaitForSingleObject(thr, INFINITE)
#define LOCK_MUTEX_R(env)	pthread_mutex_lock(&(env)->me_rmutex)
#define UNLOCK_MUTEX_R(env)	pthread_mutex_unlock(&(env)->me_rmutex)
#define LOCK_MUTEX_W(env)	pthread_mutex_lock(&(env)->me_wmutex)
#define UNLOCK_MUTEX_W(env)	pthread_mutex_unlock(&(env)->me_wmutex)
#define getpid()	GetCurrentProcessId()
/* These wrappers invert the Win32 BOOL so 0 means success, matching POSIX. */
#define	MDB_FDATASYNC(fd)	(!FlushFileBuffers(fd))
#define	MDB_MSYNC(addr,len,flags)	(!FlushViewOfFile(addr,len))
#define	ErrCode()	GetLastError()
#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
#define	close(fd)	(CloseHandle(fd) ? 0 : -1)
#define	munmap(ptr,len)	UnmapViewOfFile(ptr)
/* Older SDKs may lack this access-right constant; fall back to its value. */
#ifdef PROCESS_QUERY_LIMITED_INFORMATION
#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000
/* POSIX branch of the elided #ifdef _WIN32. */
#define THREAD_RET	void *
#define THREAD_CREATE(thr,start,arg)	pthread_create(&thr,NULL,start,arg)
#define THREAD_FINISH(thr)	pthread_join(thr,NULL)
#define	Z	"z"			/**< printf format modifier for size_t */
/** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
#define MDB_PIDLOCK	1
#ifdef MDB_USE_POSIX_SEM
/* Semaphore-based reader/writer lock macros (BSD/Apple path). */
#define LOCK_MUTEX_R(env)	mdb_sem_wait((env)->me_rmutex)
#define UNLOCK_MUTEX_R(env)	sem_post((env)->me_rmutex)
#define LOCK_MUTEX_W(env)	mdb_sem_wait((env)->me_wmutex)
#define UNLOCK_MUTEX_W(env)	sem_post((env)->me_wmutex)
/* sem_wait wrapper that retries while interrupted by a signal (EINTR).
 * NOTE(review): the function's return type, opening brace, rc declaration
 * and return statement are elided in this excerpt. */
mdb_sem_wait(sem_t *sem)
	while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ;
/** Lock the reader mutex.
 */
#define LOCK_MUTEX_R(env)	pthread_mutex_lock(&(env)->me_txns->mti_mutex)
/** Unlock the reader mutex.
 */
#define UNLOCK_MUTEX_R(env)	pthread_mutex_unlock(&(env)->me_txns->mti_mutex)
/** Lock the writer mutex.
 * Only a single write transaction is allowed at a time. Other writers
 * will block waiting for this mutex.
 */
#define LOCK_MUTEX_W(env)	pthread_mutex_lock(&(env)->me_txns->mti_wmutex)
/** Unlock the writer mutex.
 */
#define UNLOCK_MUTEX_W(env)	pthread_mutex_unlock(&(env)->me_txns->mti_wmutex)
#endif	/* MDB_USE_POSIX_SEM */
/** Get the error code for the last failed system function.
 */
#define	ErrCode()	errno
/** An abstraction for a file handle.
 * On POSIX systems file handles are small integers. On Windows
 * they're opaque pointers.
 */
/** A value for an invalid file handle.
 * Mainly used to initialize file variables and signify that they are
 * (rest of comment elided in this excerpt) */
#define INVALID_HANDLE_VALUE	(-1)
/** Get the size of a memory page for the system.
 * This is the basic size that the platform's memory manager uses, and is
 * fundamental to the use of memory-mapped files.
 */
#define	GET_PAGESIZE(x)	((x) = sysconf(_SC_PAGE_SIZE))
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
/* Length of the buffer holding a named mutex/semaphore name in the
 * lockfile; otherwise the size of an in-file pthread mutex. */
#define MNAME_LEN	(sizeof(pthread_mutex_t))
/** A flag for opening a file and requesting synchronous data writes.
 * This is only used when writing a meta page. It's not strictly needed;
 * we could just do a normal write and then immediately perform a flush.
 * But if this flag is available it saves us an extra system call.
 *
 * @note If O_DSYNC is undefined but exists in /usr/include,
 * preferably set some compiler flag to get the definition.
 * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.
 */
# define MDB_DSYNC	O_DSYNC
/** Function for flushing the data of a file. Define this to fsync
 * if fdatasync() is not supported.
 */
#ifndef MDB_FDATASYNC
# define MDB_FDATASYNC	fdatasync
# define HAVE_FDATASYNC	1
# define MDB_MSYNC(addr,len,flags)	msync(addr,len,flags)
/** A page number in the database.
 * Note that 64 bit page numbers are overkill, since pages themselves
 * already represent 12-13 bits of addressable memory, and the OS will
 * always limit applications to a maximum of 63 bits of address space.
 *
 * @note In the #MDB_node structure, we only store 48 bits of this value,
 * which thus limits us to only 60 bits of addressable data.
 */
typedef MDB_ID	pgno_t;
/** A transaction ID.
 * See struct MDB_txn.mt_txnid for details.
 */
typedef MDB_ID	txnid_t;
/** @defgroup debug Debug Macros
 */
/** Enable debug output. Needs variable argument macros (a C99 feature).
 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
 * read from and written to the database (used for free space management).
 */
/* Runtime debug level, and the first txnid to start tracing from. */
static int mdb_debug;
static txnid_t mdb_debug_start;
/** Print a debug message with printf formatting.
 * Requires double parenthesis around 2 or more args.
 * NOTE(review): the second DPRINTF definition below is the elided
 * no-debug #else branch. */
# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args))
# define DPRINTF0(fmt, ...) \
	fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__)
# define DPRINTF(args)	((void) 0)
/** Print a debug string.
 * The string is printed literally, with no format processing.
 */
#define DPUTS(arg)	DPRINTF(("%s", arg))
/** Debugging output value of a cursor DBI: Negative in a sub-cursor.
 * NOTE(review): the "#define DDBI(mc)" line is elided in this excerpt. */
	(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
/** @brief The maximum size of a database page.
 *
 * It is 32k or 64k, since value-PAGEBASE must fit in
 * #MDB_page.%mp_upper.
 *
 * LMDB will use database pages < OS pages if needed.
 * That causes more I/O in write transactions: The OS must
 * know (read) the whole page before writing a partial page.
 *
 * Note that we don't currently support Huge pages. On Linux,
 * regular data files cannot use Huge pages, and in general
 * Huge pages aren't actually pageable. We rely on the OS
 * demand-pager to read our data and page it out when memory
 * pressure from other processes is high. So until OSs have
 * actual paging support for Huge pages, they're not viable.
 */
#define MAX_PAGESIZE	 (PAGEBASE ? 0x10000 : 0x8000)
/** The minimum number of keys required in a database page.
 * Setting this to a larger value will place a smaller bound on the
 * maximum size of a data item. Data items larger than this size will
 * be pushed into overflow pages instead of being stored directly in
 * the B-tree node. This value used to default to 4. With a page size
 * of 4096 bytes that meant that any item larger than 1024 bytes would
 * go into an overflow page. That also meant that on average 2-3KB of
 * each overflow page was wasted space. The value cannot be lower than
 * 2 because then there would no longer be a tree structure. With this
 * value, items larger than 2KB will go into overflow pages, and on
 * average only 1KB will be wasted.
 */
#define MDB_MINKEYS	 2
/** A stamp that identifies a file as an LMDB file.
 * There's nothing special about this value other than that it is easily
 * recognizable, and it will reflect any byte order mismatches.
 */
#define MDB_MAGIC	 0xBEEFC0DE
/** The version number for a database's datafile format. */
#define MDB_DATA_VERSION	 ((MDB_DEVEL) ? 999 : 1)
/** The version number for a database's lockfile format. */
#define MDB_LOCK_VERSION	 1
/** @brief The max size of a key we can write, or 0 for dynamic max.
 *
 * Define this as 0 to compute the max from the page size. 511
 * is default for backwards compat: liblmdb <= 0.9.10 can break
 * when modifying a DB with keys/dupsort data bigger than its max.
 * #MDB_DEVEL sets the default to 0.
 *
 * Data items in an #MDB_DUPSORT database are also limited to
 * this size, since they're actually keys of a sub-DB. Keys and
 * #MDB_DUPSORT data items must fit on a node in a regular page.
 */
#ifndef MDB_MAXKEYSIZE
#define MDB_MAXKEYSIZE	 ((MDB_DEVEL) ? 0 : 511)
/** The maximum size of a key we can write to the environment.
 * NOTE(review): the two ENV_MAXKEY definitions are the branches of an
 * elided #if on MDB_MAXKEYSIZE. */
#define ENV_MAXKEY(env)	(MDB_MAXKEYSIZE)
#define ENV_MAXKEY(env)	((env)->me_maxkey)
/** @brief The maximum size of a data item.
 *
 * We only store a 32 bit value for node sizes.
 */
#define MAXDATASIZE	0xffffffffUL
/** Key size which fits in a #DKBUF.
 */
#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511)
/*
 * This is used for printing a hex dump of a key's contents.
 */
#define DKBUF	char kbuf[DKBUF_MAXKEYSIZE*2+1]
/** Display a key in hex.
 *
 * Invoke a function to display a key in hex.
 */
#define	DKEY(x)	mdb_dkey(x, kbuf)
/** An invalid page number.
 * Mainly used to denote an empty tree.
 */
#define P_INVALID	 (~(pgno_t)0)
/** Test if the flags \b f are set in a flag word \b w. */
#define F_ISSET(w, f)	 (((w) & (f)) == (f))
/** Round \b n up to an even number. */
#define EVEN(n)		(((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
/** Used for offsets within a single page.
 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
 * (rest of comment elided in this excerpt) */
typedef uint16_t	 indx_t;
/** Default size of memory map.
 * This is certainly too small for any actual applications. Apps should always set
 * the size explicitly using #mdb_env_set_mapsize().
 */
#define DEFAULT_MAPSIZE	1048576
513 /** @defgroup readers Reader Lock Table
514 * Readers don't acquire any locks for their data access. Instead, they
515 * simply record their transaction ID in the reader table. The reader
516 * mutex is needed just to find an empty slot in the reader table. The
517 * slot's address is saved in thread-specific data so that subsequent read
518 * transactions started by the same thread need no further locking to proceed.
520 * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
522 * No reader table is used if the database is on a read-only filesystem, or
523 * if #MDB_NOLOCK is set.
525 * Since the database uses multi-version concurrency control, readers don't
526 * actually need any locking. This table is used to keep track of which
527 * readers are using data from which old transactions, so that we'll know
528 * when a particular old transaction is no longer in use. Old transactions
529 * that have discarded any data pages can then have those pages reclaimed
530 * for use by a later write transaction.
532 * The lock table is constructed such that reader slots are aligned with the
533 * processor's cache line size. Any slot is only ever used by one thread.
534 * This alignment guarantees that there will be no contention or cache
535 * thrashing as threads update their own slot info, and also eliminates
536 * any need for locking when accessing a slot.
538 * A writer thread will scan every slot in the table to determine the oldest
539 * outstanding reader transaction. Any freed pages older than this will be
540 * reclaimed by the writer. The writer doesn't use any locks when scanning
541 * this table. This means that there's no guarantee that the writer will
542 * see the most up-to-date reader info, but that's not required for correct
543 * operation - all we need is to know the upper bound on the oldest reader,
544 * we don't care at all about the newest reader. So the only consequence of
545 * reading stale information here is that old pages might hang around a
546 * while longer before being reclaimed. That's actually good anyway, because
547 * the longer we delay reclaiming old pages, the more likely it is that a
548 * string of contiguous pages can be found after coalescing old pages from
549 * many old transactions together.
/** Number of slots in the reader table.
 * This value was chosen somewhat arbitrarily. 126 readers plus a
 * couple mutexes fit exactly into 8KB on my development machine.
 * Applications should set the table size using #mdb_env_set_maxreaders().
 */
#define DEFAULT_READERS	126
/** The size of a CPU cache line in bytes. We want our lock structures
 * aligned to this size to avoid false cache line sharing in the
 * lock table.
 * This value works for most CPUs. For Itanium this should be 128.
 * NOTE(review): the #define CACHELINE line itself is elided here.
 */
/** The information we store in a single slot of the reader table.
 * In addition to a transaction ID, we also record the process and
 * thread ID that owns a slot, so that we can detect stale information,
 * e.g. threads or processes that went away without cleaning up.
 * @note We currently don't check for stale records. We simply re-init
 * the table when we know that we're the only process opening the
 * (rest of comment elided in this excerpt) */
typedef struct MDB_rxbody {
	/** Current Transaction ID when this transaction began, or (txnid_t)-1.
	 * Multiple readers that start at the same time will probably have the
	 * same ID here. Again, it's not important to exclude them from
	 * anything; all we need to know is which version of the DB they
	 * started from so we can avoid overwriting any data used in that
	 * particular version.
	 * NOTE(review): the field declarations (txnid/pid/tid) and the
	 * closing brace of this struct are elided in this excerpt. */
	/** The process ID of the process owning this reader txn. */
	/** The thread ID of the thread owning this txn. */
/** The actual reader record, with cacheline padding. */
typedef struct MDB_reader {
	/** shorthand for mrb_txnid */
#define	mr_txnid	mru.mrx.mrb_txnid
#define	mr_pid	mru.mrx.mrb_pid
#define	mr_tid	mru.mrx.mrb_tid
	/** cache line alignment */
	char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
/** The header for the reader table.
 * The table resides in a memory-mapped file. (This is a different file
 * than is used for the main database.)
 *
 * For POSIX the actual mutexes reside in the shared memory of this
 * mapped file. On Windows, mutexes are named objects allocated by the
 * kernel; we store the mutex names in this mapped file so that other
 * processes can grab them. This same approach is also used on
 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
 * process-shared POSIX mutexes. For these cases where a named object
 * is used, the object name is derived from a 64 bit FNV hash of the
 * environment pathname. As such, naming collisions are extremely
 * unlikely. If a collision occurs, the results are unpredictable.
 * NOTE(review): several field declarations, union wrappers, and the
 * closing braces of both structs below are elided in this excerpt.
 */
typedef struct MDB_txbody {
	/** Stamp identifying this as an LMDB file. It must be set
	 * (rest of comment elided in this excerpt) */
	/** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
	char	mtb_rmname[MNAME_LEN];
	/** Mutex protecting access to this table.
	 * This is the reader lock that #LOCK_MUTEX_R acquires.
	 */
	pthread_mutex_t	mtb_mutex;
	/** The ID of the last transaction committed to the database.
	 * This is recorded here only for convenience; the value can always
	 * be determined by reading the main database meta pages.
	 */
	/** The number of slots that have been used in the reader table.
	 * This always records the maximum count, it is not decremented
	 * when readers release their slots.
	 */
	unsigned	mtb_numreaders;
/** The actual reader table definition. */
typedef struct MDB_txninfo {
#define mti_magic	mt1.mtb.mtb_magic
#define mti_format	mt1.mtb.mtb_format
#define mti_mutex	mt1.mtb.mtb_mutex
#define mti_rmname	mt1.mtb.mtb_rmname
#define mti_txnid	mt1.mtb.mtb_txnid
#define mti_numreaders	mt1.mtb.mtb_numreaders
	char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
	char mt2_wmname[MNAME_LEN];
#define	mti_wmname	mt2.mt2_wmname
	pthread_mutex_t	mt2_wmutex;
#define mti_wmutex	mt2.mt2_wmutex
	char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
	MDB_reader	mti_readers[1];
/** Lockfile format signature: version, features and field layout */
#define MDB_LOCK_FORMAT \
	((MDB_LOCK_VERSION) \
	 /* Flags which describe functionality */ \
	 + (((MDB_PIDLOCK) != 0) << 16)))
/** Common header for all page types.
 * Overflow records occupy a number of contiguous pages with no
 * headers on any page after the first.
 * NOTE(review): the union wrappers and the closing brace of this struct
 * are elided in this excerpt.
 */
typedef struct MDB_page {
#define	mp_pgno	mp_p.p_pgno
#define	mp_next	mp_p.p_next
		pgno_t		p_pgno;	/**< page number */
		struct MDB_page *p_next; /**< for in-memory list of freed pages */
/** @defgroup mdb_page	Page Flags
 *
 *	Flags for the page headers.
 */
#define	P_BRANCH	 0x01		/**< branch page */
#define	P_LEAF		 0x02		/**< leaf page */
#define	P_OVERFLOW	 0x04		/**< overflow page */
#define	P_META		 0x08		/**< meta page */
#define	P_DIRTY		 0x10		/**< dirty page, also set for #P_SUBP pages */
#define	P_LEAF2		 0x20		/**< for #MDB_DUPFIXED records */
#define	P_SUBP		 0x40		/**< for #MDB_DUPSORT sub-pages */
#define	P_LOOSE		 0x4000		/**< page was dirtied then freed, can be reused */
#define	P_KEEP		 0x8000		/**< leave this page alone during spill */
	uint16_t	mp_flags;		/**< @ref mdb_page */
#define mp_lower	mp_pb.pb.pb_lower
#define mp_upper	mp_pb.pb.pb_upper
#define mp_pages	mp_pb.pb_pages
			indx_t		pb_lower;	/**< lower bound of free space */
			indx_t		pb_upper;	/**< upper bound of free space */
		uint32_t	pb_pages;	/**< number of overflow pages */
	indx_t		mp_ptrs[1];		/**< dynamic size */
/** Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ	 ((unsigned) offsetof(MDB_page, mp_ptrs))
/** Address of first usable data byte in a page, after the header */
#define METADATA(p)	 ((void *)((char *)(p) + PAGEHDRSZ))
/** ITS#7713, change PAGEBASE to handle 65536 byte pages */
#define	PAGEBASE	((MDB_DEVEL) ? PAGEHDRSZ : 0)
/** Number of nodes on a page */
#define NUMKEYS(p)	 (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)
/** The amount of space remaining in the page */
#define SIZELEFT(p)	 (indx_t)((p)->mp_upper - (p)->mp_lower)
/** The percentage of space used in the page, in tenths of a percent. */
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
				((env)->me_psize - PAGEHDRSZ))
/** The minimum page fill factor, in tenths of a percent.
 * Pages emptier than this are candidates for merging.
 */
#define FILL_THRESHOLD	 250
/** Test if a page is a leaf page */
#define IS_LEAF(p)	 F_ISSET((p)->mp_flags, P_LEAF)
/** Test if a page is a LEAF2 page */
#define IS_LEAF2(p)	 F_ISSET((p)->mp_flags, P_LEAF2)
/** Test if a page is a branch page */
#define IS_BRANCH(p)	 F_ISSET((p)->mp_flags, P_BRANCH)
/** Test if a page is an overflow page */
#define IS_OVERFLOW(p)	 F_ISSET((p)->mp_flags, P_OVERFLOW)
/** Test if a page is a sub page */
#define IS_SUBP(p)	 F_ISSET((p)->mp_flags, P_SUBP)
/** The number of overflow pages needed to store the given size. */
#define OVPAGES(size, psize)	((PAGEHDRSZ-1 + (size)) / (psize) + 1)
/** Link in #MDB_txn.%mt_loose_pgs list */
#define NEXT_LOOSE_PAGE(p)	 (*(MDB_page **)((p) + 2))
/** Header for a single key/data pair within a page.
 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
 * We guarantee 2-byte alignment for 'MDB_node's.
 * NOTE(review): the #else line between the two field orderings and the
 * closing brace of this struct are elided in this excerpt.
 */
typedef struct MDB_node {
	/** lo and hi are used for data size on leaf nodes and for
	 * child pgno on branch nodes. On 64 bit platforms, flags
	 * is also used for pgno. (Branch nodes have no flags).
	 * They are in host byte order in case that lets some
	 * accesses be optimized into a 32-bit word access.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	unsigned short	mn_lo, mn_hi;	/**< part of data size or pgno */
	unsigned short	mn_hi, mn_lo;
/** @defgroup mdb_node Node Flags
 *
 *	Flags for node headers.
 */
#define F_BIGDATA	 0x01			/**< data put on overflow page */
#define F_SUBDATA	 0x02			/**< data is a sub-database */
#define F_DUPDATA	 0x04			/**< data has duplicates */
/** valid flags for #mdb_node_add() */
#define	NODE_ADD_FLAGS	(F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)
	unsigned short	mn_flags;		/**< @ref mdb_node */
	unsigned short	mn_ksize;		/**< key size */
	char		mn_data[1];			/**< key and data are appended here */
/** Size of the node header, excluding dynamic data at the end */
#define NODESIZE	 offsetof(MDB_node, mn_data)
/** Bit position of top word in page number, for shifting mn_flags */
#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
/** Size of a node in a branch page with a given key.
 * This is just the node header plus the key, there is no data.
 */
#define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
/** Size of a node in a leaf page with a given key and data.
 * This is node header plus key plus data size.
 */
#define LEAFSIZE(k, d)	 (NODESIZE + (k)->mv_size + (d)->mv_size)
/** Address of node \b i in page \b p */
#define NODEPTR(p, i)	 ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))
/** Address of the key for the node */
#define NODEKEY(node)	 (void *)((node)->mn_data)
/** Address of the data for a node */
#define NODEDATA(node)	 (void *)((char *)(node)->mn_data + (node)->mn_ksize)
/** Get the page number pointed to by a branch node */
#define NODEPGNO(node) \
	((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
	 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
/** Set the page number in a branch node */
#define SETPGNO(node,pgno)	do { \
	(node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
	if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
/** Get the size of the data in a leaf node */
#define NODEDSZ(node)	 ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
/** Set the size of the data for a leaf node */
#define SETDSZ(node,size)	do { \
	(node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
/** The size of a key in a node */
#define NODEKSZ(node)	 ((node)->mn_ksize)
/** Copy a page number from src to dst.
 * NOTE(review): the bodies of the two word-by-word variants below are
 * truncated in this excerpt (the copy statements and while(0) are elided);
 * the simple variant is the MISALIGNED_OK branch. */
#define COPY_PGNO(dst,src)	dst = src
#if SIZE_MAX > 4294967295UL
#define COPY_PGNO(dst,src)	do { \
	unsigned short *s, *d;	\
	s = (unsigned short *)&(src);	\
	d = (unsigned short *)&(dst);	\
#define COPY_PGNO(dst,src)	do { \
	unsigned short *s, *d;	\
	s = (unsigned short *)&(src);	\
	d = (unsigned short *)&(dst);	\
/** The address of a key in a LEAF2 page.
 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
 * There are no node headers, keys are stored contiguously.
 */
#define LEAF2KEY(p, i, ks)	((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
/** Set the \b node's key into \b keyptr, if requested. */
#define MDB_GET_KEY(node, keyptr)	{ if ((keyptr) != NULL) { \
	(keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }
/** Set the \b node's key into \b key. */
#define MDB_GET_KEY2(node, key)	{ key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
/** Information about a single database in the environment.
 * NOTE(review): the closing brace of this struct (and of MDB_meta and
 * MDB_metabuf below) is elided in this excerpt. */
typedef struct MDB_db {
	uint32_t	md_pad;		/**< also ksize for LEAF2 pages */
	uint16_t	md_flags;	/**< @ref mdb_dbi_open */
	uint16_t	md_depth;	/**< depth of this tree */
	pgno_t		md_branch_pages;	/**< number of internal pages */
	pgno_t		md_leaf_pages;		/**< number of leaf pages */
	pgno_t		md_overflow_pages;	/**< number of overflow pages */
	size_t		md_entries;		/**< number of data items */
	pgno_t		md_root;		/**< the root page of this tree */
	/** mdb_dbi_open flags */
#define MDB_VALID	0x8000		/**< DB handle is valid, for me_dbflags */
#define PERSISTENT_FLAGS	(0xffff & ~(MDB_VALID))
#define VALID_FLAGS	(MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
	MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
	/** Handle for the DB used to track free pages. */
	/** Handle for the default DB. */
/** Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
 */
typedef struct MDB_meta {
	/** Stamp identifying this as an LMDB file. It must be set
	 * (rest of comment elided in this excerpt) */
	/** Version number of this file. Must be set to #MDB_DATA_VERSION. */
	void		*mm_address;		/**< address for fixed mapping */
	size_t		mm_mapsize;			/**< size of mmap region */
	MDB_db		mm_dbs[2];			/**< first is free space, 2nd is main db */
	/** The size of pages used in this DB */
#define	mm_psize	mm_dbs[0].md_pad
	/** Any persistent environment flags. @ref mdb_env */
#define	mm_flags	mm_dbs[0].md_flags
	pgno_t		mm_last_pg;			/**< last used page in file */
	txnid_t		mm_txnid;			/**< txnid that committed this page */
/** Buffer for a stack-allocated meta page.
 * The members define size and alignment, and silence type
 * aliasing warnings. They are not used directly; that could
 * mean incorrectly using several union members in parallel.
 */
typedef union MDB_metabuf {
		char		mm_pad[PAGEHDRSZ];
/** Auxiliary DB info.
 * The information here is mostly static/read-only. There is
 * only a single copy of this record in the environment.
 * NOTE(review): the closing brace of this struct is elided.
 */
typedef struct MDB_dbx {
	MDB_val		md_name;		/**< name of the database */
	MDB_cmp_func	*md_cmp;	/**< function for comparing keys */
	MDB_cmp_func	*md_dcmp;	/**< function for comparing data items */
	MDB_rel_func	*md_rel;	/**< user relocate function */
	void		*md_relctx;		/**< user-provided context for md_rel */
/** A database transaction.
 * Every operation requires a transaction handle.
 * NOTE(review): the "typedef struct MDB_txn {" line, many field
 * declarations, and the closing brace are elided in this excerpt.
 */
	MDB_txn		*mt_parent;		/**< parent of a nested txn */
	MDB_txn		*mt_child;		/**< nested txn under this txn */
	pgno_t		mt_next_pgno;	/**< next unallocated page */
	/** The ID of this transaction. IDs are integers incrementing from 1.
	 * Only committed write transactions increment the ID. If a transaction
	 * aborts, the ID may be re-used by the next writer.
	 */
	MDB_env		*mt_env;		/**< the DB environment */
	/** The list of pages that became unused during this transaction.
	 */
	/** The list of loose pages that became unused and may be reused
	 * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
	 */
	MDB_page	*mt_loose_pgs;
	/* #Number of loose pages (#mt_loose_pgs) */
	/** The sorted list of dirty pages we temporarily wrote to disk
	 * because the dirty list was full. page numbers in here are
	 * shifted left by 1, deleted slots have the LSB set.
	 */
	MDB_IDL		mt_spill_pgs;
	/** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
	/** For read txns: This thread/txn's reader table slot, or NULL. */
	/** Array of records for each DB known in the environment. */
	/** Array of MDB_db records for each known DB */
	/** Array of sequence numbers for each DB handle */
	unsigned int	*mt_dbiseqs;
/** @defgroup mt_dbflag	Transaction DB Flags
 */
#define DB_DIRTY	0x01		/**< DB was modified or is DUPSORT data */
#define DB_STALE	0x02		/**< Named-DB record is older than txnID */
#define DB_NEW		0x04		/**< Named-DB handle opened in this txn */
#define DB_VALID	0x08		/**< DB handle is valid, see also #MDB_VALID */
	/** In write txns, array of cursors for each DB */
	MDB_cursor	**mt_cursors;
	/** Array of flags for each DB */
	unsigned char	*mt_dbflags;
	/** Number of DB records in use. This number only ever increments;
	 * we don't decrement it when individual DB handles are closed.
	 */
/** @defgroup mdb_txn	Transaction Flags
 */
#define MDB_TXN_RDONLY		0x01		/**< read-only transaction */
#define MDB_TXN_ERROR		0x02		/**< txn is unusable after an error */
#define MDB_TXN_DIRTY		0x04		/**< must write, even if dirty list is empty */
#define MDB_TXN_SPILLS		0x08		/**< txn or a parent has spilled pages */
	unsigned int	mt_flags;		/**< @ref mdb_txn */
	/** #dirty_list room: Array size - \#dirty pages visible to this txn.
	 * Includes ancestor txns' dirty pages not hidden by other txns'
	 * dirty/spilled pages. Thus commit(nested txn) has room to merge
	 * dirty_list into mt_parent after freeing hidden mt_parent pages.
	 */
	unsigned int	mt_dirty_room;
/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
 * raise this on a 64 bit machine.
 */
#define CURSOR_STACK		 32
/** Cursors are used for all DB operations.
 * (rest of comment elided in this excerpt) */
1024 * A cursor holds a path of (page pointer, key index) from the DB
1025 * root to a position in the DB, plus other state. #MDB_DUPSORT
1026 * cursors include an xcursor to the current data item. Write txns
1027 * track their cursors and keep them up to date when data moves.
1028 * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
1029 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
1032 /** Next cursor on this DB in this txn */
1033 MDB_cursor *mc_next;
1034 /** Backup of the original cursor if this cursor is a shadow */
1035 MDB_cursor *mc_backup;
1036 /** Context used for databases with #MDB_DUPSORT, otherwise NULL */
1037 struct MDB_xcursor *mc_xcursor;
1038 /** The transaction that owns this cursor */
1040 /** The database handle this cursor operates on */
1042 /** The database record for this cursor */
1044 /** The database auxiliary record for this cursor */
1046 /** The @ref mt_dbflag for this database */
1047 unsigned char *mc_dbflag;
1048 unsigned short mc_snum; /**< number of pushed pages */
1049 unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
1050 /** @defgroup mdb_cursor Cursor Flags
1052 * Cursor state flags.
1055 #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
1056 #define C_EOF 0x02 /**< No more data */
1057 #define C_SUB 0x04 /**< Cursor is a sub-cursor */
1058 #define C_DEL 0x08 /**< last op was a cursor_del */
1059 #define C_SPLITTING 0x20 /**< Cursor is in page_split */
1060 #define C_UNTRACK 0x40 /**< Un-track cursor when closing */
1062 unsigned int mc_flags; /**< @ref mdb_cursor */
1063 MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
1064 indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
1067 /** Context for sorted-dup records.
1068 * We could have gone to a fully recursive design, with arbitrarily
1069 * deep nesting of sub-databases. But for now we only handle these
1070 * levels - main DB, optional sub-DB, sorted-duplicate DB.
1072 typedef struct MDB_xcursor {
1073 /** A sub-cursor for traversing the Dup DB */
1074 MDB_cursor mx_cursor;
1075 /** The database record for this Dup DB */
1077 /** The auxiliary DB record for this Dup DB */
1079 /** The @ref mt_dbflag for this Dup DB */
1080 unsigned char mx_dbflag;
1083 /** State of FreeDB old pages, stored in the MDB_env */
1084 typedef struct MDB_pgstate {
1085 pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */
1086 txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */
1089 /** The database environment. */
1091 HANDLE me_fd; /**< The main data file */
1092 HANDLE me_lfd; /**< The lock file */
1093 HANDLE me_mfd; /**< just for writing the meta pages */
1094 /** Failed to update the meta page. Probably an I/O error. */
1095 #define MDB_FATAL_ERROR 0x80000000U
1096 /** Some fields are initialized. */
1097 #define MDB_ENV_ACTIVE 0x20000000U
1098 /** me_txkey is set */
1099 #define MDB_ENV_TXKEY 0x10000000U
1100 uint32_t me_flags; /**< @ref mdb_env */
1101 unsigned int me_psize; /**< DB page size, inited from me_os_psize */
1102 unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
1103 unsigned int me_maxreaders; /**< size of the reader table */
1104 unsigned int me_numreaders; /**< max numreaders set by this env */
1105 MDB_dbi me_numdbs; /**< number of DBs opened */
1106 MDB_dbi me_maxdbs; /**< size of the DB table */
1107 MDB_PID_T me_pid; /**< process ID of this env */
1108 char *me_path; /**< path to the DB files */
1109 char *me_map; /**< the memory map of the data file */
1110 MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
1111 MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
1112 void *me_pbuf; /**< scratch area for DUPSORT put() */
1113 MDB_txn *me_txn; /**< current write transaction */
1114 MDB_txn *me_txn0; /**< prealloc'd write transaction */
1115 size_t me_mapsize; /**< size of the data memory map */
1116 size_t me_size; /**< current file size */
1117 pgno_t me_maxpg; /**< me_mapsize / me_psize */
1118 MDB_dbx *me_dbxs; /**< array of static DB info */
1119 uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
1120 unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
1121 pthread_key_t me_txkey; /**< thread-key for readers */
1122 txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
1123 MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
1124 # define me_pglast me_pgstate.mf_pglast
1125 # define me_pghead me_pgstate.mf_pghead
1126 MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
1127 /** IDL of pages that became unused in a write txn */
1128 MDB_IDL me_free_pgs;
1129 /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
1130 MDB_ID2L me_dirty_list;
1131 /** Max number of freelist items that can fit in a single overflow page */
1133 /** Max size of a node on a page */
1134 unsigned int me_nodemax;
1135 #if !(MDB_MAXKEYSIZE)
1136 unsigned int me_maxkey; /**< max size of a key */
1138 int me_live_reader; /**< have liveness lock in reader table */
1140 int me_pidquery; /**< Used in OpenProcess */
1141 HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */
1143 #elif defined(MDB_USE_POSIX_SEM)
1144 sem_t *me_rmutex; /* Shared mutexes are not supported */
1147 void *me_userctx; /**< User-settable context */
1148 MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
1151 /** Nested transaction */
1152 typedef struct MDB_ntxn {
1153 MDB_txn mnt_txn; /**< the transaction */
1154 MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
1157 /** max number of pages to commit in one writev() call */
1158 #define MDB_COMMIT_PAGES 64
1159 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
1160 #undef MDB_COMMIT_PAGES
1161 #define MDB_COMMIT_PAGES IOV_MAX
1164 /** Max bytes to write in one call: 2GB (0x80000000), halved to 1GB
 * (0x40000000) when ssize_t is 4 bytes, so a single write's byte count
 * always fits in a (signed) ssize_t. */
1165 #define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4))
1167 /** Check \b txn and \b dbi arguments to a function.
 * True iff \b txn is non-NULL, \b dbi is within the txn's open-DB count,
 * and the handle's per-txn flag byte has #DB_VALID set. */
1168 #define TXN_DBI_EXIST(txn, dbi) \
1169 ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID))
1171 /** Check for misused \b dbi handles.
 * True when the sequence number the txn captured for \b dbi no longer
 * matches the environment's current one, i.e. the handle was closed or
 * reassigned behind the txn's back (reported as #MDB_BAD_DBI). */
1172 #define TXN_DBI_CHANGED(txn, dbi) \
1173 ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1175 static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
1176 static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
1177 static int mdb_page_touch(MDB_cursor *mc);
1179 static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl);
1180 static int mdb_page_search_root(MDB_cursor *mc,
1181 MDB_val *key, int modify);
1182 #define MDB_PS_MODIFY 1
1183 #define MDB_PS_ROOTONLY 2
1184 #define MDB_PS_FIRST 4
1185 #define MDB_PS_LAST 8
1186 static int mdb_page_search(MDB_cursor *mc,
1187 MDB_val *key, int flags);
1188 static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
1190 #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
1191 static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
1192 pgno_t newpgno, unsigned int nflags);
1194 static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
1195 static int mdb_env_pick_meta(const MDB_env *env);
1196 static int mdb_env_write_meta(MDB_txn *txn);
1197 #if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) /* Drop unused excl arg */
1198 # define mdb_env_close0(env, excl) mdb_env_close1(env)
1200 static void mdb_env_close0(MDB_env *env, int excl);
1202 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
1203 static int mdb_node_add(MDB_cursor *mc, indx_t indx,
1204 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
1205 static void mdb_node_del(MDB_cursor *mc, int ksize);
1206 static void mdb_node_shrink(MDB_page *mp, indx_t indx);
1207 static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
1208 static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
1209 static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
1210 static size_t mdb_branch_size(MDB_env *env, MDB_val *key);
1212 static int mdb_rebalance(MDB_cursor *mc);
1213 static int mdb_update_key(MDB_cursor *mc, MDB_val *key);
1215 static void mdb_cursor_pop(MDB_cursor *mc);
1216 static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);
1218 static int mdb_cursor_del0(MDB_cursor *mc);
1219 static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags);
1220 static int mdb_cursor_sibling(MDB_cursor *mc, int move_right);
1221 static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
1222 static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
1223 static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
1225 static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1226 static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1228 static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
1229 static void mdb_xcursor_init0(MDB_cursor *mc);
1230 static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
1232 static int mdb_drop0(MDB_cursor *mc, int subs);
1233 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
1236 static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
1240 static SECURITY_DESCRIPTOR mdb_null_sd;
1241 static SECURITY_ATTRIBUTES mdb_all_sa;
1242 static int mdb_sec_inited;
1245 /** Return the library version info. */
1247 mdb_version(int *major, int *minor, int *patch)
1249 if (major) *major = MDB_VERSION_MAJOR;
1250 if (minor) *minor = MDB_VERSION_MINOR;
1251 if (patch) *patch = MDB_VERSION_PATCH;
1252 return MDB_VERSION_STRING;
1255 /** Table of descriptions for LMDB @ref errors */
1256 static char *const mdb_errstr[] = {
1257 "MDB_KEYEXIST: Key/data pair already exists",
1258 "MDB_NOTFOUND: No matching key/data pair found",
1259 "MDB_PAGE_NOTFOUND: Requested page not found",
1260 "MDB_CORRUPTED: Located page was wrong type",
1261 "MDB_PANIC: Update of meta page failed",
1262 "MDB_VERSION_MISMATCH: Database environment version mismatch",
1263 "MDB_INVALID: File is not an LMDB file",
1264 "MDB_MAP_FULL: Environment mapsize limit reached",
1265 "MDB_DBS_FULL: Environment maxdbs limit reached",
1266 "MDB_READERS_FULL: Environment maxreaders limit reached",
1267 "MDB_TLS_FULL: Thread-local storage keys full - too many environments open",
1268 "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
1269 "MDB_CURSOR_FULL: Internal error - cursor stack limit reached",
1270 "MDB_PAGE_FULL: Internal error - page has no more space",
1271 "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
1272 "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
1273 "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
1274 "MDB_BAD_TXN: Transaction cannot recover - it must be aborted",
1275 "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
1276 "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
1280 mdb_strerror(int err)
1283 /** HACK: pad 4KB on stack over the buf. Return system msgs in buf.
1284 * This works as long as no function between the call to mdb_strerror
1285 * and the actual use of the message uses more than 4K of stack.
1288 char buf[1024], *ptr = buf;
1292 return ("Successful return: 0");
1294 if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) {
1295 i = err - MDB_KEYEXIST;
1296 return mdb_errstr[i];
1300 /* These are the C-runtime error codes we use. The comment indicates
1301 * their numeric value, and the Win32 error they would correspond to
1302 * if the error actually came from a Win32 API. A major mess, we should
1303 * have used LMDB-specific error codes for everything.
1306 case ENOENT: /* 2, FILE_NOT_FOUND */
1307 case EIO: /* 5, ACCESS_DENIED */
1308 case ENOMEM: /* 12, INVALID_ACCESS */
1309 case EACCES: /* 13, INVALID_DATA */
1310 case EBUSY: /* 16, CURRENT_DIRECTORY */
1311 case EINVAL: /* 22, BAD_COMMAND */
1312 case ENOSPC: /* 28, OUT_OF_PAPER */
1313 return strerror(err);
1318 FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM |
1319 FORMAT_MESSAGE_IGNORE_INSERTS,
1320 NULL, err, 0, ptr, sizeof(buf), (va_list *)pad);
1323 return strerror(err);
1327 /** assert(3) variant in cursor context */
1328 #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr)
1329 /** assert(3) variant in transaction context */
1330 #define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr)
1331 /** assert(3) variant in environment context */
1332 #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr)
1335 # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
1336 mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__))
1339 mdb_assert_fail(MDB_env *env, const char *expr_txt,
1340 const char *func, const char *file, int line)
1343 sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
1344 file, line, expr_txt, func);
1345 if (env->me_assert_func)
1346 env->me_assert_func(env, buf);
1347 fprintf(stderr, "%s\n", buf);
1351 # define mdb_assert0(env, expr, expr_txt) ((void) 0)
1355 /** Return the page number of \b mp which may be a sub-page, for debug output */
1357 mdb_dbg_pgno(MDB_page *mp)
1360 COPY_PGNO(ret, mp->mp_pgno);
1364 /** Display a key in hexadecimal and return the address of the result.
1365 * @param[in] key the key to display
1366 * @param[in] buf the buffer to write into. Should always be #DKBUF.
1367 * @return The key in hexadecimal form.
1370 mdb_dkey(MDB_val *key, char *buf)
1373 unsigned char *c = key->mv_data;
1379 if (key->mv_size > DKBUF_MAXKEYSIZE)
1380 return "MDB_MAXKEYSIZE";
1381 /* may want to make this a dynamic check: if the key is mostly
1382 * printable characters, print it as-is instead of converting to hex.
1386 for (i=0; i<key->mv_size; i++)
1387 ptr += sprintf(ptr, "%02x", *c++);
1389 sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1395 mdb_leafnode_type(MDB_node *n)
1397 static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
1398 return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
1399 tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
1402 /** Display all the keys in the page. */
1404 mdb_page_list(MDB_page *mp)
1406 pgno_t pgno = mdb_dbg_pgno(mp);
1407 const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "";
1409 unsigned int i, nkeys, nsize, total = 0;
1413 switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) {
1414 case P_BRANCH: type = "Branch page"; break;
1415 case P_LEAF: type = "Leaf page"; break;
1416 case P_LEAF|P_SUBP: type = "Sub-page"; break;
1417 case P_LEAF|P_LEAF2: type = "LEAF2 page"; break;
1418 case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break;
1420 fprintf(stderr, "Overflow page %"Z"u pages %u%s\n",
1421 pgno, mp->mp_pages, state);
1424 fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n",
1425 pgno, ((MDB_meta *)METADATA(mp))->mm_txnid);
1428 fprintf(stderr, "Bad page %"Z"u flags 0x%u\n", pgno, mp->mp_flags);
1432 nkeys = NUMKEYS(mp);
1433 fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state);
1435 for (i=0; i<nkeys; i++) {
1436 if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
1437 key.mv_size = nsize = mp->mp_pad;
1438 key.mv_data = LEAF2KEY(mp, i, nsize);
1440 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
1443 node = NODEPTR(mp, i);
1444 key.mv_size = node->mn_ksize;
1445 key.mv_data = node->mn_data;
1446 nsize = NODESIZE + key.mv_size;
1447 if (IS_BRANCH(mp)) {
1448 fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node),
1452 if (F_ISSET(node->mn_flags, F_BIGDATA))
1453 nsize += sizeof(pgno_t);
1455 nsize += NODEDSZ(node);
1457 nsize += sizeof(indx_t);
1458 fprintf(stderr, "key %d: nsize %d, %s%s\n",
1459 i, nsize, DKEY(&key), mdb_leafnode_type(node));
1461 total = EVEN(total);
1463 fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1464 IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1468 mdb_cursor_chk(MDB_cursor *mc)
1474 if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return;
1475 for (i=0; i<mc->mc_top; i++) {
1477 node = NODEPTR(mp, mc->mc_ki[i]);
1478 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1481 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1487 /** Count all the pages in each DB and in the freelist
1488 * and make sure it matches the actual number of pages
1490 * All named DBs must be open for a correct count.
1492 static void mdb_audit(MDB_txn *txn)
1496 MDB_ID freecount, count;
1501 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
1502 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
1503 freecount += *(MDB_ID *)data.mv_data;
1504 mdb_tassert(txn, rc == MDB_NOTFOUND);
1507 for (i = 0; i<txn->mt_numdbs; i++) {
1509 if (!(txn->mt_dbflags[i] & DB_VALID))
1511 mdb_cursor_init(&mc, txn, i, &mx);
1512 if (txn->mt_dbs[i].md_root == P_INVALID)
1514 count += txn->mt_dbs[i].md_branch_pages +
1515 txn->mt_dbs[i].md_leaf_pages +
1516 txn->mt_dbs[i].md_overflow_pages;
1517 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
1518 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST);
1519 for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) {
1522 mp = mc.mc_pg[mc.mc_top];
1523 for (j=0; j<NUMKEYS(mp); j++) {
1524 MDB_node *leaf = NODEPTR(mp, j);
1525 if (leaf->mn_flags & F_SUBDATA) {
1527 memcpy(&db, NODEDATA(leaf), sizeof(db));
1528 count += db.md_branch_pages + db.md_leaf_pages +
1529 db.md_overflow_pages;
1533 mdb_tassert(txn, rc == MDB_NOTFOUND);
1536 if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) {
1537 fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n",
1538 txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno);
1544 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1546 return txn->mt_dbxs[dbi].md_cmp(a, b);
1550 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1552 return txn->mt_dbxs[dbi].md_dcmp(a, b);
1555 /** Allocate memory for a page.
1556 * Re-use old malloc'd pages first for singletons, otherwise just malloc.
1559 mdb_page_malloc(MDB_txn *txn, unsigned num)
1561 MDB_env *env = txn->mt_env;
1562 MDB_page *ret = env->me_dpages;
1563 size_t psize = env->me_psize, sz = psize, off;
1564 /* For ! #MDB_NOMEMINIT, psize counts how much to init.
1565 * For a single page alloc, we init everything after the page header.
1566 * For multi-page, we init the final page; if the caller needed that
1567 * many pages they will be filling in at least up to the last page.
1571 VGMEMP_ALLOC(env, ret, sz);
1572 VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
1573 env->me_dpages = ret->mp_next;
1576 psize -= off = PAGEHDRSZ;
1581 if ((ret = malloc(sz)) != NULL) {
1582 VGMEMP_ALLOC(env, ret, sz);
1583 if (!(env->me_flags & MDB_NOMEMINIT)) {
1584 memset((char *)ret + off, 0, psize);
1588 txn->mt_flags |= MDB_TXN_ERROR;
1592 /** Free a single page.
1593 * Saves single pages to a list, for future reuse.
1594 * (This is not used for multi-page overflow pages.)
1597 mdb_page_free(MDB_env *env, MDB_page *mp)
1599 mp->mp_next = env->me_dpages;
1600 VGMEMP_FREE(env, mp);
1601 env->me_dpages = mp;
1604 /** Free a dirty page */
1606 mdb_dpage_free(MDB_env *env, MDB_page *dp)
1608 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
1609 mdb_page_free(env, dp);
1611 /* large pages just get freed directly */
1612 VGMEMP_FREE(env, dp);
1617 /** Return all dirty pages to dpage list */
1619 mdb_dlist_free(MDB_txn *txn)
1621 MDB_env *env = txn->mt_env;
1622 MDB_ID2L dl = txn->mt_u.dirty_list;
1623 unsigned i, n = dl[0].mid;
1625 for (i = 1; i <= n; i++) {
1626 mdb_dpage_free(env, dl[i].mptr);
1631 /** Loosen or free a single page.
1632 * Saves single pages to a list for future reuse
1633 * in this same txn. It has been pulled from the freeDB
1634 * and already resides on the dirty list, but has been
1635 * deleted. Use these pages first before pulling again
1638 * If the page wasn't dirtied in this txn, just add it
1639 * to this txn's free list.
1642 mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
1645 pgno_t pgno = mp->mp_pgno;
1646 MDB_txn *txn = mc->mc_txn;
1648 if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
1649 if (txn->mt_parent) {
1650 MDB_ID2 *dl = txn->mt_u.dirty_list;
1651 /* If txn has a parent, make sure the page is in our
1655 unsigned x = mdb_mid2l_search(dl, pgno);
1656 if (x <= dl[0].mid && dl[x].mid == pgno) {
1657 if (mp != dl[x].mptr) { /* bad cursor? */
1658 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
1659 txn->mt_flags |= MDB_TXN_ERROR;
1660 return MDB_CORRUPTED;
1667 /* no parent txn, so it's just ours */
1672 DPRINTF(("loosen db %d page %"Z"u", DDBI(mc),
1674 NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
1675 txn->mt_loose_pgs = mp;
1676 txn->mt_loose_count++;
1677 mp->mp_flags |= P_LOOSE;
1679 int rc = mdb_midl_append(&txn->mt_free_pgs, pgno);
1687 /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1688 * @param[in] mc A cursor handle for the current operation.
1689 * @param[in] pflags Flags of the pages to update:
1690 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
1691 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
1692 * @return 0 on success, non-zero on failure.
1695 mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1697 enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
1698 MDB_txn *txn = mc->mc_txn;
1704 int rc = MDB_SUCCESS, level;
1706 /* Mark pages seen by cursors */
1707 if (mc->mc_flags & C_UNTRACK)
1708 mc = NULL; /* will find mc in mt_cursors */
1709 for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
1710 for (; mc; mc=mc->mc_next) {
1711 if (!(mc->mc_flags & C_INITIALIZED))
1713 for (m3 = mc;; m3 = &mx->mx_cursor) {
1715 for (j=0; j<m3->mc_snum; j++) {
1717 if ((mp->mp_flags & Mask) == pflags)
1718 mp->mp_flags ^= P_KEEP;
1720 mx = m3->mc_xcursor;
1721 /* Proceed to mx if it is at a sub-database */
1722 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
1724 if (! (mp && (mp->mp_flags & P_LEAF)))
1726 leaf = NODEPTR(mp, m3->mc_ki[j-1]);
1727 if (!(leaf->mn_flags & F_SUBDATA))
1736 /* Mark dirty root pages */
1737 for (i=0; i<txn->mt_numdbs; i++) {
1738 if (txn->mt_dbflags[i] & DB_DIRTY) {
1739 pgno_t pgno = txn->mt_dbs[i].md_root;
1740 if (pgno == P_INVALID)
1742 if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
1744 if ((dp->mp_flags & Mask) == pflags && level <= 1)
1745 dp->mp_flags ^= P_KEEP;
1753 static int mdb_page_flush(MDB_txn *txn, int keep);
1755 /** Spill pages from the dirty list back to disk.
1756 * This is intended to prevent running into #MDB_TXN_FULL situations,
1757 * but note that they may still occur in a few cases:
1758 * 1) our estimate of the txn size could be too small. Currently this
1759 * seems unlikely, except with a large number of #MDB_MULTIPLE items.
1760 * 2) child txns may run out of space if their parents dirtied a
1761 * lot of pages and never spilled them. TODO: we probably should do
1762 * a preemptive spill during #mdb_txn_begin() of a child txn, if
1763 * the parent's dirty_room is below a given threshold.
1765 * Otherwise, if not using nested txns, it is expected that apps will
1766 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
1767 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
1768 * If the txn never references them again, they can be left alone.
1769 * If the txn only reads them, they can be used without any fuss.
1770 * If the txn writes them again, they can be dirtied immediately without
1771 * going thru all of the work of #mdb_page_touch(). Such references are
1772 * handled by #mdb_page_unspill().
1774 * Also note, we never spill DB root pages, nor pages of active cursors,
1775 * because we'll need these back again soon anyway. And in nested txns,
1776 * we can't spill a page in a child txn if it was already spilled in a
1777 * parent txn. That would alter the parent txns' data even though
1778 * the child hasn't committed yet, and we'd have no way to undo it if
1779 * the child aborted.
1781 * @param[in] m0 cursor A cursor handle identifying the transaction and
1782 * database for which we are checking space.
1783 * @param[in] key For a put operation, the key being stored.
1784 * @param[in] data For a put operation, the data being stored.
1785 * @return 0 on success, non-zero on failure.
1788 mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
1790 MDB_txn *txn = m0->mc_txn;
1792 MDB_ID2L dl = txn->mt_u.dirty_list;
1793 unsigned int i, j, need;
1796 if (m0->mc_flags & C_SUB)
1799 /* Estimate how much space this op will take */
1800 i = m0->mc_db->md_depth;
1801 /* Named DBs also dirty the main DB */
1802 if (m0->mc_dbi > MAIN_DBI)
1803 i += txn->mt_dbs[MAIN_DBI].md_depth;
1804 /* For puts, roughly factor in the key+data size */
1806 i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
1807 i += i; /* double it for good measure */
1810 if (txn->mt_dirty_room > i)
1813 if (!txn->mt_spill_pgs) {
1814 txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX);
1815 if (!txn->mt_spill_pgs)
1818 /* purge deleted slots */
1819 MDB_IDL sl = txn->mt_spill_pgs;
1820 unsigned int num = sl[0];
1822 for (i=1; i<=num; i++) {
1829 /* Preserve pages which may soon be dirtied again */
1830 if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
1833 /* Less aggressive spill - we originally spilled the entire dirty list,
1834 * with a few exceptions for cursor pages and DB root pages. But this
1835 * turns out to be a lot of wasted effort because in a large txn many
1836 * of those pages will need to be used again. So now we spill only 1/8th
1837 * of the dirty pages. Testing revealed this to be a good tradeoff,
1838 * better than 1/2, 1/4, or 1/10.
1840 if (need < MDB_IDL_UM_MAX / 8)
1841 need = MDB_IDL_UM_MAX / 8;
1843 /* Save the page IDs of all the pages we're flushing */
1844 /* flush from the tail forward, this saves a lot of shifting later on. */
1845 for (i=dl[0].mid; i && need; i--) {
1846 MDB_ID pn = dl[i].mid << 1;
1848 if (dp->mp_flags & (P_LOOSE|P_KEEP))
1850 /* Can't spill twice, make sure it's not already in a parent's
1853 if (txn->mt_parent) {
1855 for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
1856 if (tx2->mt_spill_pgs) {
1857 j = mdb_midl_search(tx2->mt_spill_pgs, pn);
1858 if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
1859 dp->mp_flags |= P_KEEP;
1867 if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn)))
1871 mdb_midl_sort(txn->mt_spill_pgs);
1873 /* Flush the spilled part of dirty list */
1874 if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS)
1877 /* Reset any dirty pages we kept that page_flush didn't see */
1878 rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
1881 txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
1885 /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */
1887 mdb_find_oldest(MDB_txn *txn)
1890 txnid_t mr, oldest = txn->mt_txnid - 1;
1891 if (txn->mt_env->me_txns) {
1892 MDB_reader *r = txn->mt_env->me_txns->mti_readers;
1893 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
1904 /** Add a page to the txn's dirty list */
1906 mdb_page_dirty(MDB_txn *txn, MDB_page *mp)
1909 int rc, (*insert)(MDB_ID2L, MDB_ID2 *);
1911 if (txn->mt_env->me_flags & MDB_WRITEMAP) {
1912 insert = mdb_mid2l_append;
1914 insert = mdb_mid2l_insert;
1916 mid.mid = mp->mp_pgno;
1918 rc = insert(txn->mt_u.dirty_list, &mid);
1919 mdb_tassert(txn, rc == 0);
1920 txn->mt_dirty_room--;
1923 /** Allocate page numbers and memory for writing. Maintain me_pglast,
1924 * me_pghead and mt_next_pgno.
1926 * If there are free pages available from older transactions, they
1927 * are re-used first. Otherwise allocate a new page at mt_next_pgno.
1928 * Do not modify the freeDB, just merge freeDB records into me_pghead[]
1929 * and move me_pglast to say which records were consumed. Only this
1930 * function can create me_pghead and move me_pglast/mt_next_pgno.
1931 * @param[in] mc cursor A cursor handle identifying the transaction and
1932 * database for which we are allocating.
1933 * @param[in] num the number of pages to allocate.
1934 * @param[out] mp Address of the allocated page(s). Requests for multiple pages
1935 * will always be satisfied by a single contiguous chunk of memory.
1936 * @return 0 on success, non-zero on failure.
/* Allocate num page(s) for a write txn.  Preference order, as visible
 * below: (1) a loose page already owned by this txn (num==1 only);
 * (2) a contiguous run from the in-memory free list me_pghead, which
 * is refilled here from freeDB records whose txnid is older than the
 * oldest reader; (3) fresh pages from the end of the map, advancing
 * mt_next_pgno.  Any failure marks the txn MDB_TXN_ERROR.
 * NOTE(review): this excerpt is elided — lines are missing between
 * statements, so the control flow shown here is incomplete.
 */
1939 mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1941 #ifdef MDB_PARANOID	/* Seems like we can ignore this now */
1942 /* Get at most <Max_retries> more freeDB records once me_pghead
1943 * has enough pages. If not enough, use new pages from the map.
1944 * If <Paranoid> and mc is updating the freeDB, only get new
1945 * records if me_pghead is empty. Then the freelist cannot play
1946 * catch-up with itself by growing while trying to save it.
1948 enum { Paranoid = 1, Max_retries = 500 };
1950 enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
1952 int rc, retry = num * 60;
1953 MDB_txn *txn = mc->mc_txn;
1954 MDB_env *env = txn->mt_env;
/* mop is the free-page IDL; mop[0] holds its length (MDB_IDL convention). */
1955 pgno_t pgno, *mop = env->me_pghead;
1956 unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
1958 txnid_t oldest = 0, last;
1963 /* If there are any loose pages, just use them */
1964 if (num == 1 && txn->mt_loose_pgs) {
1965 np = txn->mt_loose_pgs;
1966 txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
1967 txn->mt_loose_count--;
1968 DPRINTF(("db %d use loose page %"Z"u", DDBI(mc),
1976 /* If our dirty list is already full, we can't do anything */
1977 if (txn->mt_dirty_room == 0) {
/* Main loop: look for a fitting run in mop; on MDB_FIRST also set up
 * a freeDB cursor so later iterations can fetch and merge more records.
 */
1982 for (op = MDB_FIRST;; op = MDB_NEXT) {
1987 /* Seek a big enough contiguous page range. Prefer
1988 * pages at the tail, just truncating the list.
1994 if (mop[i-n2] == pgno+n2)
2001 if (op == MDB_FIRST) { /* 1st iteration */
2002 /* Prepare to fetch more and coalesce */
2003 last = env->me_pglast;
2004 oldest = env->me_pgoldest;
2005 mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
2008 key.mv_data = &last; /* will look up last+1 */
2009 key.mv_size = sizeof(last);
2011 if (Paranoid && mc->mc_dbi == FREE_DBI)
2014 if (Paranoid && retry < 0 && mop_len)
2018 /* Do not fetch more if the record will be too recent */
/* Only records with txnid < oldest reader may be recycled; refresh
 * the cached oldest txnid before concluding the record is too new. */
2019 if (oldest <= last) {
2021 oldest = mdb_find_oldest(txn);
2022 env->me_pgoldest = oldest;
2028 rc = mdb_cursor_get(&m2, &key, NULL, op);
2030 if (rc == MDB_NOTFOUND)
2034 last = *(txnid_t*)key.mv_data;
2035 if (oldest <= last) {
2037 oldest = mdb_find_oldest(txn);
2038 env->me_pgoldest = oldest;
2044 np = m2.mc_pg[m2.mc_top];
2045 leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
2046 if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)
2049 idl = (MDB_ID *) data.mv_data;
/* First record read: allocate me_pghead; later reads grow it. */
2052 if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
2057 if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
2059 mop = env->me_pghead;
2061 env->me_pglast = last;
2063 DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
2064 last, txn->mt_dbs[FREE_DBI].md_root, i));
2066 DPRINTF(("IDL %"Z"u", idl[j]));
2068 /* Merge in descending sorted order */
2069 mdb_midl_xmerge(mop, idl);
2073 /* Use new pages from the map when nothing suitable in the freeDB */
2075 pgno = txn->mt_next_pgno;
2076 if (pgno + num >= env->me_maxpg) {
2077 DPUTS("DB size maxed out");
2083 if (env->me_flags & MDB_WRITEMAP) {
/* WRITEMAP: the page lives in the map itself; otherwise malloc a
 * shadow page that will be written back at commit. */
2084 np = (MDB_page *)(env->me_map + env->me_psize * pgno);
2086 if (!(np = mdb_page_malloc(txn, num))) {
/* Satisfied from mop: shrink the list by the num consumed entries. */
2092 mop[0] = mop_len -= num;
2093 /* Move any stragglers down */
2094 for (j = i-num; j < mop_len; )
2095 mop[++j] = mop[++i];
2097 txn->mt_next_pgno = pgno + num;
2100 mdb_page_dirty(txn, np);
/* Failure path: poison the txn so it can only be aborted. */
2106 txn->mt_flags |= MDB_TXN_ERROR;
2110 /** Copy the used portions of a non-overflow page.
2111 * @param[in] dst page to copy into
2112 * @param[in] src page to copy from
2113 * @param[in] psize size of a page
/* Copy a page, skipping the unused gap between the node pointers
 * (below mp_lower) and the node data (above mp_upper) when possible.
 * NOTE(review): elided excerpt; some lines of the original are missing.
 */
2116 mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
2118 enum { Align = sizeof(pgno_t) };
2119 indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;
2121 /* If page isn't full, just copy the used portion. Adjust
2122 * alignment so memcpy may copy words instead of bytes.
/* unused &= -Align rounds the gap down to an Align multiple; LEAF2
 * pages take the single-copy path below — presumably their data is
 * packed from the front, so the gap trick does not apply (verify). */
2124 if ((unused &= -Align) && !IS_LEAF2(src)) {
2125 upper = (upper + PAGEBASE) & -Align;
2126 memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
2127 memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
/* Full (or LEAF2) page: one copy of everything but the aligned gap. */
2130 memcpy(dst, src, psize - unused);
2134 /** Pull a page off the txn's spill list, if present.
2135 * If a page being referenced was spilled to disk in this txn, bring
2136 * it back and make it dirty/writable again.
2137 * @param[in] txn the transaction handle.
2138 * @param[in] mp the page being referenced. It must not be dirty.
2139 * @param[out] ret the writable page, if any. ret is unchanged if
2140 * mp wasn't spilled.
/* Un-spill a page: search this txn and its ancestors' spill lists for
 * the page, copy it back into a dirty/writable page, and remove it
 * from the current txn's spill list (parents keep theirs until the
 * child commits).  NOTE(review): elided excerpt; lines are missing.
 */
2143 mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
2145 MDB_env *env = txn->mt_env;
/* Spill-list entries are pgno<<1; the LSB marks a deleted entry. */
2148 pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
2150 for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
2151 if (!tx2->mt_spill_pgs)
2153 x = mdb_midl_search(tx2->mt_spill_pgs, pn);
2154 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
2157 if (txn->mt_dirty_room == 0)
2158 return MDB_TXN_FULL;
2159 if (IS_OVERFLOW(mp))
2163 if (env->me_flags & MDB_WRITEMAP) {
2166 np = mdb_page_malloc(txn, num);
/* Overflow pages (num > 1) need a raw multi-page copy; a single page
 * can use mdb_page_copy's gap-skipping copy. */
2170 memcpy(np, mp, num * env->me_psize);
2172 mdb_page_copy(np, mp, env->me_psize);
2175 /* If in current txn, this page is no longer spilled.
2176 * If it happens to be the last page, truncate the spill list.
2177 * Otherwise mark it as deleted by setting the LSB.
2179 if (x == txn->mt_spill_pgs[0])
2180 txn->mt_spill_pgs[0]--;
2182 txn->mt_spill_pgs[x] |= 1;
2183 } /* otherwise, if belonging to a parent txn, the
2184 * page remains spilled until child commits
2187 mdb_page_dirty(txn, np);
2188 np->mp_flags |= P_DIRTY;
2196 /** Touch a page: make it dirty and re-insert into tree with updated pgno.
2197 * @param[in] mc cursor pointing to the page to be touched
2198 * @return 0 on success, non-zero on failure.
/* Make the cursor's current page writable (copy-on-write): un-spill it,
 * or allocate a fresh page, copy the contents, free the old pgno, and
 * repoint the parent node and all cursors at the new page.
 * NOTE(review): elided excerpt; lines are missing throughout.
 */
2201 mdb_page_touch(MDB_cursor *mc)
2203 MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
2204 MDB_txn *txn = mc->mc_txn;
2205 MDB_cursor *m2, *m3;
2209 if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
2210 if (txn->mt_flags & MDB_TXN_SPILLS) {
2212 rc = mdb_page_unspill(txn, mp, &np);
/* Clean page: reserve room in mt_free_pgs first so that recording the
 * old pgno below cannot fail after the allocation succeeded. */
2218 if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
2219 (rc = mdb_page_alloc(mc, 1, &np)))
2222 DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
2223 mp->mp_pgno, pgno));
2224 mdb_cassert(mc, mp->mp_pgno != pgno);
2225 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2226 /* Update the parent page, if any, to point to the new page */
2228 MDB_page *parent = mc->mc_pg[mc->mc_top-1];
2229 MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
2230 SETPGNO(node, pgno);
2232 mc->mc_db->md_root = pgno;
/* Nested txn: page may be dirty in an ancestor; shadow it into this
 * txn's own dirty list (keeping the same pgno). */
2234 } else if (txn->mt_parent && !IS_SUBP(mp)) {
2235 MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
2237 /* If txn has a parent, make sure the page is in our
2241 unsigned x = mdb_mid2l_search(dl, pgno);
2242 if (x <= dl[0].mid && dl[x].mid == pgno) {
2243 if (mp != dl[x].mptr) { /* bad cursor? */
2244 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2245 txn->mt_flags |= MDB_TXN_ERROR;
2246 return MDB_CORRUPTED;
2251 mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
2253 np = mdb_page_malloc(txn, 1);
2258 rc = mdb_mid2l_insert(dl, &mid);
2259 mdb_cassert(mc, rc == 0);
2264 mdb_page_copy(np, mp, txn->mt_env->me_psize);
2266 np->mp_flags |= P_DIRTY;
2269 /* Adjust cursors pointing to mp */
2270 mc->mc_pg[mc->mc_top] = np;
2271 m2 = txn->mt_cursors[mc->mc_dbi];
2272 if (mc->mc_flags & C_SUB) {
/* Sub-cursor case: tracked cursors are the parents; fix their xcursors. */
2273 for (; m2; m2=m2->mc_next) {
2274 m3 = &m2->mc_xcursor->mx_cursor;
2275 if (m3->mc_snum < mc->mc_snum) continue;
2276 if (m3->mc_pg[mc->mc_top] == mp)
2277 m3->mc_pg[mc->mc_top] = np;
2280 for (; m2; m2=m2->mc_next) {
2281 if (m2->mc_snum < mc->mc_snum) continue;
2282 if (m2->mc_pg[mc->mc_top] == mp) {
2283 m2->mc_pg[mc->mc_top] = np;
/* DUPSORT inline sub-page: its xcursor points into the page we just
 * replaced, so re-derive it from the new page's node data. */
2284 if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
2286 m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
2288 MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]);
2289 if (!(leaf->mn_flags & F_SUBDATA))
2290 m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
2298 txn->mt_flags |= MDB_TXN_ERROR;
2302 /* internal env_sync flags: */
2303 #define FORCE 1 /* as before, force a flush */
2304 #define FGREW 0x8000 /* file has grown, do a full fsync instead of just
2305 fdatasync. We shouldn't have to do this, according to the POSIX spec.
2306 But common Linux FSs violate the spec and won't sync required metadata
2307 correctly when the file grows. This only makes a difference if the
2308 platform actually distinguishes fdatasync from fsync.
2309 http://www.openldap.org/lists/openldap-devel/201411/msg00000.html */
/* Internal sync: flush map or file per env flags.  'flag' carries the
 * internal FORCE/FGREW bits defined above this function.
 * NOTE(review): elided excerpt; error-handling lines are missing.
 */
2312 mdb_env_sync0(MDB_env *env, int flag)
2314 int rc = 0, force = flag & FORCE;
/* Skip entirely when MDB_NOSYNC is set and the caller did not force. */
2315 if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
2316 if (env->me_flags & MDB_WRITEMAP) {
/* MAPASYNC downgrades to MS_ASYNC unless forced. */
2317 int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
2318 ? MS_ASYNC : MS_SYNC;
2319 if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
2322 else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
2326 #ifdef HAVE_FDATASYNC
2328 if (fsync(env->me_fd)) /* Avoid ext-fs bugs, do full sync */
2332 if (MDB_FDATASYNC(env->me_fd))
/* Public sync entry point: normalize the caller's force value to the
 * FORCE bit (0 or 1) and delegate to mdb_env_sync0. */
2340 mdb_env_sync(MDB_env *env, int force)
2342 return mdb_env_sync0(env, force != 0);
2345 /** Back up parent txn's cursors, then grab the originals for tracking */
/* For each DBI, back up the parent txn's tracked cursors and re-home
 * the originals onto the child txn dst.  NOTE(review): elided excerpt —
 * the backup allocation and copy lines are missing from this view.
 */
2347 mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
2349 MDB_cursor *mc, *bk;
2354 for (i = src->mt_numdbs; --i >= 0; ) {
2355 if ((mc = src->mt_cursors[i]) != NULL) {
/* Backup allocation is sized for the cursor plus, when present, its
 * xcursor (stored immediately after the backup, see bk+1 below). */
2356 size = sizeof(MDB_cursor);
2358 size += sizeof(MDB_xcursor);
2359 for (; mc; mc = bk->mc_next) {
2365 mc->mc_db = &dst->mt_dbs[i];
2366 /* Kill pointers into src - and dst to reduce abuse: The
2367 * user may not use mc until dst ends. Otherwise we'd...
2369 mc->mc_txn = NULL; /* ...set this to dst */
2370 mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */
2371 if ((mx = mc->mc_xcursor) != NULL) {
2372 *(MDB_xcursor *)(bk+1) = *mx;
2373 mx->mx_cursor.mc_txn = NULL; /* ...and dst. */
/* Push the cursor onto dst's per-DBI tracking list. */
2375 mc->mc_next = dst->mt_cursors[i];
2376 dst->mt_cursors[i] = mc;
2383 /** Close this write txn's cursors, give parent txn's cursors back to parent.
2384 * @param[in] txn the transaction handle.
2385 * @param[in] merge true to keep changes to parent cursors, false to revert.
2386 * @return 0 on success, non-zero on failure.
/* Close/restore tracked cursors at txn end.  Cursors with a backup
 * (created before a nested txn began) are either committed to the
 * parent's state (merge != 0) or restored from the backup (abort).
 * NOTE(review): elided excerpt; freeing of cursor memory is not shown.
 */
2389 mdb_cursors_close(MDB_txn *txn, unsigned merge)
2391 MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk;
2395 for (i = txn->mt_numdbs; --i >= 0; ) {
2396 for (mc = cursors[i]; mc; mc = next) {
2398 if ((bk = mc->mc_backup) != NULL) {
2400 /* Commit changes to parent txn */
2401 mc->mc_next = bk->mc_next;
2402 mc->mc_backup = bk->mc_backup;
2403 mc->mc_txn = bk->mc_txn;
2404 mc->mc_db = bk->mc_db;
2405 mc->mc_dbflag = bk->mc_dbflag;
2406 if ((mx = mc->mc_xcursor) != NULL)
2407 mx->mx_cursor.mc_txn = bk->mc_txn;
2409 /* Abort nested txn */
/* Restore the saved xcursor copied next to the backup (see bk+1
 * in mdb_cursor_shadow). */
2411 if ((mx = mc->mc_xcursor) != NULL)
2412 *mx = *(MDB_xcursor *)(bk+1);
2416 /* Only malloced cursors are permanently tracked. */
2424 #define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn)
2427 mdb_txn_reset0(MDB_txn *txn, const char *act);
2429 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2435 Pidset = F_SETLK, Pidcheck = F_GETLK
2439 /** Set or check a pid lock. Set returns 0 on success.
2440 * Check returns 0 if the process is certainly dead, nonzero if it may
2441 * be alive (the lock exists or an error happened so we do not know).
2443 * On Windows Pidset is a no-op, we merely check for the existence
2444 * of the process with the given pid. On POSIX we use a single byte
2445 * lock on the lockfile, set at an offset equal to the pid.
/* Set or check a per-process reader lock (see doc comment above).
 * NOTE(review): elided excerpt; the Windows handle-close path and the
 * final return are not visible here.
 */
2448 mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
2450 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2453 if (op == Pidcheck) {
2454 h = OpenProcess(env->me_pidquery, FALSE, pid);
2455 /* No documented "no such process" code, but other program use this: */
2457 return ErrCode() != ERROR_INVALID_PARAMETER;
2458 /* A process exists until all handles to it close. Has it exited? */
2459 ret = WaitForSingleObject(h, 0) != 0;
2466 struct flock lock_info;
2467 memset(&lock_info, 0, sizeof(lock_info));
/* POSIX: a one-byte write lock at file offset == pid in the lockfile. */
2468 lock_info.l_type = F_WRLCK;
2469 lock_info.l_whence = SEEK_SET;
2470 lock_info.l_start = pid;
2471 lock_info.l_len = 1;
2472 if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
/* F_GETLK with l_type != F_UNLCK means some process holds the lock,
 * i.e. the pid may still be alive. */
2473 if (op == F_GETLK && lock_info.l_type != F_UNLCK)
2475 } else if ((rc = ErrCode()) == EINTR) {
2483 /** Common code for #mdb_txn_begin() and #mdb_txn_renew().
2484 * @param[in] txn the transaction handle to initialize
2485 * @return 0 on success, non-zero on failure.
/* Initialize a txn from the current meta page: acquire a reader slot
 * (read txns) or set up dirty/free lists (write txns), then copy DB
 * info and flags from the chosen meta.  NOTE(review): elided excerpt;
 * locking around the write-txn path and several branches are missing.
 */
2488 mdb_txn_renew0(MDB_txn *txn)
2490 MDB_env *env = txn->mt_env;
2491 MDB_txninfo *ti = env->me_txns;
2495 int rc, new_notls = 0;
2497 if (txn->mt_flags & MDB_TXN_RDONLY) {
2499 txn->mt_numdbs = env->me_numdbs;
2500 txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
/* No shared txn table (read-only filesystem / no lockfile): read the
 * snapshot directly from the newest meta page. */
2502 meta = env->me_metas[ mdb_env_pick_meta(env) ];
2503 txn->mt_txnid = meta->mm_txnid;
2504 txn->mt_u.reader = NULL;
/* With MDB_NOTLS the slot is carried in the txn; otherwise it lives in
 * thread-local storage keyed by me_txkey. */
2506 MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
2507 pthread_getspecific(env->me_txkey);
2509 if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
2510 return MDB_BAD_RSLOT;
2512 MDB_PID_T pid = env->me_pid;
2513 MDB_THR_T tid = pthread_self();
/* First reader in this process: take the pid lock once. */
2515 if (!env->me_live_reader) {
2516 rc = mdb_reader_pid(env, Pidset, pid);
2519 env->me_live_reader = 1;
/* Claim a free slot in the reader table (mutex held here). */
2523 nr = ti->mti_numreaders;
2524 for (i=0; i<nr; i++)
2525 if (ti->mti_readers[i].mr_pid == 0)
2527 if (i == env->me_maxreaders) {
2528 UNLOCK_MUTEX_R(env);
2529 return MDB_READERS_FULL;
2531 ti->mti_readers[i].mr_pid = pid;
2532 ti->mti_readers[i].mr_tid = tid;
2534 ti->mti_numreaders = ++nr;
2535 /* Save numreaders for un-mutexed mdb_env_close() */
2536 env->me_numreaders = nr;
2537 UNLOCK_MUTEX_R(env);
2539 r = &ti->mti_readers[i];
2540 new_notls = (env->me_flags & MDB_NOTLS);
2541 if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
/* Publishing mr_txnid in the slot pins this snapshot against reuse. */
2546 txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
2547 txn->mt_u.reader = r;
2548 meta = env->me_metas[txn->mt_txnid & 1];
2554 txn->mt_txnid = ti->mti_txnid;
2555 meta = env->me_metas[txn->mt_txnid & 1];
2557 meta = env->me_metas[ mdb_env_pick_meta(env) ];
2558 txn->mt_txnid = meta->mm_txnid;
2561 txn->mt_numdbs = env->me_numdbs;
2564 if (txn->mt_txnid == mdb_debug_start)
/* Write txn: reset per-txn page bookkeeping (reuses env's arrays). */
2568 txn->mt_child = NULL;
2569 txn->mt_loose_pgs = NULL;
2570 txn->mt_loose_count = 0;
2571 txn->mt_dirty_room = MDB_IDL_UM_MAX;
2572 txn->mt_u.dirty_list = env->me_dirty_list;
2573 txn->mt_u.dirty_list[0].mid = 0;
2574 txn->mt_free_pgs = env->me_free_pgs;
2575 txn->mt_free_pgs[0] = 0;
2576 txn->mt_spill_pgs = NULL;
2578 memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
2581 /* Copy the DB info and flags */
2582 memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
2584 /* Moved to here to avoid a data race in read TXNs */
2585 txn->mt_next_pgno = meta->mm_last_pg+1;
2587 for (i=2; i<txn->mt_numdbs; i++) {
2588 x = env->me_dbflags[i];
2589 txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
2590 txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0;
2592 txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID;
/* The map is too small for this snapshot: back out and tell the caller
 * to call mdb_env_set_mapsize(). */
2594 if (env->me_maxpg < txn->mt_next_pgno) {
2595 mdb_txn_reset0(txn, "renew0-mapfail");
2597 txn->mt_u.reader->mr_pid = 0;
2598 txn->mt_u.reader = NULL;
2600 return MDB_MAP_RESIZED;
/* Public: renew a previously-reset read-only txn via mdb_txn_renew0.
 * NOTE(review): elided excerpt; the error returns are not shown. */
2607 mdb_txn_renew(MDB_txn *txn)
/* A reset txn is recognizable by mt_dbxs == NULL (set in reset0). */
2611 if (!txn || txn->mt_dbxs) /* A reset txn has mt_dbxs==NULL */
2614 if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
2615 DPUTS("environment had fatal error, must shutdown!");
2619 rc = mdb_txn_renew0(txn);
2620 if (rc == MDB_SUCCESS) {
2621 DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2622 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2623 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
/* Public: begin a txn.  Write txns reuse env->me_txn0; read txns and
 * nested txns allocate their own MDB_txn with trailing DB tables.
 * Nested txns additionally shadow the parent's dirty/free state and
 * cursors.  NOTE(review): elided excerpt; several branches and the
 * write-mutex acquisition are not visible here.
 */
2629 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2633 int rc, size, tsize = sizeof(MDB_txn);
2635 if (env->me_flags & MDB_FATAL_ERROR) {
2636 DPUTS("environment had fatal error, must shutdown!");
2639 if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY))
2642 /* Nested transactions: Max 1 child, write txns only, no writemap */
2643 if (parent->mt_child ||
2644 (flags & MDB_RDONLY) ||
2645 (parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) ||
2646 (env->me_flags & MDB_WRITEMAP))
2648 return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
/* Nested txns allocate the larger MDB_ntxn (carries saved pgstate). */
2650 tsize = sizeof(MDB_ntxn);
2653 if (!(flags & MDB_RDONLY)) {
2655 txn = env->me_txn0; /* just reuse preallocated write txn */
2658 /* child txns use own copy of cursors */
2659 size += env->me_maxdbs * sizeof(MDB_cursor *);
/* +1 per DB for the mt_dbflags byte array appended after mt_dbs. */
2661 size += env->me_maxdbs * (sizeof(MDB_db)+1);
2663 if ((txn = calloc(1, size)) == NULL) {
2664 DPRINTF(("calloc: %s", strerror(errno)));
/* Carve the trailing tables out of the single allocation. */
2667 txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
2668 if (flags & MDB_RDONLY) {
2669 txn->mt_flags |= MDB_TXN_RDONLY;
2670 txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
2671 txn->mt_dbiseqs = env->me_dbiseqs;
2673 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
2675 txn->mt_dbiseqs = parent->mt_dbiseqs;
2676 txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
2678 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
2679 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
/* Nested txn: give the child its own dirty list and free-pgs IDL. */
2687 txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
2688 if (!txn->mt_u.dirty_list ||
2689 !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
2691 free(txn->mt_u.dirty_list);
2695 txn->mt_txnid = parent->mt_txnid;
2696 txn->mt_dirty_room = parent->mt_dirty_room;
2697 txn->mt_u.dirty_list[0].mid = 0;
2698 txn->mt_spill_pgs = NULL;
2699 txn->mt_next_pgno = parent->mt_next_pgno;
2700 parent->mt_child = txn;
2701 txn->mt_parent = parent;
2702 txn->mt_numdbs = parent->mt_numdbs;
2703 txn->mt_flags = parent->mt_flags;
2704 txn->mt_dbxs = parent->mt_dbxs;
2705 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
2706 /* Copy parent's mt_dbflags, but clear DB_NEW */
2707 for (i=0; i<txn->mt_numdbs; i++)
2708 txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
2710 ntxn = (MDB_ntxn *)txn;
2711 ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
/* The child works on a copy of me_pghead; the parent's copy is
 * restored from mnt_pgstate at child commit/abort. */
2712 if (env->me_pghead) {
2713 size = MDB_IDL_SIZEOF(env->me_pghead);
2714 env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
2716 memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
2721 rc = mdb_cursor_shadow(parent, txn);
2723 mdb_txn_reset0(txn, "beginchild-fail");
2725 rc = mdb_txn_renew0(txn);
2728 if (txn != env->me_txn0)
2732 DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2733 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2734 (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
/* Accessor for a txn's environment; NULL-safe.
 * NOTE(review): the non-NULL return statement is elided in this
 * excerpt — presumably it returns txn->mt_env; confirm in full source. */
2741 mdb_txn_env(MDB_txn *txn)
2743 if(!txn) return NULL;
2747 /** Export or close DBI handles opened in this txn. */
/* Export DBI handles newly opened in this txn to the env (keep != 0),
 * or discard them, freeing their names and bumping me_dbiseqs so stale
 * handles can be detected.  NOTE(review): elided excerpt.
 */
2749 mdb_dbis_update(MDB_txn *txn, int keep)
2752 MDB_dbi n = txn->mt_numdbs;
2753 MDB_env *env = txn->mt_env;
2754 unsigned char *tdbflags = txn->mt_dbflags;
/* Slots 0/1 are the freeDB and main DB; only user DBs are updated. */
2756 for (i = n; --i >= 2;) {
2757 if (tdbflags[i] & DB_NEW) {
2759 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
2761 char *ptr = env->me_dbxs[i].md_name.mv_data;
2763 env->me_dbxs[i].md_name.mv_data = NULL;
2764 env->me_dbxs[i].md_name.mv_size = 0;
2765 env->me_dbflags[i] = 0;
/* Bump the sequence so cached MDB_dbi values become invalid. */
2766 env->me_dbiseqs[i]++;
2772 if (keep && env->me_numdbs < n)
2776 /** Common code for #mdb_txn_reset() and #mdb_txn_abort().
2777 * May be called twice for readonly txns: First reset it, then abort.
2778 * @param[in] txn the transaction handle to reset
2779 * @param[in] act why the transaction is being reset
/* Tear down a txn (see doc comment above).  Read txns release their
 * reader slot; write txns close cursors, free page lists, and either
 * release the write mutex (top-level) or restore parent state (nested).
 * NOTE(review): elided excerpt; several lines are missing.
 */
2782 mdb_txn_reset0(MDB_txn *txn, const char *act)
2784 MDB_env *env = txn->mt_env;
2786 /* Close any DBI handles opened in this txn */
2787 mdb_dbis_update(txn, 0);
2789 DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2790 act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2791 (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));
2793 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
2794 if (txn->mt_u.reader) {
/* mr_txnid = -1 marks the slot unused (but still owned by this pid). */
2795 txn->mt_u.reader->mr_txnid = (txnid_t)-1;
2796 if (!(env->me_flags & MDB_NOTLS))
2797 txn->mt_u.reader = NULL; /* txn does not own reader */
2799 txn->mt_numdbs = 0; /* close nothing if called again */
2800 txn->mt_dbxs = NULL; /* mark txn as reset */
2802 pgno_t *pghead = env->me_pghead;
2804 mdb_cursors_close(txn, 0);
2805 if (!(env->me_flags & MDB_WRITEMAP)) {
2806 mdb_dlist_free(txn);
2809 if (!txn->mt_parent) {
/* Top-level write txn: return the (possibly shrunk) free-pgs IDL to
 * the env and drop me_pghead; the mutex below was taken in begin. */
2810 if (mdb_midl_shrink(&txn->mt_free_pgs))
2811 env->me_free_pgs = txn->mt_free_pgs;
2813 env->me_pghead = NULL;
2817 /* The writer mutex was locked in mdb_txn_begin. */
2819 UNLOCK_MUTEX_W(env);
/* Nested txn: restore the parent's saved page state and free the
 * child's private lists. */
2821 txn->mt_parent->mt_child = NULL;
2822 env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
2823 mdb_midl_free(txn->mt_free_pgs);
2824 mdb_midl_free(txn->mt_spill_pgs);
2825 free(txn->mt_u.dirty_list);
2828 mdb_midl_free(pghead);
/* Public: reset a read-only txn so it can be renewed later.
 * NOTE(review): elided excerpt; the NULL check / return are missing. */
2833 mdb_txn_reset(MDB_txn *txn)
2838 /* This call is only valid for read-only txns */
2839 if (!(txn->mt_flags & MDB_TXN_RDONLY))
2842 mdb_txn_reset0(txn, "reset");
/* Public: abort a txn (and recursively any child), release its reader
 * slot, and free the txn unless it is the env's preallocated write txn.
 * NOTE(review): elided excerpt; NULL check and free() call are missing. */
2846 mdb_txn_abort(MDB_txn *txn)
2852 mdb_txn_abort(txn->mt_child);
2854 mdb_txn_reset0(txn, "abort");
2855 /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */
2856 if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader)
2857 txn->mt_u.reader->mr_pid = 0;
/* me_txn0 is owned by the env and must not be freed here. */
2859 if (txn != txn->mt_env->me_txn0)
2863 /** Save the freelist as of this transaction to the freeDB.
2864 * This changes the freelist. Keep trying until it stabilizes.
/* Persist this txn's freed pages and the remaining me_pghead into the
 * freeDB (see doc comment above).  Because writing to the freeDB can
 * itself free/allocate pages, the function loops until the lists
 * stabilize, first reserving space and then filling the reservations.
 * NOTE(review): elided excerpt; loop heads and several branches are
 * missing from this view.
 */
2867 mdb_freelist_save(MDB_txn *txn)
2869 /* env->me_pghead[] can grow and shrink during this call.
2870 * env->me_pglast and txn->mt_free_pgs[] can only grow.
2871 * Page numbers cannot disappear from txn->mt_free_pgs[].
2874 MDB_env *env = txn->mt_env;
2875 int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
2876 txnid_t pglast = 0, head_id = 0;
2877 pgno_t freecnt = 0, *free_pgs, *mop;
2878 ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
2880 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
2882 if (env->me_pghead) {
2883 /* Make sure first page of freeDB is touched and on freelist */
2884 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
2885 if (rc && rc != MDB_NOTFOUND)
2889 if (!env->me_pghead && txn->mt_loose_pgs) {
2890 /* Put loose page numbers in mt_free_pgs, since
2891 * we may be unable to return them to me_pghead.
2893 MDB_page *mp = txn->mt_loose_pgs;
2894 if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
2896 for (; mp; mp = NEXT_LOOSE_PAGE(mp))
2897 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2898 txn->mt_loose_pgs = NULL;
2899 txn->mt_loose_count = 0;
2902 /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2903 clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2904 ? SSIZE_MAX : maxfree_1pg;
2907 /* Come back here after each Put() in case freelist changed */
2912 /* If using records from freeDB which we have not yet
2913 * deleted, delete them and any we reserved for me_pghead.
2915 while (pglast < env->me_pglast) {
2916 rc = mdb_cursor_first(&mc, &key, NULL);
2919 pglast = head_id = *(txnid_t *)key.mv_data;
2920 total_room = head_room = 0;
2921 mdb_tassert(txn, pglast <= env->me_pglast);
/* Deleting the record may itself free pages — hence the outer retry. */
2922 rc = mdb_cursor_del(&mc, 0);
2927 /* Save the IDL of pages freed by this txn, to a single record */
2928 if (freecnt < txn->mt_free_pgs[0]) {
2930 /* Make sure last page of freeDB is touched and on freelist */
2931 rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
2932 if (rc && rc != MDB_NOTFOUND)
2935 free_pgs = txn->mt_free_pgs;
2936 /* Write to last page of freeDB */
2937 key.mv_size = sizeof(txn->mt_txnid);
2938 key.mv_data = &txn->mt_txnid;
2940 freecnt = free_pgs[0];
2941 data.mv_size = MDB_IDL_SIZEOF(free_pgs);
/* MDB_RESERVE: get space now, fill in the sorted IDL below. */
2942 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
2945 /* Retry if mt_free_pgs[] grew during the Put() */
2946 free_pgs = txn->mt_free_pgs;
2947 } while (freecnt < free_pgs[0]);
2948 mdb_midl_sort(free_pgs);
2949 memcpy(data.mv_data, free_pgs, data.mv_size);
2952 unsigned int i = free_pgs[0];
2953 DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
2954 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
2956 DPRINTF(("IDL %"Z"u", free_pgs[i]));
2962 mop = env->me_pghead;
2963 mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;
2965 /* Reserve records for me_pghead[]. Split it if multi-page,
2966 * to avoid searching freeDB for a page range. Use keys in
2967 * range [1,me_pglast]: Smaller than txnid of oldest reader.
2969 if (total_room >= mop_len) {
2970 if (total_room == mop_len || --more < 0)
2972 } else if (head_room >= maxfree_1pg && head_id > 1) {
2973 /* Keep current record (overflow page), add a new one */
2977 /* (Re)write {key = head_id, IDL length = head_room} */
2978 total_room -= head_room;
2979 head_room = mop_len - total_room;
2980 if (head_room > maxfree_1pg && head_id > 1) {
2981 /* Overflow multi-page for part of me_pghead */
2982 head_room /= head_id; /* amortize page sizes */
2983 head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
2984 } else if (head_room < 0) {
2985 /* Rare case, not bothering to delete this record */
2988 key.mv_size = sizeof(head_id);
2989 key.mv_data = &head_id;
2990 data.mv_size = (head_room + 1) * sizeof(pgno_t);
2991 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
2994 /* IDL is initially empty, zero out at least the length */
2995 pgs = (pgno_t *)data.mv_data;
2996 j = head_room > clean_limit ? head_room : 0;
3000 total_room += head_room;
3003 /* Return loose page numbers to me_pghead, though usually none are
3004 * left at this point. The pages themselves remain in dirty_list.
3006 if (txn->mt_loose_pgs) {
3007 MDB_page *mp = txn->mt_loose_pgs;
3008 unsigned count = txn->mt_loose_count;
3010 /* Room for loose pages + temp IDL with same */
3011 if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
/* Build the temp IDL at the tail of me_pghead's allocation, sort,
 * then merge it into the head. */
3013 mop = env->me_pghead;
3014 loose = mop + MDB_IDL_ALLOCLEN(mop) - count;
3015 for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
3016 loose[ ++count ] = mp->mp_pgno;
3018 mdb_midl_sort(loose);
3019 mdb_midl_xmerge(mop, loose);
3020 txn->mt_loose_pgs = NULL;
3021 txn->mt_loose_count = 0;
3025 /* Fill in the reserved me_pghead records */
3031 rc = mdb_cursor_first(&mc, &key, &data);
3032 for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
3033 txnid_t id = *(txnid_t *)key.mv_data;
3034 ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
3037 mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
3039 if (len > mop_len) {
3041 data.mv_size = (len + 1) * sizeof(MDB_ID);
/* Consume mop from the tail downward into each reserved record. */
3043 data.mv_data = mop -= len;
3046 rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT);
3048 if (rc || !(mop_len -= len))
3055 /** Flush (some) dirty pages to the map, after clearing their dirty flag.
3056 * @param[in] txn the transaction that's being committed
3057 * @param[in] keep number of initial pages in dirty_list to keep dirty.
3058 * @return 0 on success, non-zero on failure.
/* Write dirty pages to the data file (see doc comment above).  With
 * MDB_WRITEMAP only the dirty flags are cleared; otherwise pages are
 * written with pwritev/writev/pwrite (or WriteFile on Windows),
 * batching runs of consecutive file positions.  Pages flagged
 * P_LOOSE/P_KEEP are skipped and compacted to the front of the list.
 * NOTE(review): elided excerpt; loop heads and cleanup are missing.
 */
3061 mdb_page_flush(MDB_txn *txn, int keep)
3063 MDB_env *env = txn->mt_env;
3064 MDB_ID2L dl = txn->mt_u.dirty_list;
3065 unsigned psize = env->me_psize, j;
3066 int i, pagecount = dl[0].mid, rc;
3067 size_t size = 0, pos = 0;
3069 MDB_page *dp = NULL;
3073 struct iovec iov[MDB_COMMIT_PAGES];
3074 ssize_t wpos = 0, wsize = 0, wres;
3075 size_t next_pos = 1; /* impossible pos, so pos != next_pos */
3081 if (env->me_flags & MDB_WRITEMAP) {
3082 /* Clear dirty flags */
3083 while (++i <= pagecount) {
3085 /* Don't flush this page yet */
3086 if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3087 dp->mp_flags &= ~P_KEEP;
3091 dp->mp_flags &= ~P_DIRTY;
3096 /* Write the pages */
3098 if (++i <= pagecount) {
3100 /* Don't flush this page yet */
3101 if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3102 dp->mp_flags &= ~P_KEEP;
3107 /* clear dirty flag */
3108 dp->mp_flags &= ~P_DIRTY;
/* Overflow pages span mp_pages consecutive pages. */
3111 if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
3116 /* Windows actually supports scatter/gather I/O, but only on
3117 * unbuffered file handles. Since we're relying on the OS page
3118 * cache for all our data, that's self-defeating. So we just
3119 * write pages one at a time. We use the ov structure to set
3120 * the write offset, to at least save the overhead of a Seek
3123 DPRINTF(("committing page %"Z"u", pgno));
3124 memset(&ov, 0, sizeof(ov));
3125 ov.Offset = pos & 0xffffffff;
/* >>16>>16 instead of >>32 avoids UB when size_t is 32-bit. */
3126 ov.OffsetHigh = pos >> 16 >> 16;
3127 if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
3129 DPRINTF(("WriteFile: %d", rc));
3133 /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
/* Flush the accumulated iov batch when the run breaks, the iov is
 * full, or the batch would exceed MAX_WRITE. */
3134 if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
3136 /* Write previous page(s) */
3137 #ifdef MDB_USE_PWRITEV
3138 wres = pwritev(env->me_fd, iov, n, wpos);
3141 wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
3143 if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
3145 DPRINTF(("lseek: %s", strerror(rc)));
3148 wres = writev(env->me_fd, iov, n);
3151 if (wres != wsize) {
3154 DPRINTF(("Write error: %s", strerror(rc)));
3156 rc = EIO; /* TODO: Use which error code? */
3157 DPUTS("short write, filesystem full?");
3168 DPRINTF(("committing page %"Z"u", pgno));
3169 next_pos = pos + size;
3170 iov[n].iov_len = size;
3171 iov[n].iov_base = (char *)dp;
3177 /* MIPS has cache coherency issues, this is a no-op everywhere else
3178 * Note: for any size >= on-chip cache size, entire on-chip cache is
3181 CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
/* Compact the skipped (kept/loose) pages to the front of dl and free
 * the written shadow pages. */
3183 for (i = keep; ++i <= pagecount; ) {
3185 /* This is a page we skipped above */
3188 dl[j].mid = dp->mp_pgno;
3191 mdb_dpage_free(env, dp);
3196 txn->mt_dirty_room += i - j;
/* Public: commit a txn.  Read txns just release their slot.  Nested
 * txns merge their free/dirty/spill/loose lists and DB state into the
 * parent.  A top-level write txn updates sub-DB roots, saves the
 * freelist, flushes dirty pages, syncs, and writes the meta page.
 * NOTE(review): elided excerpt; many lines (error paths, loop heads)
 * are missing from this view.
 */
3202 mdb_txn_commit(MDB_txn *txn)
3208 if (txn == NULL || txn->mt_env == NULL)
/* Children must commit before the parent. */
3211 if (txn->mt_child) {
3212 rc = mdb_txn_commit(txn->mt_child);
3213 txn->mt_child = NULL;
3220 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
3221 mdb_dbis_update(txn, 1);
3222 txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */
3227 if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
3228 DPUTS("error flag is set, can't commit");
3230 txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
3235 if (txn->mt_parent) {
3236 MDB_txn *parent = txn->mt_parent;
3240 unsigned x, y, len, ps_len;
3242 /* Append our free list to parent's */
3243 rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
3246 mdb_midl_free(txn->mt_free_pgs);
3247 /* Failures after this must either undo the changes
3248 * to the parent or set MDB_TXN_ERROR in the parent.
3251 parent->mt_next_pgno = txn->mt_next_pgno;
3252 parent->mt_flags = txn->mt_flags;
3254 /* Merge our cursors into parent's and close them */
3255 mdb_cursors_close(txn, 1);
3257 /* Update parent's DB table. */
3258 memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
3259 parent->mt_numdbs = txn->mt_numdbs;
3260 parent->mt_dbflags[0] = txn->mt_dbflags[0];
3261 parent->mt_dbflags[1] = txn->mt_dbflags[1];
3262 for (i=2; i<txn->mt_numdbs; i++) {
3263 /* preserve parent's DB_NEW status */
3264 x = parent->mt_dbflags[i] & DB_NEW;
3265 parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
3268 dst = parent->mt_u.dirty_list;
3269 src = txn->mt_u.dirty_list;
3270 /* Remove anything in our dirty list from parent's spill list */
3271 if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
/* Sentinel so the inner while below always terminates. */
3273 pspill[0] = (pgno_t)-1;
3274 /* Mark our dirty pages as deleted in parent spill list */
3275 for (i=0, len=src[0].mid; ++i <= len; ) {
3276 MDB_ID pn = src[i].mid << 1;
3277 while (pn > pspill[x])
3279 if (pn == pspill[x]) {
3284 /* Squash deleted pagenums if we deleted any */
3285 for (x=y; ++x <= ps_len; )
3286 if (!(pspill[x] & 1))
3287 pspill[++y] = pspill[x];
3291 /* Find len = length of merging our dirty list with parent's */
3293 dst[0].mid = 0; /* simplify loops */
3294 if (parent->mt_parent) {
/* Grandparent case: count real overlaps to size the merge exactly. */
3295 len = x + src[0].mid;
3296 y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
3297 for (i = x; y && i; y--) {
3298 pgno_t yp = src[y].mid;
3299 while (yp < dst[i].mid)
3301 if (yp == dst[i].mid) {
3306 } else { /* Simplify the above for single-ancestor case */
3307 len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
3309 /* Merge our dirty list with parent's */
/* Backward merge; a parent page shadowed by the child is freed and
 * replaced by the child's copy. */
3311 for (i = len; y; dst[i--] = src[y--]) {
3312 pgno_t yp = src[y].mid;
3313 while (yp < dst[x].mid)
3314 dst[i--] = dst[x--];
3315 if (yp == dst[x].mid)
3316 free(dst[x--].mptr);
3318 mdb_tassert(txn, i == x);
3320 free(txn->mt_u.dirty_list);
3321 parent->mt_dirty_room = txn->mt_dirty_room;
3322 if (txn->mt_spill_pgs) {
3323 if (parent->mt_spill_pgs) {
3324 /* TODO: Prevent failure here, so parent does not fail */
3325 rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
3327 parent->mt_flags |= MDB_TXN_ERROR;
3328 mdb_midl_free(txn->mt_spill_pgs);
3329 mdb_midl_sort(parent->mt_spill_pgs);
3331 parent->mt_spill_pgs = txn->mt_spill_pgs;
3335 /* Append our loose page list to parent's */
3336 for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp))
3338 *lp = txn->mt_loose_pgs;
3339 parent->mt_loose_count += txn->mt_loose_count;
3341 parent->mt_child = NULL;
3342 mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
3347 if (txn != env->me_txn) {
3348 DPUTS("attempt to commit unknown transaction");
3353 mdb_cursors_close(txn, 0);
/* Nothing dirtied or spilled: skip the write/sync/meta phase. */
3355 if (!txn->mt_u.dirty_list[0].mid &&
3356 !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS)))
3359 DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
3360 txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));
3362 /* Update DB root pointers */
3363 if (txn->mt_numdbs > 2) {
3367 data.mv_size = sizeof(MDB_db);
3369 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
3370 for (i = 2; i < txn->mt_numdbs; i++) {
3371 if (txn->mt_dbflags[i] & DB_DIRTY) {
/* Stale DBI handle (sequence changed since open): fail the commit. */
3372 if (TXN_DBI_CHANGED(txn, i)) {
3376 data.mv_data = &txn->mt_dbs[i];
3377 rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
3384 rc = mdb_freelist_save(txn);
3388 mdb_midl_free(env->me_pghead);
3389 env->me_pghead = NULL;
3390 if (mdb_midl_shrink(&txn->mt_free_pgs))
3391 env->me_free_pgs = txn->mt_free_pgs;
3398 #ifdef HAVE_FDATASYNC
/* File grew: remember the new size (relates to the FGREW full-fsync
 * note earlier in this file). */
3399 if (txn->mt_next_pgno * env->me_psize > env->me_size) {
3401 env->me_size = txn->mt_next_pgno * env->me_psize;
/* The critical ordering: data pages, then sync, then meta page. */
3404 if ((rc = mdb_page_flush(txn, 0)) ||
3405 (rc = mdb_env_sync0(env, i)) ||
3406 (rc = mdb_env_write_meta(txn)))
3409 /* Free P_LOOSE pages left behind in dirty_list */
3410 if (!(env->me_flags & MDB_WRITEMAP))
3411 mdb_dlist_free(txn);
3416 mdb_dbis_update(txn, 1);
3419 UNLOCK_MUTEX_W(env);
3420 if (txn != env->me_txn0)
3430 /** Read the environment parameters of a DB environment before
3431 * mapping it into memory.
3432 * @param[in] env the environment handle
3433 * @param[out] meta address of where to store the meta information
3434 * @return 0 on success, non-zero on failure.
3437 mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3443 enum { Size = sizeof(pbuf) };
3445 /* We don't know the page size yet, so use a minimum value.
3446 * Read both meta pages so we can use the latest one.
3449 for (i=off=0; i<2; i++, off = meta->mm_psize) {
/* Win32 path: emulate pread() via an OVERLAPPED offset. */
3453 memset(&ov, 0, sizeof(ov));
3455 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
3456 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
3459 rc = pread(env->me_fd, &pbuf, Size, off);
/* EOF at offset 0 means the file is empty: brand-new environment. */
3462 if (rc == 0 && off == 0)
3464 rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
3465 DPRINTF(("read: %s", mdb_strerror(rc)));
3469 p = (MDB_page *)&pbuf;
/* Validate page flag, magic, and on-disk format version in turn. */
3471 if (!F_ISSET(p->mp_flags, P_META)) {
3472 DPRINTF(("page %"Z"u not a meta page", p->mp_pgno));
3477 if (m->mm_magic != MDB_MAGIC) {
3478 DPUTS("meta has invalid magic");
3482 if (m->mm_version != MDB_DATA_VERSION) {
3483 DPRINTF(("database is version %u, expected version %u",
3484 m->mm_version, MDB_DATA_VERSION));
3485 return MDB_VERSION_MISMATCH;
/* Keep whichever of the two meta pages carries the larger txnid. */
3488 if (off == 0 || m->mm_txnid > meta->mm_txnid)
3495 mdb_env_init_meta0(MDB_env *env, MDB_meta *meta)
3497 meta->mm_magic = MDB_MAGIC;
3498 meta->mm_version = MDB_DATA_VERSION;
3499 meta->mm_mapsize = env->me_mapsize;
3500 meta->mm_psize = env->me_psize;
3501 meta->mm_last_pg = 1;
3502 meta->mm_flags = env->me_flags & 0xffff;
3503 meta->mm_flags |= MDB_INTEGERKEY;
3504 meta->mm_dbs[0].md_root = P_INVALID;
3505 meta->mm_dbs[1].md_root = P_INVALID;
3508 /** Write the environment parameters of a freshly created DB environment.
3509 * @param[in] env the environment handle
3510 * @param[out] meta address of where to store the meta information
3511 * @return 0 on success, non-zero on failure.
3514 mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
/* Portable positional-write helper: OVERLAPPED WriteFile on Win32,
 * pwrite() elsewhere. rc is a success flag, len the byte count. */
3522 memset(&ov, 0, sizeof(ov));
3523 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
3525 rc = WriteFile(fd, ptr, size, &len, &ov); } while(0)
3528 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
3529 len = pwrite(fd, ptr, size, pos); \
3530 rc = (len >= 0); } while(0)
3533 DPUTS("writing new meta page");
3535 psize = env->me_psize;
3537 mdb_env_init_meta0(env, meta);
/* Both meta pages are built in one zeroed buffer and written with a
 * single positional write of two pages at offset 0. */
3539 p = calloc(2, psize);
3541 p->mp_flags = P_META;
3542 *(MDB_meta *)METADATA(p) = *meta;
3544 q = (MDB_page *)((char *)p + psize);
3546 q->mp_flags = P_META;
3547 *(MDB_meta *)METADATA(q) = *meta;
3549 DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0);
/* A short write (len != 2 pages) is treated as failure. */
3552 else if ((unsigned) len == psize * 2)
3560 /** Update the environment info to commit a transaction.
3561 * @param[in] txn the transaction that's being committed
3562 * @return 0 on success, non-zero on failure.
3565 mdb_env_write_meta(MDB_txn *txn)
3568 MDB_meta meta, metab, *mp;
3571 int rc, len, toggle;
/* Even txnids commit to meta page 0, odd ones to meta page 1. */
3580 toggle = txn->mt_txnid & 1;
3581 DPRINTF(("writing meta page %d for root page %"Z"u",
3582 toggle, txn->mt_dbs[MAIN_DBI].md_root));
3585 mp = env->me_metas[toggle];
3586 mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
3587 /* Persist any increases of mapsize config */
3588 if (mapsize < env->me_mapsize)
3589 mapsize = env->me_mapsize;
/* WRITEMAP path: update the mapped meta page directly, then msync. */
3591 if (env->me_flags & MDB_WRITEMAP) {
3592 mp->mm_mapsize = mapsize;
3593 mp->mm_dbs[0] = txn->mt_dbs[0];
3594 mp->mm_dbs[1] = txn->mt_dbs[1];
3595 mp->mm_last_pg = txn->mt_next_pgno - 1;
3596 mp->mm_txnid = txn->mt_txnid;
3597 if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
3598 unsigned meta_size = env->me_psize;
3599 rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
3602 #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
3603 if (meta_size < env->me_os_psize)
3604 meta_size += meta_size;
3609 if (MDB_MSYNC(ptr, meta_size, rc)) {
/* Non-WRITEMAP path: build the new meta in a local struct and pwrite it.
 * metab keeps the old values for the failure-recovery write below. */
3616 metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
3617 metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
3619 meta.mm_mapsize = mapsize;
3620 meta.mm_dbs[0] = txn->mt_dbs[0];
3621 meta.mm_dbs[1] = txn->mt_dbs[1];
3622 meta.mm_last_pg = txn->mt_next_pgno - 1;
3623 meta.mm_txnid = txn->mt_txnid;
/* Only the tail of the meta page (from mm_mapsize on) is rewritten. */
3625 off = offsetof(MDB_meta, mm_mapsize);
3626 ptr = (char *)&meta + off;
3627 len = sizeof(MDB_meta) - off;
3629 off += env->me_psize;
3632 /* Write to the SYNC fd */
3633 mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ?
3634 env->me_fd : env->me_mfd;
3637 memset(&ov, 0, sizeof(ov));
3639 if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
3643 rc = pwrite(mfd, ptr, len, off);
3646 rc = rc < 0 ? ErrCode() : EIO;
3647 DPUTS("write failed, disk error?");
3648 /* On a failure, the pagecache still contains the new data.
3649 * Write some old data back, to prevent it from being used.
3650 * Use the non-SYNC fd; we know it will fail anyway.
3652 meta.mm_last_pg = metab.mm_last_pg;
3653 meta.mm_txnid = metab.mm_txnid;
3655 memset(&ov, 0, sizeof(ov));
3657 WriteFile(env->me_fd, ptr, len, NULL, &ov);
3659 r2 = pwrite(env->me_fd, ptr, len, off);
3660 (void)r2; /* Silence warnings. We don't care about pwrite's return value */
3663 env->me_flags |= MDB_FATAL_ERROR;
3666 /* MIPS has cache coherency issues, this is a no-op everywhere else */
3667 CACHEFLUSH(env->me_map + off, len, DCACHE);
3669 /* Memory ordering issues are irrelevant; since the entire writer
3670 * is wrapped by wmutex, all of these changes will become visible
3671 * after the wmutex is unlocked. Since the DB is multi-version,
3672 * readers will get consistent data regardless of how fresh or
3673 * how stale their view of these values is.
3676 env->me_txns->mti_txnid = txn->mt_txnid;
3681 /** Check both meta pages to see which one is newer.
3682 * @param[in] env the environment handle
3683 * @return meta toggle (0 or 1).
3686 mdb_env_pick_meta(const MDB_env *env)
3688 return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid);
/* Allocate and default-initialize an MDB_env handle; elided lines
 * (alloc check, *env assignment, return) are not visible here. */
3692 mdb_env_create(MDB_env **env)
3696 e = calloc(1, sizeof(MDB_env));
3700 e->me_maxreaders = DEFAULT_READERS;
/* Slots 0 and 1 are always the free DB and the main DB. */
3701 e->me_maxdbs = e->me_numdbs = 2;
/* No files are open yet; mark all descriptors invalid. */
3702 e->me_fd = INVALID_HANDLE_VALUE;
3703 e->me_lfd = INVALID_HANDLE_VALUE;
3704 e->me_mfd = INVALID_HANDLE_VALUE;
3705 #ifdef MDB_USE_POSIX_SEM
3706 e->me_rmutex = SEM_FAILED;
3707 e->me_wmutex = SEM_FAILED;
3709 e->me_pid = getpid();
3710 GET_PAGESIZE(e->me_os_psize);
3711 VGMEMP_CREATE(e,0,0);
/* Map the data file into memory and locate the two meta pages.
 * addr is a placement hint used for MDB_FIXEDMAP re-opens. */
3717 mdb_env_map(MDB_env *env, void *addr)
3720 unsigned int flags = env->me_flags;
3724 LONG sizelo, sizehi;
3727 if (flags & MDB_RDONLY) {
3728 /* Don't set explicit map size, use whatever exists */
3733 msize = env->me_mapsize;
3734 sizelo = msize & 0xffffffff;
3735 sizehi = msize >> 16 >> 16; /* only needed on Win64 */
3737 /* Windows won't create mappings for zero-length files
3738 * and won't map more than the file size.
3739 * Just set the maxsize right now.
3741 if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
3742 || !SetEndOfFile(env->me_fd)
3743 || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)
3747 mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
3748 PAGE_READWRITE : PAGE_READONLY,
3749 sizehi, sizelo, NULL);
3752 env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
3753 FILE_MAP_WRITE : FILE_MAP_READ,
3755 rc = env->me_map ? 0 : ErrCode();
/* POSIX path: grow the file for WRITEMAP, then mmap() it. */
3760 int prot = PROT_READ;
3761 if (flags & MDB_WRITEMAP) {
3763 if (ftruncate(env->me_fd, env->me_mapsize) < 0)
3766 env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
3768 if (env->me_map == MAP_FAILED) {
3773 if (flags & MDB_NORDAHEAD) {
3774 /* Turn off readahead. It's harmful when the DB is larger than RAM. */
3776 madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
3778 #ifdef POSIX_MADV_RANDOM
3779 posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
3780 #endif /* POSIX_MADV_RANDOM */
3781 #endif /* MADV_RANDOM */
3785 /* Can happen because the address argument to mmap() is just a
3786 * hint. mmap() can pick another, e.g. if the range is in use.
3787 * The MAP_FIXED flag would prevent that, but then mmap could
3788 * instead unmap existing pages to make room for the new map.
3790 if (addr && env->me_map != addr)
3791 return EBUSY; /* TODO: Make a new MDB_* error code? */
/* Meta pages live in the first two pages of the map. */
3793 p = (MDB_page *)env->me_map;
3794 env->me_metas[0] = METADATA(p);
3795 env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);
3801 mdb_env_set_mapsize(MDB_env *env, size_t size)
3803 /* If env is already open, caller is responsible for making
3804 * sure there are no active txns.
/* size == 0 re-adopts the size recorded in the current meta page. */
3812 size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize;
3813 else if (size < env->me_mapsize) {
3814 /* If the configured size is smaller, make sure it's
3815 * still big enough. Silently round up to minimum if not.
3817 size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize;
/* Already-mapped env: unmap and remap at the new size, keeping the
 * old address only under MDB_FIXEDMAP. */
3821 munmap(env->me_map, env->me_mapsize);
3822 env->me_mapsize = size;
3823 old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
3824 rc = mdb_env_map(env, old);
3828 env->me_mapsize = size;
3830 env->me_maxpg = env->me_mapsize / env->me_psize;
/* Set the number of named DBs; guard/return lines are elided here. */
3835 mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
3839 env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */
3844 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
/* Reject a change once the env is mapped, or a zero reader count. */
3846 if (env->me_map || readers < 1)
3848 env->me_maxreaders = readers;
3853 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
/* Both pointers must be non-NULL. */
3855 if (!env || !readers)
3857 *readers = env->me_maxreaders;
/* Query a file's size into *size; only the Win32 branch is visible
 * here (the POSIX fstat branch is elided). */
3862 mdb_fsize(HANDLE fd, size_t *size)
3865 LARGE_INTEGER fsize;
3867 if (!GetFileSizeEx(fd, &fsize))
3870 *size = fsize.QuadPart;
3882 /** Further setup required for opening an LMDB environment
3885 mdb_env_open2(MDB_env *env)
3887 unsigned int flags = env->me_flags;
3888 int i, newenv = 0, rc;
3892 /* See if we should use QueryLimited */
/* Windows version check: Vista+ (major > 5) supports the limited right. */
3894 if ((rc & 0xff) > 5)
3895 env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION;
3897 env->me_pidquery = PROCESS_QUERY_INFORMATION;
3900 memset(&meta, 0, sizeof(meta));
/* Header read failure here indicates a brand-new environment. */
3902 if ((i = mdb_env_read_header(env, &meta)) != 0) {
3905 DPUTS("new mdbenv");
3907 env->me_psize = env->me_os_psize;
3908 if (env->me_psize > MAX_PAGESIZE)
3909 env->me_psize = MAX_PAGESIZE;
3911 env->me_psize = meta.mm_psize;
3914 /* Was a mapsize configured? */
3915 if (!env->me_mapsize) {
3916 /* If this is a new environment, take the default,
3917 * else use the size recorded in the existing env.
3919 env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
3920 } else if (env->me_mapsize < meta.mm_mapsize) {
3921 /* If the configured size is smaller, make sure it's
3922 * still big enough. Silently round up to minimum if not.
3924 size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
3925 if (env->me_mapsize < minsize)
3926 env->me_mapsize = minsize;
3929 rc = mdb_fsize(env->me_fd, &env->me_size);
3933 rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
/* New environment: record the map address (FIXEDMAP) and write the
 * initial pair of meta pages. */
3938 if (flags & MDB_FIXEDMAP)
3939 meta.mm_address = env->me_map;
3940 i = mdb_env_init_meta(env, &meta);
3941 if (i != MDB_SUCCESS) {
/* Derive per-page capacity limits from the final page size. */
3946 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
3947 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2)
3949 #if !(MDB_MAXKEYSIZE)
3950 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db));
3952 env->me_maxpg = env->me_mapsize / env->me_psize;
/* Debug-only dump of the main DB's stats from the newest meta page. */
3956 int toggle = mdb_env_pick_meta(env);
3957 MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI];
3959 DPRINTF(("opened database version %u, pagesize %u",
3960 env->me_metas[0]->mm_version, env->me_psize));
3961 DPRINTF(("using meta page %d", toggle));
3962 DPRINTF(("depth: %u", db->md_depth));
3963 DPRINTF(("entries: %"Z"u", db->md_entries));
3964 DPRINTF(("branch pages: %"Z"u", db->md_branch_pages));
3965 DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages));
3966 DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages));
3967 DPRINTF(("root: %"Z"u", db->md_root));
3975 /** Release a reader thread's slot in the reader lock table.
3976 * This function is called automatically when a thread exits.
3977 * @param[in] ptr This points to the slot in the reader lock table.
3980 mdb_env_reader_dest(void *ptr)
/* The slot-clearing statement itself is elided from this listing. */
3982 MDB_reader *reader = ptr;
3988 /** Junk for arranging thread-specific callbacks on Windows. This is
3989 * necessarily platform and compiler-specific. Windows supports up
3990 * to 1088 keys. Let's assume nobody opens more than 64 environments
3991 * in a single process, for now. They can override this if needed.
3993 #ifndef MAX_TLS_KEYS
3994 #define MAX_TLS_KEYS 64
/* Registry of per-environment TLS keys, walked on thread detach. */
3996 static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
3997 static int mdb_tls_nkeys;
/* Invoked by the loader for every DLL/thread lifecycle event; only
 * DLL_THREAD_DETACH does real work: release each thread's reader slot. */
3999 static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
4003 case DLL_PROCESS_ATTACH: break;
4004 case DLL_THREAD_ATTACH: break;
4005 case DLL_THREAD_DETACH:
4006 for (i=0; i<mdb_tls_nkeys; i++) {
4007 MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
4009 mdb_env_reader_dest(r);
4013 case DLL_PROCESS_DETACH: break;
/* Register the callback in the PE TLS directory; the mechanism differs
 * per compiler (GCC section attribute vs. MSVC pragmas). */
4018 const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
4020 PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
4024 /* Force some symbol references.
4025 * _tls_used forces the linker to create the TLS directory if not already done
4026 * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
4028 #pragma comment(linker, "/INCLUDE:_tls_used")
4029 #pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
4030 #pragma const_seg(".CRT$XLB")
4031 extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
4032 const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
4035 #pragma comment(linker, "/INCLUDE:__tls_used")
4036 #pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
4037 #pragma data_seg(".CRT$XLB")
4038 PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
4040 #endif /* WIN 32/64 */
4041 #endif /* !__GNUC__ */
4044 /** Downgrade the exclusive lock on the region back to shared */
4046 mdb_env_share_locks(MDB_env *env, int *excl)
4048 int rc = 0, toggle = mdb_env_pick_meta(env);
/* Seed the shared txnid from the newest meta page before readers join. */
4050 env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
4055 /* First acquire a shared lock. The Unlock will
4056 * then release the existing exclusive lock.
4058 memset(&ov, 0, sizeof(ov));
4059 if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
4062 UnlockFile(env->me_lfd, 0, 0, 1, 0);
/* POSIX: an F_RDLCK on the same byte atomically replaces our F_WRLCK. */
4068 struct flock lock_info;
4069 /* The shared lock replaces the existing lock */
4070 memset((void *)&lock_info, 0, sizeof(lock_info));
4071 lock_info.l_type = F_RDLCK;
4072 lock_info.l_whence = SEEK_SET;
4073 lock_info.l_start = 0;
4074 lock_info.l_len = 1;
4075 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
4076 (rc = ErrCode()) == EINTR) ;
4077 *excl = rc ? -1 : 0; /* error may mean we lost the lock */
4084 /** Try to get exclusive lock, otherwise shared.
4085 * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
4088 mdb_env_excl_lock(MDB_env *env, int *excl)
/* Win32: try the exclusive byte-lock first, fall back to shared. */
4092 if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
4096 memset(&ov, 0, sizeof(ov));
4097 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
/* POSIX: non-blocking F_WRLCK attempt on byte 0, retrying on EINTR. */
4104 struct flock lock_info;
4105 memset((void *)&lock_info, 0, sizeof(lock_info));
4106 lock_info.l_type = F_WRLCK;
4107 lock_info.l_whence = SEEK_SET;
4108 lock_info.l_start = 0;
4109 lock_info.l_len = 1;
4110 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
4111 (rc = ErrCode()) == EINTR) ;
4115 # ifdef MDB_USE_POSIX_SEM
4116 if (*excl < 0) /* always true when !MDB_USE_POSIX_SEM */
/* Blocking fall-back: wait for a shared lock instead. */
4119 lock_info.l_type = F_RDLCK;
4120 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
4121 (rc = ErrCode()) == EINTR) ;
4131 * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
4133 * @(#) $Revision: 5.1 $
4134 * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
4135 * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
4137 * http://www.isthe.com/chongo/tech/comp/fnv/index.html
4141 * Please do not copyright this code. This code is in the public domain.
4143 * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
4144 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
4145 * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
4146 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
4147 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
4148 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
4149 * PERFORMANCE OF THIS SOFTWARE.
4152 * chongo <Landon Curt Noll> /\oo/\
4153 * http://www.isthe.com/chongo/
4155 * Share and Enjoy! :-)
4158 typedef unsigned long long mdb_hash_t;
4159 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
4161 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
4162 * @param[in] val value to hash
4163 * @param[in] hval initial value for hash
4164 * @return 64 bit hash
4166 * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
4167 * hval arg on the first call.
4170 mdb_hash_val(MDB_val *val, mdb_hash_t hval)
4172 unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */
4173 unsigned char *end = s + val->mv_size;
4175 * FNV-1a hash each octet of the string
4178 /* xor the bottom with the current octet */
4179 hval ^= (mdb_hash_t)*s++;
4181 /* multiply by the 64 bit FNV magic prime mod 2^64 */
4182 hval += (hval << 1) + (hval << 4) + (hval << 5) +
4183 (hval << 7) + (hval << 8) + (hval << 40);
4185 /* return our new hash value */
4189 /** Hash the string and output the encoded hash.
4190 * This uses modified RFC1924 Ascii85 encoding to accommodate systems with
4191 * very short name limits. We don't care about the encoding being reversible,
4192 * we just want to preserve as many bits of the input as possible in a
4193 * small printable string.
4194 * @param[in] str string to hash
4195 * @param[out] encbuf an array of 11 chars to hold the hash
/** Modified RFC1924 Ascii85 alphabet used to encode lock-name hashes. */
static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";

/** Encode the low 32 bits of \b l as 5 base-85 digits, least
 * significant digit first.
 * @param[in] l value to encode (5 digits cover 85^5 > 2^32)
 * @param[out] out buffer receiving exactly 5 chars; no NUL is written
 */
static void
mdb_pack85(unsigned long l, char *out)
{
	int i;

	for (i=0; i<5; i++) {
		*out++ = mdb_a85[l % 85];
		l /= 85;	/* advance to the next base-85 digit */
	}
}
4211 mdb_hash_enc(MDB_val *val, char *encbuf)
4213 mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
4215 mdb_pack85(h, encbuf);
4216 mdb_pack85(h>>32, encbuf+5);
4221 /** Open and/or initialize the lock region for the environment.
4222 * @param[in] env The LMDB environment.
4223 * @param[in] lpath The pathname of the file used for the lock region.
4224 * @param[in] mode The Unix permissions for the file, if we create it.
4225 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
4226 * @return 0 on success, non-zero on failure.
4229 mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
4232 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
4234 # define MDB_ERRCODE_ROFS EROFS
4235 #ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */
4236 # define MDB_CLOEXEC O_CLOEXEC
4239 # define MDB_CLOEXEC 0
/* Open (or create) the lock file itself. */
4246 env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
4247 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
4248 FILE_ATTRIBUTE_NORMAL, NULL);
4250 env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode);
4252 if (env->me_lfd == INVALID_HANDLE_VALUE) {
/* A read-only filesystem is tolerated for MDB_RDONLY environments. */
4254 if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
4259 #if ! ((MDB_CLOEXEC) || defined(_WIN32))
4260 /* Lose record locks when exec*() */
4261 if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
4262 fcntl(env->me_lfd, F_SETFD, fdflags);
/* Per-thread reader-slot key, unless the caller opted out with NOTLS. */
4265 if (!(env->me_flags & MDB_NOTLS)) {
4266 rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
4269 env->me_flags |= MDB_ENV_TXKEY;
4271 /* Windows TLS callbacks need help finding their TLS info. */
4272 if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
4276 mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
4280 /* Try to get exclusive lock. If we succeed, then
4281 * nobody is using the lock region and we should initialize it.
4283 if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
4286 size = GetFileSize(env->me_lfd, NULL);
4288 size = lseek(env->me_lfd, 0, SEEK_END);
4289 if (size == -1) goto fail_errno;
/* Grow the lock file to fit the reader table, but only if we hold
 * the region exclusively. */
4291 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
4292 if (size < rsize && *excl > 0) {
4294 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
4295 || !SetEndOfFile(env->me_lfd))
4298 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
/* Existing region: derive maxreaders from the file's actual size. */
4302 size = rsize - sizeof(MDB_txninfo);
4303 env->me_maxreaders = size/sizeof(MDB_reader) + 1;
/* Map the lock region read-write. */
4308 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
4310 if (!mh) goto fail_errno;
4311 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
4313 if (!env->me_txns) goto fail_errno;
4315 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
4317 if (m == MAP_FAILED) goto fail_errno;
/* Exclusive holder initializes the region: name and create the
 * reader/writer mutexes from a hash of the lock file's identity. */
4323 BY_HANDLE_FILE_INFORMATION stbuf;
4332 if (!mdb_sec_inited) {
4333 InitializeSecurityDescriptor(&mdb_null_sd,
4334 SECURITY_DESCRIPTOR_REVISION);
4335 SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
4336 mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
4337 mdb_all_sa.bInheritHandle = FALSE;
4338 mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
4341 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
4342 idbuf.volume = stbuf.dwVolumeSerialNumber;
4343 idbuf.nhigh = stbuf.nFileIndexHigh;
4344 idbuf.nlow = stbuf.nFileIndexLow;
4345 val.mv_data = &idbuf;
4346 val.mv_size = sizeof(idbuf);
4347 mdb_hash_enc(&val, encbuf);
4348 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
4349 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
4350 env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
4351 if (!env->me_rmutex) goto fail_errno;
4352 env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
4353 if (!env->me_wmutex) goto fail_errno;
4354 #elif defined(MDB_USE_POSIX_SEM)
4363 #if defined(__NetBSD__)
4364 #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */
4366 if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
4367 idbuf.dev = stbuf.st_dev;
4368 idbuf.ino = stbuf.st_ino;
4369 val.mv_data = &idbuf;
4370 val.mv_size = sizeof(idbuf);
4371 mdb_hash_enc(&val, encbuf);
4372 #ifdef MDB_SHORT_SEMNAMES
4373 encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */
4375 sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf);
4376 sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf);
4377 /* Clean up after a previous run, if needed: Try to
4378 * remove both semaphores before doing anything else.
4380 sem_unlink(env->me_txns->mti_rmname);
4381 sem_unlink(env->me_txns->mti_wmname);
4382 env->me_rmutex = sem_open(env->me_txns->mti_rmname,
4383 O_CREAT|O_EXCL, mode, 1);
4384 if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4385 env->me_wmutex = sem_open(env->me_txns->mti_wmname,
4386 O_CREAT|O_EXCL, mode, 1);
4387 if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4388 #else /* MDB_USE_POSIX_SEM */
/* Default POSIX path: process-shared pthread mutexes in the region. */
4389 pthread_mutexattr_t mattr;
4391 if ((rc = pthread_mutexattr_init(&mattr))
4392 || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED))
4393 || (rc = pthread_mutex_init(&env->me_txns->mti_mutex, &mattr))
4394 || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr)))
4396 pthread_mutexattr_destroy(&mattr);
4397 #endif /* _WIN32 || MDB_USE_POSIX_SEM */
/* Stamp a fresh region header. */
4399 env->me_txns->mti_magic = MDB_MAGIC;
4400 env->me_txns->mti_format = MDB_LOCK_FORMAT;
4401 env->me_txns->mti_txnid = 0;
4402 env->me_txns->mti_numreaders = 0;
/* Shared holder: validate the existing region, then open (not create)
 * the named mutexes/semaphores. */
4405 if (env->me_txns->mti_magic != MDB_MAGIC) {
4406 DPUTS("lock region has invalid magic");
4410 if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
4411 DPRINTF(("lock region has format+version 0x%x, expected 0x%x",
4412 env->me_txns->mti_format, MDB_LOCK_FORMAT));
4413 rc = MDB_VERSION_MISMATCH;
4417 if (rc && rc != EACCES && rc != EAGAIN) {
4421 env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
4422 if (!env->me_rmutex) goto fail_errno;
4423 env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
4424 if (!env->me_wmutex) goto fail_errno;
4425 #elif defined(MDB_USE_POSIX_SEM)
4426 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
4427 if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4428 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
4429 if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4440 /** The name of the lock file in the DB environment */
4441 #define LOCKNAME "/lock.mdb"
4442 /** The name of the data file in the DB environment */
4443 #define DATANAME "/data.mdb"
4444 /** The suffix of the lock file when no subdir is used */
4445 #define LOCKSUFF "-lock"
4446 /** Only a subset of the @ref mdb_env flags can be changed
4447 * at runtime. Changing other flags requires closing the
4448 * environment and re-opening it with the new flags.
4450 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
4451 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
4452 MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
4454 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
4455 # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
/* Public entry point: open the environment at <path> with the given
 * flags and mode. Several error/cleanup lines are elided from view. */
4459 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
4461 int oflags, rc, len, excl = -1;
4462 char *lpath, *dpath;
/* Reject a second open, or any flag outside the accepted sets. */
4464 if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
/* Build lock-file and data-file paths in one allocation. */
4468 if (flags & MDB_NOSUBDIR) {
4469 rc = len + sizeof(LOCKSUFF) + len + 1;
4471 rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
4476 if (flags & MDB_NOSUBDIR) {
4477 dpath = lpath + len + sizeof(LOCKSUFF);
4478 sprintf(lpath, "%s" LOCKSUFF, path);
4479 strcpy(dpath, path);
4481 dpath = lpath + len + sizeof(LOCKNAME);
4482 sprintf(lpath, "%s" LOCKNAME, path);
4483 sprintf(dpath, "%s" DATANAME, path);
4487 flags |= env->me_flags;
4488 if (flags & MDB_RDONLY) {
4489 /* silently ignore WRITEMAP when we're only getting read access */
4490 flags &= ~MDB_WRITEMAP;
4492 if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) &&
4493 (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2)))))
4496 env->me_flags = flags |= MDB_ENV_ACTIVE;
/* Per-environment tables sized by me_maxdbs. */
4500 env->me_path = strdup(path);
4501 env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
4502 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
4503 env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
4504 if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
4509 /* For RDONLY, get lockfile after we know datafile exists */
4510 if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
4511 rc = mdb_env_setup_locks(env, lpath, mode, &excl);
/* Open the data file; RDONLY maps to read-only open flags. */
4517 if (F_ISSET(flags, MDB_RDONLY)) {
4518 oflags = GENERIC_READ;
4519 len = OPEN_EXISTING;
4521 oflags = GENERIC_READ|GENERIC_WRITE;
4524 mode = FILE_ATTRIBUTE_NORMAL;
4525 env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
4526 NULL, len, mode, NULL);
4528 if (F_ISSET(flags, MDB_RDONLY))
4531 oflags = O_RDWR | O_CREAT;
4533 env->me_fd = open(dpath, oflags, mode);
4535 if (env->me_fd == INVALID_HANDLE_VALUE) {
/* Deferred lock setup for read-only environments (see above). */
4540 if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
4541 rc = mdb_env_setup_locks(env, lpath, mode, &excl);
4546 if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
4547 if (flags & (MDB_RDONLY|MDB_WRITEMAP)) {
4548 env->me_mfd = env->me_fd;
4550 /* Synchronous fd for meta writes. Needed even with
4551 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
4554 len = OPEN_EXISTING;
4555 env->me_mfd = CreateFile(dpath, oflags,
4556 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len,
4557 mode | FILE_FLAG_WRITE_THROUGH, NULL);
4560 env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode);
4562 if (env->me_mfd == INVALID_HANDLE_VALUE) {
4567 DPRINTF(("opened dbenv %p", (void *) env));
/* Drop the exclusive lock taken during setup back to shared. */
4569 rc = mdb_env_share_locks(env, &excl);
4573 if (!((flags & MDB_RDONLY) ||
4574 (env->me_pbuf = calloc(1, env->me_psize))))
/* Writable env: preallocate the reusable write txn (me_txn0) with its
 * DB tables laid out after the MDB_txn struct in one allocation. */
4576 if (!(flags & MDB_RDONLY)) {
4578 int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs *
4579 (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1);
4580 txn = calloc(1, size);
4582 txn->mt_dbs = (MDB_db *)((char *)txn + tsize);
4583 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
4584 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
4585 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
4587 txn->mt_dbxs = env->me_dbxs;
/* Failure path: tear down everything allocated so far. */
4597 mdb_env_close0(env, excl);
4603 /** Destroy resources from mdb_env_open(), clear our readers & DBIs */
4605 mdb_env_close0(MDB_env *env, int excl)
4609 if (!(env->me_flags & MDB_ENV_ACTIVE))
4612 /* Doing this here since me_dbxs may not exist during mdb_env_close */
4613 for (i = env->me_maxdbs; --i > MAIN_DBI; )
4614 free(env->me_dbxs[i].md_name.mv_data);
4617 free(env->me_dbiseqs);
4618 free(env->me_dbflags);
4621 free(env->me_dirty_list);
4623 mdb_midl_free(env->me_free_pgs);
4625 if (env->me_flags & MDB_ENV_TXKEY) {
4626 pthread_key_delete(env->me_txkey);
4628 /* Delete our key from the global list */
/* Swap-remove from mdb_tls_keys so the Windows TLS callback stops
 * visiting this environment's key. */
4629 for (i=0; i<mdb_tls_nkeys; i++)
4630 if (mdb_tls_keys[i] == env->me_txkey) {
4631 mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
4639 munmap(env->me_map, env->me_mapsize);
/* me_mfd may alias me_fd (RDONLY/WRITEMAP); close it only if distinct. */
4641 if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE)
4642 (void) close(env->me_mfd);
4643 if (env->me_fd != INVALID_HANDLE_VALUE)
4644 (void) close(env->me_fd);
4646 MDB_PID_T pid = env->me_pid;
4647 /* Clearing readers is done in this function because
4648 * me_txkey with its destructor must be disabled first.
4650 for (i = env->me_numreaders; --i >= 0; )
4651 if (env->me_txns->mti_readers[i].mr_pid == pid)
4652 env->me_txns->mti_readers[i].mr_pid = 0;
4654 if (env->me_rmutex) {
4655 CloseHandle(env->me_rmutex);
4656 if (env->me_wmutex) CloseHandle(env->me_wmutex);
4658 /* Windows automatically destroys the mutexes when
4659 * the last handle closes.
4661 #elif defined(MDB_USE_POSIX_SEM)
4662 if (env->me_rmutex != SEM_FAILED) {
4663 sem_close(env->me_rmutex);
4664 if (env->me_wmutex != SEM_FAILED)
4665 sem_close(env->me_wmutex);
4666 /* If we have the filelock: If we are the
4667 * only remaining user, clean up semaphores.
4670 mdb_env_excl_lock(env, &excl);
4672 sem_unlink(env->me_txns->mti_rmname);
4673 sem_unlink(env->me_txns->mti_wmname);
4677 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
4679 if (env->me_lfd != INVALID_HANDLE_VALUE) {
4682 /* Unlock the lockfile. Windows would have unlocked it
4683 * after closing anyway, but not necessarily at once.
4685 UnlockFile(env->me_lfd, 0, 0, 1, 0);
4688 (void) close(env->me_lfd);
4691 env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
4696 mdb_env_close(MDB_env *env)
4703 VGMEMP_DESTROY(env);
/* Drain the free-page cache; the free(dp) inside the loop is elided
 * from this listing. */
4704 while ((dp = env->me_dpages) != NULL) {
4705 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
4706 env->me_dpages = dp->mp_next;
4710 mdb_env_close0(env, 0);
4714 /** Compare two items pointing at aligned size_t's */
4716 mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4718 return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
4719 *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4722 /** Compare two items pointing at aligned unsigned int's */
4724 mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4726 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
4727 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4730 /** Compare two items pointing at unsigned ints of unknown alignment.
4731 * Nodes and keys are guaranteed to be 2-byte aligned.
4734 mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
4736 #if BYTE_ORDER == LITTLE_ENDIAN
/* Little-endian: compare 16-bit words from the most significant end
 * (the back of the buffer) toward the front. Loop body is elided. */
4737 unsigned short *u, *c;
4740 u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
4741 c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
4744 } while(!x && u > (unsigned short *)a->mv_data);
/* Big-endian: most significant words come first; scan forward. */
4747 unsigned short *u, *c, *end;
4750 end = (unsigned short *) ((char *) a->mv_data + a->mv_size);
4751 u = (unsigned short *)a->mv_data;
4752 c = (unsigned short *)b->mv_data;
4755 } while(!x && u < end);
4760 /** Compare two items pointing at size_t's of unknown alignment. */
/* When misaligned loads are safe, reuse the fast aligned comparator;
 * otherwise fall back to the halfword-at-a-time comparator.
 * NOTE(review): the #else / #endif lines are elided by the extraction. */
4761 #ifdef MISALIGNED_OK
4762 # define mdb_cmp_clong mdb_cmp_long
4764 # define mdb_cmp_clong mdb_cmp_cint
4767 /** Compare two items lexically */
/* memcmp over the common prefix; on a tie the shorter item sorts first
 * (len_diff sign breaks the tie). NOTE(review): the declarations of diff,
 * len_diff and len, and the min-length computation, are elided. */
4769 mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
4776 	len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
4782 	diff = memcmp(a->mv_data, b->mv_data, len);
4783 	return diff ? diff : len_diff<0 ? -1 : len_diff;
4786 /** Compare two items in reverse byte order */
/* Walks both items from their last byte toward the first; the first
 * differing byte (scanning backwards) decides, length breaks ties.
 * NOTE(review): the min-length adjustment of p1_lim and the in-loop
 * "if (diff) return diff;" are elided by the extraction. */
4788 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b)
4790 	const unsigned char	*p1, *p2, *p1_lim;
4794 	p1_lim = (const unsigned char *)a->mv_data;
/* Start one past the end of each item; bytes are consumed pre-decrement. */
4795 	p1 = (const unsigned char *)a->mv_data + a->mv_size;
4796 	p2 = (const unsigned char *)b->mv_data + b->mv_size;
4798 	len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
4804 	while (p1 > p1_lim) {
4805 		diff = *--p1 - *--p2;
4809 	return len_diff<0 ? -1 : len_diff;
4812 /** Search for key within a page, using binary search.
4813  * Returns the smallest entry larger or equal to the key.
4814  * If exactp is non-null, stores whether the found entry was an exact match
4815  * in *exactp (1 or 0).
4816  * Updates the cursor index with the index of the found entry.
4817  * If no entry larger or equal to the key is found, returns NULL.
/* NOTE(review): fragmentary extraction — declarations of low/high/rc/cmp,
 * the LEAF2 branch header, the low/high updates inside both binary-search
 * loops, and the final return are elided; visible tokens kept verbatim. */
4820 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
4822 	unsigned int	 i = 0, nkeys;
4825 	MDB_page *mp = mc->mc_pg[mc->mc_top];
4826 	MDB_node	*node = NULL;
4831 	nkeys = NUMKEYS(mp);
4833 	DPRINTF(("searching %u keys in %s %spage %"Z"u",
4834 	    nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
/* Branch pages keep a dummy key in slot 0, so the search starts at 1. */
4837 	low = IS_LEAF(mp) ? 0 : 1;
4839 	cmp = mc->mc_dbx->md_cmp;
4841 	/* Branch pages have no data, so if using integer keys,
4842 	 * alignment is guaranteed. Use faster mdb_cmp_int.
4844 	if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
4845 		if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
/* LEAF2 pages store fixed-size keys inline (width = md_pad), no node
 * headers — hence the "fake" node pointer below. */
4852 		nodekey.mv_size = mc->mc_db->md_pad;
4853 		node = NODEPTR(mp, 0);	/* fake */
4854 		while (low <= high) {
4855 			i = (low + high) >> 1;
4856 			nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
4857 			rc = cmp(key, &nodekey);
4858 			DPRINTF(("found leaf index %u [%s], rc = %i",
4859 			    i, DKEY(&nodekey), rc));
/* Regular pages: binary search over node headers. */
4868 		while (low <= high) {
4869 			i = (low + high) >> 1;
4871 			node = NODEPTR(mp, i);
4872 			nodekey.mv_size = NODEKSZ(node);
4873 			nodekey.mv_data = NODEKEY(node);
4875 			rc = cmp(key, &nodekey);
4878 				DPRINTF(("found leaf index %u [%s], rc = %i",
4879 				    i, DKEY(&nodekey), rc));
4881 				DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i",
4882 				    i, DKEY(&nodekey), NODEPGNO(node), rc));
4893 	if (rc > 0) {	/* Found entry is less than the key. */
4894 		i++;	/* Skip to get the smallest entry larger than key. */
4896 			node = NODEPTR(mp, i);
4899 		*exactp = (rc == 0 && nkeys > 0);
4900 	/* store the key index */
4901 	mc->mc_ki[mc->mc_top] = i;
4903 		/* There is no entry larger or equal to the key. */
4906 	/* nodeptr is fake for LEAF2 */
/* Iterate all tracked cursors on this cursor's DBI and, for those whose
 * top page matches mc's top page, apply an adjustment.
 * NOTE(review): the parameter type of `func`, the loop body applying it,
 * and the function's declaration/closing lines are elided. */
4912 mdb_cursor_adjust(MDB_cursor *mc, func)
4916 	for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
4917 		if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
4924 /** Pop a page off the top of the cursor's stack. */
/* NOTE(review): the statements that actually decrement mc_snum/mc_top are
 * elided by the extraction; only the debug trace remains visible. */
4926 mdb_cursor_pop(MDB_cursor *mc)
4930 		MDB_page	*top = mc->mc_pg[mc->mc_top];
4936 		DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
4937 			DDBI(mc), (void *) mc));
4941 /** Push a page onto the top of the cursor's stack. */
/* Grows the cursor's page stack by one; fails with MDB_CURSOR_FULL (and
 * marks the txn in error) when the fixed CURSOR_STACK depth is exhausted.
 * NOTE(review): return type, braces and the success return are elided. */
4943 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
4945 	DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
4946 		DDBI(mc), (void *) mc));
4948 	if (mc->mc_snum >= CURSOR_STACK) {
4949 		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
4950 		return MDB_CURSOR_FULL;
/* New top slot: page installed, key index reset to 0. */
4953 	mc->mc_top = mc->mc_snum++;
4954 	mc->mc_pg[mc->mc_top] = mp;
4955 	mc->mc_ki[mc->mc_top] = 0;
4960 /** Find the address of the page corresponding to a given page number.
4961  * @param[in] txn the transaction for this access.
4962  * @param[in] pgno the page number for the page to retrieve.
4963  * @param[out] ret address of a pointer where the page's address will be stored.
4964  * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page.
4965  * @return 0 on success, non-zero on failure.
/* NOTE(review): fragmentary extraction — declarations, the "goto done"
 * jumps, level bookkeeping, and the final *ret/*lvl stores are elided. */
4968 mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl)
4970 	MDB_env *env = txn->mt_env;
/* Write txns without MDB_WRITEMAP may hold the page in a dirty or spill
 * list of this txn or an ancestor; search those before the mmap. */
4974 	if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) {
4978 			MDB_ID2L dl = tx2->mt_u.dirty_list;
4980 			/* Spilled pages were dirtied in this txn and flushed
4981 			 * because the dirty list got full. Bring this page
4982 			 * back in from the map (but don't unspill it here,
4983 			 * leave that unless page_touch happens again).
4985 			if (tx2->mt_spill_pgs) {
4986 				MDB_ID pn = pgno << 1;
4987 				x = mdb_midl_search(tx2->mt_spill_pgs, pn);
4988 				if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
4989 					p = (MDB_page *)(env->me_map + env->me_psize * pgno);
/* Check this txn level's dirty list for an in-memory copy. */
4994 				unsigned x = mdb_mid2l_search(dl, pgno);
4995 				if (x <= dl[0].mid && dl[x].mid == pgno) {
5001 		} while ((tx2 = tx2->mt_parent) != NULL);
/* Fall back to the memory map for committed pages. */
5004 	if (pgno < txn->mt_next_pgno) {
5006 		p = (MDB_page *)(env->me_map + env->me_psize * pgno);
5008 		DPRINTF(("page %"Z"u not found", pgno));
5009 		txn->mt_flags |= MDB_TXN_ERROR;
5010 		return MDB_PAGE_NOTFOUND;
5020 /** Finish #mdb_page_search() / #mdb_page_search_lowest().
5021  *	The cursor is at the root page, set up the rest of it.
/* Descends from the current (root) page through branch pages to a leaf,
 * pushing each visited page on the cursor stack, optionally touching
 * pages for write (MDB_PS_MODIFY). NOTE(review): several control-flow
 * lines (loop braces, error gotos, final return) are elided. */
5024 mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
5026 	MDB_page	*mp = mc->mc_pg[mc->mc_top];
5030 	while (IS_BRANCH(mp)) {
5034 		DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)));
5035 		mdb_cassert(mc, NUMKEYS(mp) > 1);
5036 		DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
/* FIRST/LAST bypass the key search and pick the edge child directly. */
5038 		if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
5040 			if (flags & MDB_PS_LAST)
5041 				i = NUMKEYS(mp) - 1;
5044 			node = mdb_node_search(mc, key, &exact);
/* A NULL result means the key is past the last entry: take the last child. */
5046 				i = NUMKEYS(mp) - 1;
5048 				i = mc->mc_ki[mc->mc_top];
5050 					mdb_cassert(mc, i > 0);
5054 			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
5057 		mdb_cassert(mc, i < NUMKEYS(mp));
5058 		node = NODEPTR(mp, i);
5060 		if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
5063 		mc->mc_ki[mc->mc_top] = i;
5064 		if ((rc = mdb_cursor_push(mc, mp)))
5067 		if (flags & MDB_PS_MODIFY) {
5068 			if ((rc = mdb_page_touch(mc)) != 0)
5070 			mp = mc->mc_pg[mc->mc_top];
/* Descent must terminate on a leaf; anything else is corruption. */
5075 		DPRINTF(("internal error, index points to a %02X page!?",
5077 		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
5078 		return MDB_CORRUPTED;
5081 	DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
5082 	    key ? DKEY(key) : "null"));
5083 	mc->mc_flags |= C_INITIALIZED;
5084 	mc->mc_flags &= ~C_EOF;
5089 /** Search for the lowest key under the current branch page.
5090  * This just bypasses a NUMKEYS check in the current page
5091  * before calling mdb_page_search_root(), because the callers
5092  * are all in situations where the current page is known to
/* Follows child 0 of the current branch page, pushes it, then finishes
 * the descent with MDB_PS_FIRST. NOTE(review): return type, braces and
 * the error-return lines after each call are elided. */
5096 mdb_page_search_lowest(MDB_cursor *mc)
5098 	MDB_page	*mp = mc->mc_pg[mc->mc_top];
5099 	MDB_node	*node = NODEPTR(mp, 0);
5102 	if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
5105 	mc->mc_ki[mc->mc_top] = 0;
5106 	if ((rc = mdb_cursor_push(mc, mp)))
5108 	return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
5111 /** Search for the page a given key should be in.
5112  * Push it and its parent pages on the cursor stack.
5113  * @param[in,out] mc the cursor for this operation.
5114  * @param[in] key the key to search for, or NULL for first/last page.
5115  * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
5116  *   are touched (updated with new page numbers).
5117  *   If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
5118  *   This is used by #mdb_cursor_first() and #mdb_cursor_last().
5119  *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
5120  * @return 0 on success, non-zero on failure.
/* NOTE(review): fragmentary extraction — declarations, several error
 * returns, and the stale-DB rc checks between calls are elided. */
5123 mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
5128 	/* Make sure the txn is still viable, then find the root from
5129 	 * the txn's db table and set it as the root of the cursor's stack.
5131 	if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
5132 		DPUTS("transaction has failed, must abort");
5135 		/* Make sure we're using an up-to-date root */
/* DB_STALE: re-read this named DB's record from the main DB, verifying
 * that its persistent flags still match before refreshing mc_db. */
5136 		if (*mc->mc_dbflag & DB_STALE) {
5138 			if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
5140 			mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
5141 			rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
5148 				MDB_node *leaf = mdb_node_search(&mc2,
5149 					&mc->mc_dbx->md_name, &exact);
5151 					return MDB_NOTFOUND;
5152 				rc = mdb_node_read(mc->mc_txn, leaf, &data);
/* Pull the on-disk md_flags out of the serialized MDB_db record. */
5155 				memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
5157 				/* The txn may not know this DBI, or another process may
5158 				 * have dropped and recreated the DB with other flags.
5160 				if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
5161 					return MDB_INCOMPATIBLE;
5162 				memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
5164 			*mc->mc_dbflag &= ~DB_STALE;
5166 		root = mc->mc_db->md_root;
5168 		if (root == P_INVALID) {		/* Tree is empty. */
5169 			DPUTS("tree is empty");
5170 			return MDB_NOTFOUND;
5174 	mdb_cassert(mc, root > 1);
/* Only (re)fetch the root page when the cursor isn't already on it. */
5175 	if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
5176 		if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0)
5182 	DPRINTF(("db %d root page %"Z"u has flags 0x%X",
5183 		DDBI(mc), root, mc->mc_pg[0]->mp_flags));
5185 	if (flags & MDB_PS_MODIFY) {
5186 		if ((rc = mdb_page_touch(mc)))
5190 	if (flags & MDB_PS_ROOTONLY)
5193 	return mdb_page_search_root(mc, key, flags);
/* Release an overflow-page run of `ovpages` pages starting at mp.
 * Dirty/spilled pages acquired by this txn go back to me_pghead (the
 * reusable free list); otherwise the range is recorded in mt_free_pgs.
 * NOTE(review): fragmentary extraction — declarations, spill-list
 * un-marking, the me_pghead insertion loop tail, and returns are elided. */
5197 mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
5199 	MDB_txn *txn = mc->mc_txn;
5200 	pgno_t pg = mp->mp_pgno;
5201 	unsigned x = 0, ovpages = mp->mp_pages;
5202 	MDB_env *env = txn->mt_env;
5203 	MDB_IDL sl = txn->mt_spill_pgs;
5204 	MDB_ID pn = pg << 1;
5207 	DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages));
5208 	/* If the page is dirty or on the spill list we just acquired it,
5209 	 * so we should give it back to our current free list, if any.
5210 	 * Otherwise put it onto the list of pages we freed in this txn.
5212 	 * Won't create me_pghead: me_pglast must be inited along with it.
5213 	 * Unsupported in nested txns: They would need to hide the page
5214 	 * range in ancestor txns' dirty and spilled lists.
5216 	if (env->me_pghead &&
5218 		((mp->mp_flags & P_DIRTY) ||
5219 		 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
5223 		MDB_ID2 *dl, ix, iy;
5224 		rc = mdb_midl_need(&env->me_pghead, ovpages);
5227 		if (!(mp->mp_flags & P_DIRTY)) {
5228 			/* This page is no longer spilled */
5235 			/* Remove from dirty list */
5236 			dl = txn->mt_u.dirty_list;
/* Scan the dirty list for mp, shifting entries down as we go. */
5238 			for (ix = dl[x]; ix.mptr != mp; ix = iy) {
5244 					mdb_cassert(mc, x > 1);
5246 					dl[j] = ix;		/* Unsorted. OK when MDB_TXN_ERROR. */
5247 					txn->mt_flags |= MDB_TXN_ERROR;
5248 					return MDB_CORRUPTED;
5251 			if (!(env->me_flags & MDB_WRITEMAP))
5252 				mdb_dpage_free(env, mp);
5254 		/* Insert in me_pghead */
5255 		mop = env->me_pghead;
5256 		j = mop[0] + ovpages;
/* me_pghead is kept sorted; find the insertion point from the top. */
5257 		for (i = mop[0]; i && mop[i] < pg; i--)
5263 		rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages);
5267 	mc->mc_db->md_overflow_pages -= ovpages;
5271 /** Return the data associated with a given node.
5272  * @param[in] txn The transaction for this operation.
5273  * @param[in] leaf The node being read.
5274  * @param[out] data Updated to point to the node's data.
5275  * @return 0 on success, non-zero on failure.
/* Inline data is returned by pointer into the node; F_BIGDATA nodes store
 * an overflow page number instead, which is resolved via mdb_page_get.
 * NOTE(review): return type, braces and the success returns are elided. */
5278 mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data)
5280 	MDB_page	*omp;		/* overflow page */
5284 	if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
5285 		data->mv_size = NODEDSZ(leaf);
5286 		data->mv_data = NODEDATA(leaf);
5290 	/* Read overflow data.
/* The node payload holds the overflow page number (memcpy to avoid a
 * potentially misaligned pgno_t load). */
5292 	data->mv_size = NODEDSZ(leaf);
5293 	memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
5294 	if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) {
5295 		DPRINTF(("read overflow page %"Z"u failed", pgno));
5298 	data->mv_data = METADATA(omp);
/* Public point lookup: validates arguments and txn state, then performs
 * an MDB_SET via a temporary cursor. NOTE(review): the declarations of
 * mc/mx/exact and the EINVAL/BAD_TXN return values are elided. */
5304 mdb_get(MDB_txn *txn, MDB_dbi dbi,
5305     MDB_val *key, MDB_val *data)
5312 	DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
5314 	if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
5317 	if (txn->mt_flags & MDB_TXN_ERROR)
5320 	mdb_cursor_init(&mc, txn, dbi, &mx);
5321 	return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
5324 /** Find a sibling for a page.
5325  * Replaces the page at the top of the cursor's stack with the
5326  * specified sibling, if one exists.
5327  * @param[in] mc The cursor for this operation.
5328  * @param[in] move_right Non-zero if the right sibling is requested,
5329  * otherwise the left sibling.
5330  * @return 0 on success, non-zero on failure.
/* Pops to the parent, steps the parent index left/right (recursing when
 * the parent itself is exhausted), then pushes the new child page.
 * NOTE(review): the cursor_pop call, the mc_snum++ undo path, and some
 * braces are elided by the extraction. */
5333 mdb_cursor_sibling(MDB_cursor *mc, int move_right)
5339 	if (mc->mc_snum < 2) {
5340 		return MDB_NOTFOUND;		/* root has no siblings */
5344 	DPRINTF(("parent page is page %"Z"u, index %u",
5345 		mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));
5347 	if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
5348 		       : (mc->mc_ki[mc->mc_top] == 0)) {
5349 		DPRINTF(("no more keys left, moving to %s sibling",
5350 		    move_right ? "right" : "left"));
5351 		if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
5352 			/* undo cursor_pop before returning */
5359 			mc->mc_ki[mc->mc_top]++;
5361 			mc->mc_ki[mc->mc_top]--;
5362 		DPRINTF(("just moving to %s index key %u",
5363 		    move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
5365 	mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
5367 	indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5368 	if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
5369 		/* mc will be inconsistent if caller does mc_snum++ as above */
5370 		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
5374 	mdb_cursor_push(mc, mp);
/* When moving left, land on the last key of the new sibling page. */
5376 		mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;
5381 /** Move the cursor to the next data item. */
/* Handles DUPSORT sub-cursors first (advance within the duplicate set),
 * then steps to the next key on this page or crosses to the right
 * sibling page. NOTE(review): several control-flow lines (declarations,
 * gotos, returns, braces) are elided by the extraction. */
5383 mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5389 	if (mc->mc_flags & C_EOF) {
5390 		return MDB_NOTFOUND;
5393 	mdb_cassert(mc, mc->mc_flags & C_INITIALIZED);
5395 	mp = mc->mc_pg[mc->mc_top];
5397 	if (mc->mc_db->md_flags & MDB_DUPSORT) {
5398 		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5399 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5400 			if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
/* Try to advance inside the duplicate sub-tree first. */
5401 				rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
5402 				if (op != MDB_NEXT || rc != MDB_NOTFOUND) {
5403 					if (rc == MDB_SUCCESS)
5404 						MDB_GET_KEY(leaf, key);
5409 			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5410 			if (op == MDB_NEXT_DUP)
5411 				return MDB_NOTFOUND;
5415 	DPRINTF(("cursor_next: top page is %"Z"u in cursor %p",
5416 		mdb_dbg_pgno(mp), (void *) mc));
5417 	if (mc->mc_flags & C_DEL)
5420 	if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
5421 		DPUTS("=====> move to next sibling page");
5422 		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) {
5423 			mc->mc_flags |= C_EOF;
5426 		mp = mc->mc_pg[mc->mc_top];
5427 		DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
5429 		mc->mc_ki[mc->mc_top]++;
5432 	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
5433 	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
/* LEAF2 pages: key is inline fixed-width, no data to return. */
5436 		key->mv_size = mc->mc_db->md_pad;
5437 		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5441 	mdb_cassert(mc, IS_LEAF(mp));
5442 	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5444 	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5445 		mdb_xcursor_init1(mc, leaf);
5448 		if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5451 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5452 			rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5453 			if (rc != MDB_SUCCESS)
5458 	MDB_GET_KEY(leaf, key);
5462 /** Move the cursor to the previous data item. */
/* Mirror image of mdb_cursor_next: step back within a DUPSORT duplicate
 * set, else step back on this page or cross to the left sibling page.
 * NOTE(review): several control-flow lines (declarations, gotos, returns,
 * braces) are elided by the extraction. */
5464 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5470 	mdb_cassert(mc, mc->mc_flags & C_INITIALIZED);
5472 	mp = mc->mc_pg[mc->mc_top];
5474 	if (mc->mc_db->md_flags & MDB_DUPSORT) {
5475 		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5476 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5477 			if (op == MDB_PREV || op == MDB_PREV_DUP) {
/* Try to step back inside the duplicate sub-tree first. */
5478 				rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
5479 				if (op != MDB_PREV || rc != MDB_NOTFOUND) {
5480 					if (rc == MDB_SUCCESS) {
5481 						MDB_GET_KEY(leaf, key);
5482 						mc->mc_flags &= ~C_EOF;
5488 			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5489 			if (op == MDB_PREV_DUP)
5490 				return MDB_NOTFOUND;
5494 	DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p",
5495 		mdb_dbg_pgno(mp), (void *) mc));
5497 	if (mc->mc_ki[mc->mc_top] == 0)  {
5498 		DPUTS("=====> move to prev sibling page");
5499 		if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) {
5502 		mp = mc->mc_pg[mc->mc_top];
/* After crossing left, position on the sibling's last key. */
5503 		mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
5504 		DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
5506 		mc->mc_ki[mc->mc_top]--;
5508 	mc->mc_flags &= ~C_EOF;
5510 	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
5511 	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
/* LEAF2 pages: key is inline fixed-width, no data to return. */
5514 		key->mv_size = mc->mc_db->md_pad;
5515 		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5519 	mdb_cassert(mc, IS_LEAF(mp));
5520 	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5522 	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5523 		mdb_xcursor_init1(mc, leaf);
5526 		if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5529 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5530 			rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
5531 			if (rc != MDB_SUCCESS)
5536 	MDB_GET_KEY(leaf, key);
5540 /** Set the cursor on a specific data item. */
/* Core positioning routine behind MDB_SET / MDB_SET_KEY / MDB_SET_RANGE /
 * MDB_GET_BOTH / MDB_GET_BOTH_RANGE. First tries to resolve the key on
 * the page the cursor is already on (checking first key, last key, and
 * the current index) before falling back to a full tree search; then
 * positions the DUPSORT sub-cursor when the node has duplicates.
 * NOTE(review): heavily elided extraction — declarations, many gotos/
 * labels (set1/set2/set3-style jumps), rc checks and returns are missing;
 * numbered lines kept verbatim. */
5542 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5543     MDB_cursor_op op, int *exactp)
5547 	MDB_node	*leaf = NULL;
5550 	if (key->mv_size == 0)
5551 		return MDB_BAD_VALSIZE;
5554 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5556 	/* See if we're already on the right page */
5557 	if (mc->mc_flags & C_INITIALIZED) {
5560 		mp = mc->mc_pg[mc->mc_top];
5562 			mc->mc_ki[mc->mc_top] = 0;
5563 			return MDB_NOTFOUND;
/* Compare against the page's first key. */
5565 		if (mp->mp_flags & P_LEAF2) {
5566 			nodekey.mv_size = mc->mc_db->md_pad;
5567 			nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
5569 			leaf = NODEPTR(mp, 0);
5570 			MDB_GET_KEY2(leaf, nodekey);
5572 		rc = mc->mc_dbx->md_cmp(key, &nodekey);
5574 			/* Probably happens rarely, but first node on the page
5575 			 * was the one we wanted.
5577 			mc->mc_ki[mc->mc_top] = 0;
/* Compare against the page's last key to see if key falls inside it. */
5584 			unsigned int nkeys = NUMKEYS(mp);
5586 				if (mp->mp_flags & P_LEAF2) {
5587 					nodekey.mv_data = LEAF2KEY(mp,
5588 						 nkeys-1, nodekey.mv_size);
5590 					leaf = NODEPTR(mp, nkeys-1);
5591 					MDB_GET_KEY2(leaf, nodekey);
5593 				rc = mc->mc_dbx->md_cmp(key, &nodekey);
5595 					/* last node was the one we wanted */
5596 					mc->mc_ki[mc->mc_top] = nkeys-1;
5602 				if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
5603 					/* This is definitely the right page, skip search_page */
5604 					if (mp->mp_flags & P_LEAF2) {
5605 						nodekey.mv_data = LEAF2KEY(mp,
5606 							 mc->mc_ki[mc->mc_top], nodekey.mv_size);
5608 						leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5609 						MDB_GET_KEY2(leaf, nodekey);
5611 					rc = mc->mc_dbx->md_cmp(key, &nodekey);
5613 						/* current node was the one we wanted */
5623 			/* If any parents have right-sibs, search.
5624 			 * Otherwise, there's nothing further.
5626 			for (i=0; i<mc->mc_top; i++)
5628 					NUMKEYS(mc->mc_pg[i])-1)
5630 			if (i == mc->mc_top) {
5631 				/* There are no other pages */
5632 				mc->mc_ki[mc->mc_top] = nkeys;
5633 				return MDB_NOTFOUND;
5637 			/* There are no other pages */
5638 			mc->mc_ki[mc->mc_top] = 0;
5639 			if (op == MDB_SET_RANGE && !exactp) {
5643 				return MDB_NOTFOUND;
/* Fast paths failed: do a full top-down page search. */
5647 	rc = mdb_page_search(mc, key, 0);
5648 	if (rc != MDB_SUCCESS)
5651 	mp = mc->mc_pg[mc->mc_top];
5652 	mdb_cassert(mc, IS_LEAF(mp));
5655 	leaf = mdb_node_search(mc, key, exactp);
5656 	if (exactp != NULL && !*exactp) {
5657 		/* MDB_SET specified and not an exact match. */
5658 		return MDB_NOTFOUND;
/* Inexact search fell off the page end: try the right sibling's slot 0. */
5662 		DPUTS("===> inexact leaf not found, goto sibling");
5663 		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)
5664 			return rc;		/* no entries matched */
5665 		mp = mc->mc_pg[mc->mc_top];
5666 		mdb_cassert(mc, IS_LEAF(mp));
5667 		leaf = NODEPTR(mp, 0);
5671 	mc->mc_flags |= C_INITIALIZED;
5672 	mc->mc_flags &= ~C_EOF;
5675 		if (op == MDB_SET_RANGE || op == MDB_SET_KEY) {
5676 			key->mv_size = mc->mc_db->md_pad;
5677 			key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5682 	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5683 		mdb_xcursor_init1(mc, leaf);
5686 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5687 			if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) {
5688 				rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5691 				if (op == MDB_GET_BOTH) {
/* GET_BOTH requires an exact data match; GET_BOTH_RANGE allows >=. */
5697 				rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
5698 				if (rc != MDB_SUCCESS)
5701 		} else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
5703 			if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS)
5705 			rc = mc->mc_dbx->md_dcmp(data, &d2);
5707 				if (op == MDB_GET_BOTH || rc > 0)
5708 					return MDB_NOTFOUND;
5715 			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5716 		if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5721 	/* The key already matches in all other cases */
5722 	if (op == MDB_SET_RANGE || op == MDB_SET_KEY)
5723 		MDB_GET_KEY(leaf, key);
5724 	DPRINTF(("==> cursor placed on key [%s]", DKEY(key)));
5729 /** Move the cursor to the first item in the database. */
/* Descends to the leftmost leaf (unless already positioned there), sets
 * index 0, and for DUPSORT nodes recurses into the first duplicate.
 * NOTE(review): declarations, rc-check returns and braces are elided. */
5731 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5737 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5739 	if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5740 		rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
5741 		if (rc != MDB_SUCCESS)
5744 	mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
5746 	leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
5747 	mc->mc_flags |= C_INITIALIZED;
5748 	mc->mc_flags &= ~C_EOF;
5750 	mc->mc_ki[mc->mc_top] = 0;
/* LEAF2: fixed-width inline key, nothing else to read. */
5752 	if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5753 		key->mv_size = mc->mc_db->md_pad;
5754 		key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
5759 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5760 			mdb_xcursor_init1(mc, leaf);
5761 			rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5765 			if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5769 	MDB_GET_KEY(leaf, key);
5773 /** Move the cursor to the last item in the database. */
/* Descends to the rightmost leaf, positions on its last key (setting
 * C_EOF as well as C_INITIALIZED), and for DUPSORT nodes recurses into
 * the last duplicate. NOTE(review): declarations, rc-check returns and
 * braces are elided by the extraction. */
5775 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5781 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5783 	if (!(mc->mc_flags & C_EOF)) {
5785 		if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5786 			rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
5787 			if (rc != MDB_SUCCESS)
5790 		mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
5793 	mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
5794 	mc->mc_flags |= C_INITIALIZED|C_EOF;
5795 	leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
/* LEAF2: fixed-width inline key, nothing else to read. */
5797 	if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5798 		key->mv_size = mc->mc_db->md_pad;
5799 		key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
5804 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5805 			mdb_xcursor_init1(mc, leaf);
5806 			rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
5810 			if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5815 	MDB_GET_KEY(leaf, key);
/* Public cursor dispatch: routes each MDB_cursor_op to the matching
 * internal helper (set/first/last/next/prev and the MULTIPLE variants).
 * NOTE(review): fragmentary extraction — the switch keyword, many case
 * labels, breaks, EINVAL assignments and the final return are elided;
 * numbered lines kept verbatim. */
5820 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5825 	int      (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data);
5830 	if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
5834 	case MDB_GET_CURRENT:
5835 		if (!(mc->mc_flags & C_INITIALIZED)) {
5838 			MDB_page *mp = mc->mc_pg[mc->mc_top];
5839 			int nkeys = NUMKEYS(mp);
5840 			if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
5841 				mc->mc_ki[mc->mc_top] = nkeys;
5847 				key->mv_size = mc->mc_db->md_pad;
5848 				key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5850 				MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5851 				MDB_GET_KEY(leaf, key);
5853 					if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
/* C_DEL: the current dup was deleted, sub-cursor must be re-seeded. */
5854 						if (mc->mc_flags & C_DEL)
5855 							mdb_xcursor_init1(mc, leaf);
5856 						rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT);
5858 						rc = mdb_node_read(mc->mc_txn, leaf, data);
5865 	case MDB_GET_BOTH_RANGE:
5870 		if (mc->mc_xcursor == NULL) {
5871 			rc = MDB_INCOMPATIBLE;
5881 		rc = mdb_cursor_set(mc, key, data, op,
5882 			op == MDB_SET_RANGE ? NULL : &exact);
5885 	case MDB_GET_MULTIPLE:
5886 		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
5890 		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5891 			rc = MDB_INCOMPATIBLE;
5895 		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
5896 			(mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
5899 	case MDB_NEXT_MULTIPLE:
5904 		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5905 			rc = MDB_INCOMPATIBLE;
5908 		if (!(mc->mc_flags & C_INITIALIZED))
5909 			rc = mdb_cursor_first(mc, key, data);
5911 			rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
5912 		if (rc == MDB_SUCCESS) {
5913 			if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
/* Return the whole fixed-size dup page as one contiguous blob. */
5916 				mx = &mc->mc_xcursor->mx_cursor;
5917 				data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
5919 				data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
5920 				mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
5928 	case MDB_NEXT_NODUP:
5929 		if (!(mc->mc_flags & C_INITIALIZED))
5930 			rc = mdb_cursor_first(mc, key, data);
5932 			rc = mdb_cursor_next(mc, key, data, op);
5936 	case MDB_PREV_NODUP:
5937 		if (!(mc->mc_flags & C_INITIALIZED)) {
5938 			rc = mdb_cursor_last(mc, key, data);
5941 			mc->mc_flags |= C_INITIALIZED;
5942 			mc->mc_ki[mc->mc_top]++;
5944 		rc = mdb_cursor_prev(mc, key, data, op);
5947 			rc = mdb_cursor_first(mc, key, data);
5950 		mfunc = mdb_cursor_first;
/* FIRST_DUP/LAST_DUP shared path: validate, then apply mfunc to the
 * sub-cursor. */
5952 		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
5956 		if (mc->mc_xcursor == NULL) {
5957 			rc = MDB_INCOMPATIBLE;
5961 			MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5962 			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5963 				MDB_GET_KEY(leaf, key);
5964 				rc = mdb_node_read(mc->mc_txn, leaf, data);
5968 		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
5972 		rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
5975 			rc = mdb_cursor_last(mc, key, data);
5978 		mfunc = mdb_cursor_last;
5981 		DPRINTF(("unhandled/unimplemented cursor operation %u", op));
/* Clear the one-shot "just deleted" marker on the way out. */
5986 	if (mc->mc_flags & C_DEL)
5987 		mc->mc_flags ^= C_DEL;
5992 /** Touch all the pages in the cursor stack. Set mc_top.
5993  *	Makes sure all the pages are writable, before attempting a write operation.
5994  * @param[in] mc The cursor to operate on.
/* For named sub-DBs not yet dirty in this txn, first touches the sub-DB's
 * record in the main DB (MDB_PS_MODIFY) and marks it DB_DIRTY; then
 * touches every page on the stack bottom-up. NOTE(review): declarations
 * of mc2/mcx, the rc check after the search, the mc_top reset before the
 * loop, and the return are elided. */
5997 mdb_cursor_touch(MDB_cursor *mc)
5999 	int rc = MDB_SUCCESS;
6001 	if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
6004 		if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
6006 		mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
6007 		rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
6010 		*mc->mc_dbflag |= DB_DIRTY;
/* Walk mc_top from the root to the leaf, touching each stacked page. */
6015 			rc = mdb_page_touch(mc);
6016 		} while (!rc && ++(mc->mc_top) < mc->mc_snum);
6017 		mc->mc_top = mc->mc_snum-1;
6022 /** Do not spill pages to disk if txn is getting full, may fail instead */
/* Internal flag bit passed in mdb_cursor_put()'s flags and masked off
 * there before normal flag processing. */
6023 #define MDB_NOSPILL 0x8000
6026 mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
6029 enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
6031 MDB_node *leaf = NULL;
6034 MDB_val xdata, *rdata, dkey, olddata;
6036 int do_sub = 0, insert_key, insert_data;
6037 unsigned int mcount = 0, dcount = 0, nospill;
6040 unsigned int nflags;
6043 if (mc == NULL || key == NULL)
6046 env = mc->mc_txn->mt_env;
6048 /* Check this first so counter will always be zero on any
6051 if (flags & MDB_MULTIPLE) {
6052 dcount = data[1].mv_size;
6053 data[1].mv_size = 0;
6054 if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))
6055 return MDB_INCOMPATIBLE;
6058 nospill = flags & MDB_NOSPILL;
6059 flags &= ~MDB_NOSPILL;
6061 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
6062 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
6064 if (key->mv_size-1 >= ENV_MAXKEY(env))
6065 return MDB_BAD_VALSIZE;
6067 #if SIZE_MAX > MAXDATASIZE
6068 if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
6069 return MDB_BAD_VALSIZE;
6071 if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env))
6072 return MDB_BAD_VALSIZE;
6075 DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
6076 DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
6080 if (flags == MDB_CURRENT) {
6081 if (!(mc->mc_flags & C_INITIALIZED))
6084 } else if (mc->mc_db->md_root == P_INVALID) {
6085 /* new database, cursor has nothing to point to */
6088 mc->mc_flags &= ~C_INITIALIZED;
6093 if (flags & MDB_APPEND) {
6095 rc = mdb_cursor_last(mc, &k2, &d2);
6097 rc = mc->mc_dbx->md_cmp(key, &k2);
6100 mc->mc_ki[mc->mc_top]++;
6102 /* new key is <= last key */
6107 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
6109 if ((flags & MDB_NOOVERWRITE) && rc == 0) {
6110 DPRINTF(("duplicate key [%s]", DKEY(key)));
6112 return MDB_KEYEXIST;
6114 if (rc && rc != MDB_NOTFOUND)
6118 if (mc->mc_flags & C_DEL)
6119 mc->mc_flags ^= C_DEL;
6121 /* Cursor is positioned, check for room in the dirty list */
6123 if (flags & MDB_MULTIPLE) {
6125 xdata.mv_size = data->mv_size * dcount;
6129 if ((rc2 = mdb_page_spill(mc, key, rdata)))
6133 if (rc == MDB_NO_ROOT) {
6135 /* new database, write a root leaf page */
6136 DPUTS("allocating new root leaf page");
6137 if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) {
6140 mdb_cursor_push(mc, np);
6141 mc->mc_db->md_root = np->mp_pgno;
6142 mc->mc_db->md_depth++;
6143 *mc->mc_dbflag |= DB_DIRTY;
6144 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
6146 np->mp_flags |= P_LEAF2;
6147 mc->mc_flags |= C_INITIALIZED;
6149 /* make sure all cursor pages are writable */
6150 rc2 = mdb_cursor_touch(mc);
6155 insert_key = insert_data = rc;
6157 /* The key does not exist */
6158 DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
6159 if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
6160 LEAFSIZE(key, data) > env->me_nodemax)
6162 /* Too big for a node, insert in sub-DB. Set up an empty
6163 * "old sub-page" for prep_subDB to expand to a full page.
6165 fp_flags = P_LEAF|P_DIRTY;
6167 fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */
6168 fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
6169 olddata.mv_size = PAGEHDRSZ;
6173 /* there's only a key anyway, so this is a no-op */
6174 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
6176 unsigned int ksize = mc->mc_db->md_pad;
6177 if (key->mv_size != ksize)
6178 return MDB_BAD_VALSIZE;
6179 ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
6180 memcpy(ptr, key->mv_data, ksize);
6182 /* if overwriting slot 0 of leaf, need to
6183 * update branch key if there is a parent page
6185 if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
6186 unsigned short top = mc->mc_top;
6188 /* slot 0 is always an empty key, find real slot */
6189 while (mc->mc_top && !mc->mc_ki[mc->mc_top])
6191 if (mc->mc_ki[mc->mc_top])
6192 rc2 = mdb_update_key(mc, key);
6203 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6204 olddata.mv_size = NODEDSZ(leaf);
6205 olddata.mv_data = NODEDATA(leaf);
6208 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
6209 /* Prepare (sub-)page/sub-DB to accept the new item,
6210 * if needed. fp: old sub-page or a header faking
6211 * it. mp: new (sub-)page. offset: growth in page
6212 * size. xdata: node data with new page or DB.
6214 unsigned i, offset = 0;
6215 mp = fp = xdata.mv_data = env->me_pbuf;
6216 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
6218 /* Was a single item before, must convert now */
6219 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6220 /* Just overwrite the current item */
6221 if (flags == MDB_CURRENT)
6224 #if UINT_MAX < SIZE_MAX
6225 if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
6226 mc->mc_dbx->md_dcmp = mdb_cmp_clong;
6228 /* does data match? */
6229 if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
6230 if (flags & MDB_NODUPDATA)
6231 return MDB_KEYEXIST;
6236 /* Back up original data item */
6237 dkey.mv_size = olddata.mv_size;
6238 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
6240 /* Make sub-page header for the dup items, with dummy body */
6241 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
6242 fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
6243 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
6244 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6245 fp->mp_flags |= P_LEAF2;
6246 fp->mp_pad = data->mv_size;
6247 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
6249 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
6250 (dkey.mv_size & 1) + (data->mv_size & 1);
6252 fp->mp_upper = xdata.mv_size - PAGEBASE;
6253 olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
6254 } else if (leaf->mn_flags & F_SUBDATA) {
6255 /* Data is on sub-DB, just store it */
6256 flags |= F_DUPDATA|F_SUBDATA;
6259 /* Data is on sub-page */
6260 fp = olddata.mv_data;
6263 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
6264 offset = EVEN(NODESIZE + sizeof(indx_t) +
6268 offset = fp->mp_pad;
6269 if (SIZELEFT(fp) < offset) {
6270 offset *= 4; /* space for 4 more */
6273 /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
6275 fp->mp_flags |= P_DIRTY;
6276 COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
6277 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
6281 xdata.mv_size = olddata.mv_size + offset;
6284 fp_flags = fp->mp_flags;
6285 if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
6286 /* Too big for a sub-page, convert to sub-DB */
6287 fp_flags &= ~P_SUBP;
6289 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6290 fp_flags |= P_LEAF2;
6291 dummy.md_pad = fp->mp_pad;
6292 dummy.md_flags = MDB_DUPFIXED;
6293 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
6294 dummy.md_flags |= MDB_INTEGERKEY;
6300 dummy.md_branch_pages = 0;
6301 dummy.md_leaf_pages = 1;
6302 dummy.md_overflow_pages = 0;
6303 dummy.md_entries = NUMKEYS(fp);
6304 xdata.mv_size = sizeof(MDB_db);
6305 xdata.mv_data = &dummy;
6306 if ((rc = mdb_page_alloc(mc, 1, &mp)))
6308 offset = env->me_psize - olddata.mv_size;
6309 flags |= F_DUPDATA|F_SUBDATA;
6310 dummy.md_root = mp->mp_pgno;
6313 mp->mp_flags = fp_flags | P_DIRTY;
6314 mp->mp_pad = fp->mp_pad;
6315 mp->mp_lower = fp->mp_lower;
6316 mp->mp_upper = fp->mp_upper + offset;
6317 if (fp_flags & P_LEAF2) {
6318 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
6320 memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
6321 olddata.mv_size - fp->mp_upper - PAGEBASE);
6322 for (i=0; i<NUMKEYS(fp); i++)
6323 mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
6331 mdb_node_del(mc, 0);
6335 /* overflow page overwrites need special handling */
6336 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6339 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
6341 memcpy(&pg, olddata.mv_data, sizeof(pg));
6342 if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
6344 ovpages = omp->mp_pages;
6346 /* Is the ov page large enough? */
6347 if (ovpages >= dpages) {
6348 if (!(omp->mp_flags & P_DIRTY) &&
6349 (level || (env->me_flags & MDB_WRITEMAP)))
6351 rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
6354 level = 0; /* dirty in this txn or clean */
6357 if (omp->mp_flags & P_DIRTY) {
6358 /* yes, overwrite it. Note in this case we don't
6359 * bother to try shrinking the page if the new data
6360 * is smaller than the overflow threshold.
6363 /* It is writable only in a parent txn */
6364 size_t sz = (size_t) env->me_psize * ovpages, off;
6365 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
6371 rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
6372 mdb_cassert(mc, rc2 == 0);
6373 if (!(flags & MDB_RESERVE)) {
6374 /* Copy end of page, adjusting alignment so
6375 * compiler may copy words instead of bytes.
6377 off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
6378 memcpy((size_t *)((char *)np + off),
6379 (size_t *)((char *)omp + off), sz - off);
6382 memcpy(np, omp, sz); /* Copy beginning of page */
6385 SETDSZ(leaf, data->mv_size);
6386 if (F_ISSET(flags, MDB_RESERVE))
6387 data->mv_data = METADATA(omp);
6389 memcpy(METADATA(omp), data->mv_data, data->mv_size);
6393 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
6395 } else if (data->mv_size == olddata.mv_size) {
6396 /* same size, just replace it. Note that we could
6397 * also reuse this node if the new data is smaller,
6398 * but instead we opt to shrink the node in that case.
6400 if (F_ISSET(flags, MDB_RESERVE))
6401 data->mv_data = olddata.mv_data;
6402 else if (!(mc->mc_flags & C_SUB))
6403 memcpy(olddata.mv_data, data->mv_data, data->mv_size);
6405 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
6410 mdb_node_del(mc, 0);
6416 nflags = flags & NODE_ADD_FLAGS;
6417 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
6418 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
6419 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
6420 nflags &= ~MDB_APPEND; /* sub-page may need room to grow */
6422 nflags |= MDB_SPLIT_REPLACE;
6423 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
6425 /* There is room already in this leaf page. */
6426 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
6427 if (rc == 0 && insert_key) {
6428 /* Adjust other cursors pointing to mp */
6429 MDB_cursor *m2, *m3;
6430 MDB_dbi dbi = mc->mc_dbi;
6431 unsigned i = mc->mc_top;
6432 MDB_page *mp = mc->mc_pg[i];
6434 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
6435 if (mc->mc_flags & C_SUB)
6436 m3 = &m2->mc_xcursor->mx_cursor;
6439 if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
6440 if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
6447 if (rc == MDB_SUCCESS) {
6448 /* Now store the actual data in the child DB. Note that we're
6449 * storing the user data in the keys field, so there are strict
6450 * size limits on dupdata. The actual data fields of the child
6451 * DB are all zero size.
6459 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6460 if (flags & MDB_CURRENT) {
6461 xflags = MDB_CURRENT|MDB_NOSPILL;
6463 mdb_xcursor_init1(mc, leaf);
6464 xflags = (flags & MDB_NODUPDATA) ?
6465 MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL;
6467 /* converted, write the original data first */
6469 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
6473 /* Adjust other cursors pointing to mp */
6475 unsigned i = mc->mc_top;
6476 MDB_page *mp = mc->mc_pg[i];
6478 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6479 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6480 if (!(m2->mc_flags & C_INITIALIZED)) continue;
6481 if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
6482 mdb_xcursor_init1(m2, leaf);
6486 /* we've done our job */
6489 ecount = mc->mc_xcursor->mx_db.md_entries;
6490 if (flags & MDB_APPENDDUP)
6491 xflags |= MDB_APPEND;
6492 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
6493 if (flags & F_SUBDATA) {
6494 void *db = NODEDATA(leaf);
6495 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
6497 insert_data = mc->mc_xcursor->mx_db.md_entries - ecount;
6499 /* Increment count unless we just replaced an existing item. */
6501 mc->mc_db->md_entries++;
6503 /* Invalidate txn if we created an empty sub-DB */
6506 /* If we succeeded and the key didn't exist before,
6507 * make sure the cursor is marked valid.
6509 mc->mc_flags |= C_INITIALIZED;
6511 if (flags & MDB_MULTIPLE) {
6514 /* let caller know how many succeeded, if any */
6515 data[1].mv_size = mcount;
6516 if (mcount < dcount) {
6517 data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
6518 insert_key = insert_data = 0;
6525 if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */
6528 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6533 mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6539 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
6540 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
6542 if (!(mc->mc_flags & C_INITIALIZED))
6545 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6546 return MDB_NOTFOUND;
6548 if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
6551 rc = mdb_cursor_touch(mc);
6555 mp = mc->mc_pg[mc->mc_top];
6558 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6560 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6561 if (flags & MDB_NODUPDATA) {
6562 /* mdb_cursor_del0() will subtract the final entry */
6563 mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
6565 if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
6566 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6568 rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL);
6571 /* If sub-DB still has entries, we're done */
6572 if (mc->mc_xcursor->mx_db.md_entries) {
6573 if (leaf->mn_flags & F_SUBDATA) {
6574 /* update subDB info */
6575 void *db = NODEDATA(leaf);
6576 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
6579 /* shrink fake page */
6580 mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
6581 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6582 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6583 /* fix other sub-DB cursors pointed at this fake page */
6584 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6585 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6586 if (m2->mc_pg[mc->mc_top] == mp &&
6587 m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
6588 m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6591 mc->mc_db->md_entries--;
6592 mc->mc_flags |= C_DEL;
6595 /* otherwise fall thru and delete the sub-DB */
6598 if (leaf->mn_flags & F_SUBDATA) {
6599 /* add all the child DB's pages to the free list */
6600 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
6606 /* add overflow pages to free list */
6607 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6611 memcpy(&pg, NODEDATA(leaf), sizeof(pg));
6612 if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) ||
6613 (rc = mdb_ovpage_free(mc, omp)))
6618 return mdb_cursor_del0(mc);
6621 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6625 /** Allocate and initialize new pages for a database.
6626 * @param[in] mc a cursor on the database being added to.
6627 * @param[in] flags flags defining what type of page is being allocated.
6628 * @param[in] num the number of pages to allocate. This is usually 1,
6629 * unless allocating overflow pages for a large record.
6630 * @param[out] mp Address of a page, or NULL on failure.
6631 * @return 0 on success, non-zero on failure.
6634 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp)
6639 if ((rc = mdb_page_alloc(mc, num, &np)))
6641 DPRINTF(("allocated new mpage %"Z"u, page size %u",
6642 np->mp_pgno, mc->mc_txn->mt_env->me_psize));
6643 np->mp_flags = flags | P_DIRTY;
6644 np->mp_lower = (PAGEHDRSZ-PAGEBASE);
6645 np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
6648 mc->mc_db->md_branch_pages++;
6649 else if (IS_LEAF(np))
6650 mc->mc_db->md_leaf_pages++;
6651 else if (IS_OVERFLOW(np)) {
6652 mc->mc_db->md_overflow_pages += num;
6660 /** Calculate the size of a leaf node.
6661 * The size depends on the environment's page size; if a data item
6662 * is too large it will be put onto an overflow page and the node
6663 * size will only include the key and not the data. Sizes are always
6664 * rounded up to an even number of bytes, to guarantee 2-byte alignment
6665 * of the #MDB_node headers.
6666 * @param[in] env The environment handle.
6667 * @param[in] key The key for the node.
6668 * @param[in] data The data for the node.
6669 * @return The number of bytes needed to store the node.
6672 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data)
6676 sz = LEAFSIZE(key, data);
6677 if (sz > env->me_nodemax) {
6678 /* put on overflow page */
6679 sz -= data->mv_size - sizeof(pgno_t);
6682 return EVEN(sz + sizeof(indx_t));
6685 /** Calculate the size of a branch node.
6686 * The size should depend on the environment's page size but since
6687 * we currently don't support spilling large keys onto overflow
6688 * pages, it's simply the size of the #MDB_node header plus the
6689 * size of the key. Sizes are always rounded up to an even number
6690 * of bytes, to guarantee 2-byte alignment of the #MDB_node headers.
6691 * @param[in] env The environment handle.
6692 * @param[in] key The key for the node.
6693 * @return The number of bytes needed to store the node.
6696 mdb_branch_size(MDB_env *env, MDB_val *key)
6701 if (sz > env->me_nodemax) {
6702 /* put on overflow page */
6703 /* not implemented */
6704 /* sz -= key->size - sizeof(pgno_t); */
6707 return sz + sizeof(indx_t);
6710 /** Add a node to the page pointed to by the cursor.
6711 * @param[in] mc The cursor for this operation.
6712 * @param[in] indx The index on the page where the new node should be added.
6713 * @param[in] key The key for the new node.
6714 * @param[in] data The data for the new node, if any.
6715 * @param[in] pgno The page number, if adding a branch node.
6716 * @param[in] flags Flags for the node.
6717 * @return 0 on success, non-zero on failure. Possible errors are:
6719 * <li>ENOMEM - failed to allocate overflow pages for the node.
6720 * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error
6721 * should never happen since all callers already calculate the
6722 * page's free space before calling this function.
6726 mdb_node_add(MDB_cursor *mc, indx_t indx,
6727 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
6730 size_t node_size = NODESIZE;
6734 MDB_page *mp = mc->mc_pg[mc->mc_top];
6735 MDB_page *ofp = NULL; /* overflow page */
6738 mdb_cassert(mc, mp->mp_upper >= mp->mp_lower);
6740 DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
6741 IS_LEAF(mp) ? "leaf" : "branch",
6742 IS_SUBP(mp) ? "sub-" : "",
6743 mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
6744 key ? key->mv_size : 0, key ? DKEY(key) : "null"));
6747 /* Move higher keys up one slot. */
6748 int ksize = mc->mc_db->md_pad, dif;
6749 char *ptr = LEAF2KEY(mp, indx, ksize);
6750 dif = NUMKEYS(mp) - indx;
6752 memmove(ptr+ksize, ptr, dif*ksize);
6753 /* insert new key */
6754 memcpy(ptr, key->mv_data, ksize);
6756 /* Just using these for counting */
6757 mp->mp_lower += sizeof(indx_t);
6758 mp->mp_upper -= ksize - sizeof(indx_t);
6762 room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
6764 node_size += key->mv_size;
6766 mdb_cassert(mc, data);
6767 if (F_ISSET(flags, F_BIGDATA)) {
6768 /* Data already on overflow page. */
6769 node_size += sizeof(pgno_t);
6770 } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
6771 int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
6773 /* Put data on overflow page. */
6774 DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
6775 data->mv_size, node_size+data->mv_size));
6776 node_size = EVEN(node_size + sizeof(pgno_t));
6777 if ((ssize_t)node_size > room)
6779 if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
6781 DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
6785 node_size += data->mv_size;
6788 node_size = EVEN(node_size);
6789 if ((ssize_t)node_size > room)
6793 /* Move higher pointers up one slot. */
6794 for (i = NUMKEYS(mp); i > indx; i--)
6795 mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
6797 /* Adjust free space offsets. */
6798 ofs = mp->mp_upper - node_size;
6799 mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
6800 mp->mp_ptrs[indx] = ofs;
6802 mp->mp_lower += sizeof(indx_t);
6804 /* Write the node data. */
6805 node = NODEPTR(mp, indx);
6806 node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
6807 node->mn_flags = flags;
6809 SETDSZ(node,data->mv_size);
6814 memcpy(NODEKEY(node), key->mv_data, key->mv_size);
6817 mdb_cassert(mc, key);
6819 if (F_ISSET(flags, F_BIGDATA))
6820 memcpy(node->mn_data + key->mv_size, data->mv_data,
6822 else if (F_ISSET(flags, MDB_RESERVE))
6823 data->mv_data = node->mn_data + key->mv_size;
6825 memcpy(node->mn_data + key->mv_size, data->mv_data,
6828 memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno,
6830 if (F_ISSET(flags, MDB_RESERVE))
6831 data->mv_data = METADATA(ofp);
6833 memcpy(METADATA(ofp), data->mv_data, data->mv_size);
6840 DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
6841 mdb_dbg_pgno(mp), NUMKEYS(mp)));
6842 DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
6843 DPRINTF(("node size = %"Z"u", node_size));
6844 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6845 return MDB_PAGE_FULL;
6848 /** Delete the specified node from a page.
6849 * @param[in] mc Cursor pointing to the node to delete.
6850 * @param[in] ksize The size of a node. Only used if the page is
6851 * part of a #MDB_DUPFIXED database.
6854 mdb_node_del(MDB_cursor *mc, int ksize)
6856 MDB_page *mp = mc->mc_pg[mc->mc_top];
6857 indx_t indx = mc->mc_ki[mc->mc_top];
6859 indx_t i, j, numkeys, ptr;
6863 DPRINTF(("delete node %u on %s page %"Z"u", indx,
6864 IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)));
6865 numkeys = NUMKEYS(mp);
6866 mdb_cassert(mc, indx < numkeys);
6869 int x = numkeys - 1 - indx;
6870 base = LEAF2KEY(mp, indx, ksize);
6872 memmove(base, base + ksize, x * ksize);
6873 mp->mp_lower -= sizeof(indx_t);
6874 mp->mp_upper += ksize - sizeof(indx_t);
6878 node = NODEPTR(mp, indx);
6879 sz = NODESIZE + node->mn_ksize;
6881 if (F_ISSET(node->mn_flags, F_BIGDATA))
6882 sz += sizeof(pgno_t);
6884 sz += NODEDSZ(node);
6888 ptr = mp->mp_ptrs[indx];
6889 for (i = j = 0; i < numkeys; i++) {
6891 mp->mp_ptrs[j] = mp->mp_ptrs[i];
6892 if (mp->mp_ptrs[i] < ptr)
6893 mp->mp_ptrs[j] += sz;
6898 base = (char *)mp + mp->mp_upper + PAGEBASE;
6899 memmove(base + sz, base, ptr - mp->mp_upper);
6901 mp->mp_lower -= sizeof(indx_t);
6905 /** Compact the main page after deleting a node on a subpage.
6906 * @param[in] mp The main page to operate on.
6907 * @param[in] indx The index of the subpage on the main page.
6910 mdb_node_shrink(MDB_page *mp, indx_t indx)
6916 indx_t i, numkeys, ptr;
6918 node = NODEPTR(mp, indx);
6919 sp = (MDB_page *)NODEDATA(node);
6920 delta = SIZELEFT(sp);
6921 xp = (MDB_page *)((char *)sp + delta);
6923 /* shift subpage upward */
6925 nsize = NUMKEYS(sp) * sp->mp_pad;
6927 return; /* do not make the node uneven-sized */
6928 memmove(METADATA(xp), METADATA(sp), nsize);
6931 numkeys = NUMKEYS(sp);
6932 for (i=numkeys-1; i>=0; i--)
6933 xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
6935 xp->mp_upper = sp->mp_lower;
6936 xp->mp_lower = sp->mp_lower;
6937 xp->mp_flags = sp->mp_flags;
6938 xp->mp_pad = sp->mp_pad;
6939 COPY_PGNO(xp->mp_pgno, mp->mp_pgno);
6941 nsize = NODEDSZ(node) - delta;
6942 SETDSZ(node, nsize);
6944 /* shift lower nodes upward */
6945 ptr = mp->mp_ptrs[indx];
6946 numkeys = NUMKEYS(mp);
6947 for (i = 0; i < numkeys; i++) {
6948 if (mp->mp_ptrs[i] <= ptr)
6949 mp->mp_ptrs[i] += delta;
6952 base = (char *)mp + mp->mp_upper + PAGEBASE;
6953 memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
6954 mp->mp_upper += delta;
6957 /** Initial setup of a sorted-dups cursor.
6958 * Sorted duplicates are implemented as a sub-database for the given key.
6959 * The duplicate data items are actually keys of the sub-database.
6960 * Operations on the duplicate data items are performed using a sub-cursor
6961 * initialized when the sub-database is first accessed. This function does
6962 * the preliminary setup of the sub-cursor, filling in the fields that
6963 * depend only on the parent DB.
6964 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
6967 mdb_xcursor_init0(MDB_cursor *mc)
6969 MDB_xcursor *mx = mc->mc_xcursor;
6971 mx->mx_cursor.mc_xcursor = NULL;
6972 mx->mx_cursor.mc_txn = mc->mc_txn;
6973 mx->mx_cursor.mc_db = &mx->mx_db;
6974 mx->mx_cursor.mc_dbx = &mx->mx_dbx;
6975 mx->mx_cursor.mc_dbi = mc->mc_dbi;
6976 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
6977 mx->mx_cursor.mc_snum = 0;
6978 mx->mx_cursor.mc_top = 0;
6979 mx->mx_cursor.mc_flags = C_SUB;
6980 mx->mx_dbx.md_name.mv_size = 0;
6981 mx->mx_dbx.md_name.mv_data = NULL;
6982 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
6983 mx->mx_dbx.md_dcmp = NULL;
6984 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
6987 /** Final setup of a sorted-dups cursor.
6988 * Sets up the fields that depend on the data from the main cursor.
6989 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
6990 * @param[in] node The data containing the #MDB_db record for the
6991 * sorted-dup database.
6994 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6996 MDB_xcursor *mx = mc->mc_xcursor;
6998 if (node->mn_flags & F_SUBDATA) {
6999 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
7000 mx->mx_cursor.mc_pg[0] = 0;
7001 mx->mx_cursor.mc_snum = 0;
7002 mx->mx_cursor.mc_top = 0;
7003 mx->mx_cursor.mc_flags = C_SUB;
7005 MDB_page *fp = NODEDATA(node);
7006 mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad;
7007 mx->mx_db.md_flags = 0;
7008 mx->mx_db.md_depth = 1;
7009 mx->mx_db.md_branch_pages = 0;
7010 mx->mx_db.md_leaf_pages = 1;
7011 mx->mx_db.md_overflow_pages = 0;
7012 mx->mx_db.md_entries = NUMKEYS(fp);
7013 COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
7014 mx->mx_cursor.mc_snum = 1;
7015 mx->mx_cursor.mc_top = 0;
7016 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
7017 mx->mx_cursor.mc_pg[0] = fp;
7018 mx->mx_cursor.mc_ki[0] = 0;
7019 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
7020 mx->mx_db.md_flags = MDB_DUPFIXED;
7021 mx->mx_db.md_pad = fp->mp_pad;
7022 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
7023 mx->mx_db.md_flags |= MDB_INTEGERKEY;
7026 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
7027 mx->mx_db.md_root));
7028 mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
7029 #if UINT_MAX < SIZE_MAX
7030 if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
7031 mx->mx_dbx.md_cmp = mdb_cmp_clong;
7035 /** Initialize a cursor for a given transaction and database. */
7037 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
7040 mc->mc_backup = NULL;
7043 mc->mc_db = &txn->mt_dbs[dbi];
7044 mc->mc_dbx = &txn->mt_dbxs[dbi];
7045 mc->mc_dbflag = &txn->mt_dbflags[dbi];
7050 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
7051 mdb_tassert(txn, mx != NULL);
7052 mc->mc_xcursor = mx;
7053 mdb_xcursor_init0(mc);
7055 mc->mc_xcursor = NULL;
7057 if (*mc->mc_dbflag & DB_STALE) {
7058 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY);
7063 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
7066 size_t size = sizeof(MDB_cursor);
7068 if (!ret || !TXN_DBI_EXIST(txn, dbi))
7071 if (txn->mt_flags & MDB_TXN_ERROR)
7074 /* Allow read access to the freelist */
7075 if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
7078 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)
7079 size += sizeof(MDB_xcursor);
7081 if ((mc = malloc(size)) != NULL) {
7082 mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1));
7083 if (txn->mt_cursors) {
7084 mc->mc_next = txn->mt_cursors[dbi];
7085 txn->mt_cursors[dbi] = mc;
7086 mc->mc_flags |= C_UNTRACK;
7098 mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc)
7100 if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi))
7103 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)
7106 if (txn->mt_flags & MDB_TXN_ERROR)
7109 mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor);
7113 /* Return the count of duplicate data items for the current key */
7115 mdb_cursor_count(MDB_cursor *mc, size_t *countp)
7119 if (mc == NULL || countp == NULL)
7122 if (mc->mc_xcursor == NULL)
7123 return MDB_INCOMPATIBLE;
7125 if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
7128 if (!(mc->mc_flags & C_INITIALIZED))
7131 if (!mc->mc_snum || (mc->mc_flags & C_EOF))
7132 return MDB_NOTFOUND;
7134 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7135 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7138 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
7141 *countp = mc->mc_xcursor->mx_db.md_entries;
7147 mdb_cursor_close(MDB_cursor *mc)
7149 if (mc && !mc->mc_backup) {
7150 /* remove from txn, if tracked */
7151 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) {
7152 MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
7153 while (*prev && *prev != mc) prev = &(*prev)->mc_next;
7155 *prev = mc->mc_next;
7162 mdb_cursor_txn(MDB_cursor *mc)
7164 if (!mc) return NULL;
7169 mdb_cursor_dbi(MDB_cursor *mc)
7174 /** Replace the key for a branch node with a new key.
7175 * @param[in] mc Cursor pointing to the node to operate on.
7176 * @param[in] key The new key to use.
7177 * @return 0 on success, non-zero on failure.
7180 mdb_update_key(MDB_cursor *mc, MDB_val *key)
7186 int delta, ksize, oksize;
7187 indx_t ptr, i, numkeys, indx;
7190 indx = mc->mc_ki[mc->mc_top];
7191 mp = mc->mc_pg[mc->mc_top];
7192 node = NODEPTR(mp, indx);
7193 ptr = mp->mp_ptrs[indx];
7197 char kbuf2[DKBUF_MAXKEYSIZE*2+1];
7198 k2.mv_data = NODEKEY(node);
7199 k2.mv_size = node->mn_ksize;
7200 DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u",
7202 mdb_dkey(&k2, kbuf2),
7208 /* Sizes must be 2-byte aligned. */
7209 ksize = EVEN(key->mv_size);
7210 oksize = EVEN(node->mn_ksize);
7211 delta = ksize - oksize;
7213 /* Shift node contents if EVEN(key length) changed. */
7215 if (delta > 0 && SIZELEFT(mp) < delta) {
7217 /* not enough space left, do a delete and split */
7218 DPRINTF(("Not enough room, delta = %d, splitting...", delta));
7219 pgno = NODEPGNO(node);
7220 mdb_node_del(mc, 0);
7221 return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
7224 numkeys = NUMKEYS(mp);
7225 for (i = 0; i < numkeys; i++) {
7226 if (mp->mp_ptrs[i] <= ptr)
7227 mp->mp_ptrs[i] -= delta;
7230 base = (char *)mp + mp->mp_upper + PAGEBASE;
7231 len = ptr - mp->mp_upper + NODESIZE;
7232 memmove(base - delta, base, len);
7233 mp->mp_upper -= delta;
7235 node = NODEPTR(mp, indx);
7238 /* But even if no shift was needed, update ksize */
7239 if (node->mn_ksize != key->mv_size)
7240 node->mn_ksize = key->mv_size;
7243 memcpy(NODEKEY(node), key->mv_data, key->mv_size);
7249 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
7251 /** Move a node from csrc to cdst.
7254 mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
7261 unsigned short flags;
7265 /* Mark src and dst as dirty. */
7266 if ((rc = mdb_page_touch(csrc)) ||
7267 (rc = mdb_page_touch(cdst)))
7270 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7271 key.mv_size = csrc->mc_db->md_pad;
7272 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
7274 data.mv_data = NULL;
7278 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
7279 mdb_cassert(csrc, !((size_t)srcnode & 1));
7280 srcpg = NODEPGNO(srcnode);
7281 flags = srcnode->mn_flags;
7282 if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7283 unsigned int snum = csrc->mc_snum;
7285 /* must find the lowest key below src */
7286 rc = mdb_page_search_lowest(csrc);
7289 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7290 key.mv_size = csrc->mc_db->md_pad;
7291 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7293 s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7294 key.mv_size = NODEKSZ(s2);
7295 key.mv_data = NODEKEY(s2);
7297 csrc->mc_snum = snum--;
7298 csrc->mc_top = snum;
7300 key.mv_size = NODEKSZ(srcnode);
7301 key.mv_data = NODEKEY(srcnode);
7303 data.mv_size = NODEDSZ(srcnode);
7304 data.mv_data = NODEDATA(srcnode);
7306 if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
7307 unsigned int snum = cdst->mc_snum;
7310 /* must find the lowest key below dst */
7311 mdb_cursor_copy(cdst, &mn);
7312 rc = mdb_page_search_lowest(&mn);
7315 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7316 bkey.mv_size = mn.mc_db->md_pad;
7317 bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
7319 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7320 bkey.mv_size = NODEKSZ(s2);
7321 bkey.mv_data = NODEKEY(s2);
7323 mn.mc_snum = snum--;
7326 rc = mdb_update_key(&mn, &bkey);
7331 DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
7332 IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
7333 csrc->mc_ki[csrc->mc_top],
7335 csrc->mc_pg[csrc->mc_top]->mp_pgno,
7336 cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));
7338 /* Add the node to the destination page.
7340 rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
7341 if (rc != MDB_SUCCESS)
7344 /* Delete the node from the source page.
7346 mdb_node_del(csrc, key.mv_size);
7349 /* Adjust other cursors pointing to mp */
7350 MDB_cursor *m2, *m3;
7351 MDB_dbi dbi = csrc->mc_dbi;
7352 MDB_page *mp = csrc->mc_pg[csrc->mc_top];
7354 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7355 if (csrc->mc_flags & C_SUB)
7356 m3 = &m2->mc_xcursor->mx_cursor;
7359 if (m3 == csrc) continue;
7360 if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
7361 csrc->mc_ki[csrc->mc_top]) {
7362 m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
7363 m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
7368 /* Update the parent separators.
7370 if (csrc->mc_ki[csrc->mc_top] == 0) {
7371 if (csrc->mc_ki[csrc->mc_top-1] != 0) {
7372 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7373 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7375 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7376 key.mv_size = NODEKSZ(srcnode);
7377 key.mv_data = NODEKEY(srcnode);
7379 DPRINTF(("update separator for source page %"Z"u to [%s]",
7380 csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
7381 mdb_cursor_copy(csrc, &mn);
7384 if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
7387 if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7389 indx_t ix = csrc->mc_ki[csrc->mc_top];
7390 nullkey.mv_size = 0;
7391 csrc->mc_ki[csrc->mc_top] = 0;
7392 rc = mdb_update_key(csrc, &nullkey);
7393 csrc->mc_ki[csrc->mc_top] = ix;
7394 mdb_cassert(csrc, rc == MDB_SUCCESS);
7398 if (cdst->mc_ki[cdst->mc_top] == 0) {
7399 if (cdst->mc_ki[cdst->mc_top-1] != 0) {
7400 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7401 key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
7403 srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
7404 key.mv_size = NODEKSZ(srcnode);
7405 key.mv_data = NODEKEY(srcnode);
7407 DPRINTF(("update separator for destination page %"Z"u to [%s]",
7408 cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
7409 mdb_cursor_copy(cdst, &mn);
7412 if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
7415 if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
7417 indx_t ix = cdst->mc_ki[cdst->mc_top];
7418 nullkey.mv_size = 0;
7419 cdst->mc_ki[cdst->mc_top] = 0;
7420 rc = mdb_update_key(cdst, &nullkey);
7421 cdst->mc_ki[cdst->mc_top] = ix;
7422 mdb_cassert(csrc, rc == MDB_SUCCESS);
7429 /** Merge one page into another.
7430 * The nodes from the page pointed to by \b csrc will
7431 * be copied to the page pointed to by \b cdst and then
7432 * the \b csrc page will be freed.
7433 * @param[in] csrc Cursor pointing to the source page.
7434 * @param[in] cdst Cursor pointing to the destination page.
7435 * @return 0 on success, non-zero on failure.
/* NOTE(review): this excerpt elides some original lines (declarations,
 * error-path gotos, closing braces); comments below describe only the
 * visible code. */
7438 mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7440 MDB_page *psrc, *pdst;
7447 psrc = csrc->mc_pg[csrc->mc_top];
7448 pdst = cdst->mc_pg[cdst->mc_top];
7450 DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno));
7452 mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */
7453 mdb_cassert(csrc, cdst->mc_snum > 1);
7455 /* Mark dst as dirty. */
7456 if ((rc = mdb_page_touch(cdst)))
7459 /* Move all nodes from src to dst.
/* Destination keeps its existing nodes; source nodes are appended
 * starting at index j == NUMKEYS(pdst). */
7461 j = nkeys = NUMKEYS(pdst);
7462 if (IS_LEAF2(psrc)) {
/* LEAF2 pages hold fixed-size keys only (size md_pad), no data items. */
7463 key.mv_size = csrc->mc_db->md_pad;
7464 key.mv_data = METADATA(psrc);
7465 for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7466 rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
7467 if (rc != MDB_SUCCESS)
7469 key.mv_data = (char *)key.mv_data + key.mv_size;
7472 for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7473 srcnode = NODEPTR(psrc, i);
7474 if (i == 0 && IS_BRANCH(psrc)) {
/* The first node of a branch page carries no key; fetch the lowest
 * leaf key under it to use as the separator when re-adding. */
7477 mdb_cursor_copy(csrc, &mn);
7478 /* must find the lowest key below src */
7479 rc = mdb_page_search_lowest(&mn);
7482 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7483 key.mv_size = mn.mc_db->md_pad;
7484 key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
7486 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7487 key.mv_size = NODEKSZ(s2);
7488 key.mv_data = NODEKEY(s2);
7491 key.mv_size = srcnode->mn_ksize;
7492 key.mv_data = NODEKEY(srcnode);
7495 data.mv_size = NODEDSZ(srcnode);
7496 data.mv_data = NODEDATA(srcnode);
7497 rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
7498 if (rc != MDB_SUCCESS)
7503 DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)",
7504 pdst->mp_pgno, NUMKEYS(pdst),
7505 (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));
7507 /* Unlink the src page from parent and add to free list.
/* Pop up one level and delete the parent's pointer to the (now empty)
 * source page; if it was the leftmost entry, refresh the separator key. */
7510 mdb_node_del(csrc, 0);
7511 if (csrc->mc_ki[csrc->mc_top] == 0) {
7513 rc = mdb_update_key(csrc, &key);
7521 psrc = csrc->mc_pg[csrc->mc_top];
7522 /* If not operating on FreeDB, allow this page to be reused
7523 * in this txn. Otherwise just add to free list.
7525 rc = mdb_page_loose(csrc, psrc);
/* Keep per-DB page accounting in sync with the freed page's type. */
7529 csrc->mc_db->md_leaf_pages--;
7531 csrc->mc_db->md_branch_pages--;
7533 /* Adjust other cursors pointing to mp */
7534 MDB_cursor *m2, *m3;
7535 MDB_dbi dbi = csrc->mc_dbi;
7537 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7538 if (csrc->mc_flags & C_SUB)
7539 m3 = &m2->mc_xcursor->mx_cursor;
7542 if (m3 == csrc) continue;
7543 if (m3->mc_snum < csrc->mc_snum) continue;
7544 if (m3->mc_pg[csrc->mc_top] == psrc) {
/* Cursors on the old source page now point into dst, shifted past
 * dst's pre-merge key count. */
7545 m3->mc_pg[csrc->mc_top] = pdst;
7546 m3->mc_ki[csrc->mc_top] += nkeys;
/* The parent lost a child pointer; rebalance it, then restore this
 * cursor's stack depth unless the tree actually shrank. */
7551 unsigned int snum = cdst->mc_snum;
7552 uint16_t depth = cdst->mc_db->md_depth;
7553 mdb_cursor_pop(cdst);
7554 rc = mdb_rebalance(cdst);
7555 /* Did the tree shrink? */
7556 if (depth > cdst->mc_db->md_depth)
7558 cdst->mc_snum = snum;
7559 cdst->mc_top = snum-1;
7564 /** Copy the contents of a cursor.
7565 * @param[in] csrc The cursor to copy from.
7566 * @param[out] cdst The cursor to copy to.
/* Shallow field-by-field copy: cdst shares csrc's txn/db pointers and
 * duplicates the page/index stack up to mc_snum entries.
 * NOTE(review): some original lines are elided in this excerpt
 * (e.g. the loop-index declaration and closing brace). */
7569 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
7573 cdst->mc_txn = csrc->mc_txn;
7574 cdst->mc_dbi = csrc->mc_dbi;
7575 cdst->mc_db = csrc->mc_db;
7576 cdst->mc_dbx = csrc->mc_dbx;
7577 cdst->mc_snum = csrc->mc_snum;
7578 cdst->mc_top = csrc->mc_top;
7579 cdst->mc_flags = csrc->mc_flags;
7581 for (i=0; i<csrc->mc_snum; i++) {
7582 cdst->mc_pg[i] = csrc->mc_pg[i];
7583 cdst->mc_ki[i] = csrc->mc_ki[i];
7587 /** Rebalance the tree after a delete operation.
7588 * @param[in] mc Cursor pointing to the page where rebalancing
7590 * @return 0 on success, non-zero on failure.
/* NOTE(review): this excerpt elides several original lines (returns,
 * else-branches, braces); comments describe only the visible code. */
7593 mdb_rebalance(MDB_cursor *mc)
7597 unsigned int ptop, minkeys;
/* A branch page needs at least 2 keys; a leaf needs at least 1. */
7601 minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top]));
7602 DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)",
7603 IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
7604 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
7605 (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));
/* Fast path: page is sufficiently full and has enough keys — nothing to do. */
7607 if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD &&
7608 NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
7609 DPRINTF(("no need to rebalance page %"Z"u, above fill threshold",
7610 mdb_dbg_pgno(mc->mc_pg[mc->mc_top])));
/* Root-page cases: with only one page on the cursor stack there is no
 * sibling to borrow from or merge with. */
7614 if (mc->mc_snum < 2) {
7615 MDB_page *mp = mc->mc_pg[0];
7617 DPUTS("Can't rebalance a subpage, ignoring");
7620 if (NUMKEYS(mp) == 0) {
/* Last entry deleted: the whole DB becomes empty. */
7621 DPUTS("tree is completely empty");
7622 mc->mc_db->md_root = P_INVALID;
7623 mc->mc_db->md_depth = 0;
7624 mc->mc_db->md_leaf_pages = 0;
7625 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
7628 /* Adjust cursors pointing to mp */
7631 mc->mc_flags &= ~C_INITIALIZED;
7633 MDB_cursor *m2, *m3;
7634 MDB_dbi dbi = mc->mc_dbi;
7636 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7637 if (mc->mc_flags & C_SUB)
7638 m3 = &m2->mc_xcursor->mx_cursor;
7641 if (m3->mc_snum < mc->mc_snum) continue;
7642 if (m3->mc_pg[0] == mp) {
7645 m3->mc_flags &= ~C_INITIALIZED;
7649 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
/* A branch root with a single child can be replaced by that child,
 * reducing tree depth by one. */
7651 DPUTS("collapsing root page!");
7652 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
7655 mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
7656 rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL);
7659 mc->mc_db->md_depth--;
7660 mc->mc_db->md_branch_pages--;
/* Shift this cursor's stack down one level to drop the old root. */
7661 mc->mc_ki[0] = mc->mc_ki[1];
7662 for (i = 1; i<mc->mc_db->md_depth; i++) {
7663 mc->mc_pg[i] = mc->mc_pg[i+1];
7664 mc->mc_ki[i] = mc->mc_ki[i+1];
7667 /* Adjust other cursors pointing to mp */
7668 MDB_cursor *m2, *m3;
7669 MDB_dbi dbi = mc->mc_dbi;
7671 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7672 if (mc->mc_flags & C_SUB)
7673 m3 = &m2->mc_xcursor->mx_cursor;
7676 if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
7677 if (m3->mc_pg[0] == mp) {
7680 for (i=0; i<m3->mc_snum; i++) {
7681 m3->mc_pg[i] = m3->mc_pg[i+1];
7682 m3->mc_ki[i] = m3->mc_ki[i+1];
7688 DPUTS("root page doesn't need rebalancing");
7692 /* The parent (branch page) must have at least 2 pointers,
7693 * otherwise the tree is invalid.
7695 ptop = mc->mc_top-1;
7696 mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);
7698 /* Leaf page fill factor is below the threshold.
7699 * Try to move keys from left or right neighbor, or
7700 * merge with a neighbor page.
/* mn becomes a scratch cursor positioned on the chosen sibling. */
7705 mdb_cursor_copy(mc, &mn);
7706 mn.mc_xcursor = NULL;
7708 oldki = mc->mc_ki[mc->mc_top];
7709 if (mc->mc_ki[ptop] == 0) {
7710 /* We're the leftmost leaf in our parent.
7712 DPUTS("reading right neighbor");
7714 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
7715 rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
7718 mn.mc_ki[mn.mc_top] = 0;
7719 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
7721 /* There is at least one neighbor to the left.
7723 DPUTS("reading left neighbor");
7725 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
7726 rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
7729 mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
7730 mc->mc_ki[mc->mc_top] = 0;
7733 DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)",
7734 mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
7735 (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));
7737 /* If the neighbor page is above threshold and has enough keys,
7738 * move one key from it. Otherwise we should try to merge them.
7739 * (A branch page must never have less than 2 keys.)
7741 minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top]));
7742 if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
7743 rc = mdb_node_move(&mn, mc);
7744 if (mc->mc_ki[ptop]) {
/* Merge direction depends on position: the leftmost page absorbs its
 * right sibling, otherwise this page is merged into its left sibling
 * and mc is re-pointed at the surviving cursor. */
7748 if (mc->mc_ki[ptop] == 0) {
7749 rc = mdb_page_merge(&mn, mc);
7751 oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
7752 mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
7753 rc = mdb_page_merge(mc, &mn);
7754 mdb_cursor_copy(&mn, mc);
7756 mc->mc_flags &= ~C_EOF;
/* Restore the caller's logical position within the (possibly merged) page. */
7758 mc->mc_ki[mc->mc_top] = oldki;
7762 /** Complete a delete operation started by #mdb_cursor_del(). */
/* Removes the node under the cursor, rebalances, then fixes up every
 * other tracked cursor on the same page.  NOTE(review): some original
 * lines (declarations, braces, returns) are elided in this excerpt. */
7764 mdb_cursor_del0(MDB_cursor *mc)
7771 ki = mc->mc_ki[mc->mc_top];
/* Second arg is the LEAF2 key size (md_pad); ignored for normal pages. */
7772 mdb_node_del(mc, mc->mc_db->md_pad);
7773 mc->mc_db->md_entries--;
7774 rc = mdb_rebalance(mc);
7776 if (rc == MDB_SUCCESS) {
7777 MDB_cursor *m2, *m3;
7778 MDB_dbi dbi = mc->mc_dbi;
7780 mp = mc->mc_pg[mc->mc_top];
7781 nkeys = NUMKEYS(mp);
7783 /* if mc points past last node in page, find next sibling */
7784 if (mc->mc_ki[mc->mc_top] >= nkeys) {
7785 rc = mdb_cursor_sibling(mc, 1);
7786 if (rc == MDB_NOTFOUND) {
7787 mc->mc_flags |= C_EOF;
7792 /* Adjust other cursors pointing to mp */
7793 for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) {
7794 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
7795 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
7797 if (m3 == mc || m3->mc_snum < mc->mc_snum)
7799 if (m3->mc_pg[mc->mc_top] == mp) {
7800 if (m3->mc_ki[mc->mc_top] >= ki) {
/* Cursors at or past the deleted slot are marked C_DEL; those strictly
 * past it slide back one index to stay on the same logical item. */
7801 m3->mc_flags |= C_DEL;
7802 if (m3->mc_ki[mc->mc_top] > ki)
7803 m3->mc_ki[mc->mc_top]--;
7804 else if (mc->mc_db->md_flags & MDB_DUPSORT)
7805 m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF;
7807 if (m3->mc_ki[mc->mc_top] >= nkeys) {
7808 rc = mdb_cursor_sibling(m3, 1);
7809 if (rc == MDB_NOTFOUND) {
7810 m3->mc_flags |= C_EOF;
7816 mc->mc_flags |= C_DEL;
/* On failure the txn is poisoned: no further writes are allowed. */
7820 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
/* Public delete entry point: validate handle and txn state, then
 * delegate to mdb_del0().  For non-DUPSORT DBs the data parameter is
 * ignored.  NOTE(review): some original lines are elided here. */
7825 mdb_del(MDB_txn *txn, MDB_dbi dbi,
7826 MDB_val *key, MDB_val *data)
7828 if (!key || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
/* Read-only txns get EACCES; errored txns get MDB_BAD_TXN. */
7831 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
7832 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
7834 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) {
7835 /* must ignore any data */
7839 return mdb_del0(txn, dbi, key, data, 0);
/* Internal delete: position a stack cursor on the key, temporarily link
 * it into the txn's tracked-cursor list so rebalancing/splitting keeps
 * it consistent, then perform the cursor delete.
 * NOTE(review): some original lines are elided in this excerpt. */
7843 mdb_del0(MDB_txn *txn, MDB_dbi dbi,
7844 MDB_val *key, MDB_val *data, unsigned flags)
7849 MDB_val rdata, *xdata;
7853 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key)));
7855 mdb_cursor_init(&mc, txn, dbi, &mx);
7864 flags |= MDB_NODUPDATA;
7866 rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
7868 /* let mdb_page_split know about this cursor if needed:
7869 * delete will trigger a rebalance; if it needs to move
7870 * a node from one page to another, it will have to
7871 * update the parent's separator key(s). If the new sepkey
7872 * is larger than the current one, the parent page may
7873 * run out of space, triggering a split. We need this
7874 * cursor to be consistent until the end of the rebalance.
7876 mc.mc_flags |= C_UNTRACK;
7877 mc.mc_next = txn->mt_cursors[dbi];
7878 txn->mt_cursors[dbi] = &mc;
7879 rc = mdb_cursor_del(&mc, flags);
/* Unlink the stack cursor before returning. */
7880 txn->mt_cursors[dbi] = mc.mc_next;
7885 /** Split a page and insert a new node.
7886 * @param[in,out] mc Cursor pointing to the page and desired insertion index.
7887 * The cursor will be updated to point to the actual page and index where
7888 * the node got inserted after the split.
7889 * @param[in] newkey The key for the newly inserted node.
7890 * @param[in] newdata The data for the newly inserted node.
7891 * @param[in] newpgno The page number, if the new node is a branch node.
7892 * @param[in] nflags The #NODE_ADD_FLAGS for the new node.
7893 * @return 0 on success, non-zero on failure.
/* NOTE(review): this excerpt elides many original lines (error paths,
 * else-branches, braces); comments below describe only the visible code. */
7896 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno,
7897 unsigned int nflags)
7900 int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
7903 int i, j, split_indx, nkeys, pmax;
7904 MDB_env *env = mc->mc_txn->mt_env;
7906 MDB_val sepkey, rkey, xdata, *rdata = &xdata;
7907 MDB_page *copy = NULL;
7908 MDB_page *mp, *rp, *pp;
7913 mp = mc->mc_pg[mc->mc_top];
7914 newindx = mc->mc_ki[mc->mc_top];
7915 nkeys = NUMKEYS(mp);
7917 DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
7918 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
7919 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
7921 /* Create a right sibling. */
7922 if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
7924 DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno));
/* Splitting the root: allocate a new branch root above the current page. */
7926 if (mc->mc_snum < 2) {
7927 if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
7929 /* shift current top to make room for new parent */
7930 mc->mc_pg[1] = mc->mc_pg[0];
7931 mc->mc_ki[1] = mc->mc_ki[0];
7934 mc->mc_db->md_root = pp->mp_pgno;
7935 DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno));
7936 mc->mc_db->md_depth++;
7939 /* Add left (implicit) pointer. */
7940 if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) {
7941 /* undo the pre-push */
7942 mc->mc_pg[0] = mc->mc_pg[1];
7943 mc->mc_ki[0] = mc->mc_ki[1];
7944 mc->mc_db->md_root = mp->mp_pgno;
7945 mc->mc_db->md_depth--;
7952 ptop = mc->mc_top-1;
7953 DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno));
/* C_SPLITTING shields this cursor from the cursor-fixup pass below. */
7956 mc->mc_flags |= C_SPLITTING;
7957 mdb_cursor_copy(mc, &mn);
7958 mn.mc_pg[mn.mc_top] = rp;
7959 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
7961 if (nflags & MDB_APPEND) {
/* Append mode: the new item goes alone onto the new right page. */
7962 mn.mc_ki[mn.mc_top] = 0;
7964 split_indx = newindx;
7968 split_indx = (nkeys+1) / 2;
/* LEAF2 split: fixed-size keys are moved with raw memcpy/memmove,
 * adjusting mp_lower/mp_upper by hand instead of using mdb_node_add. */
7973 unsigned int lsize, rsize, ksize;
7974 /* Move half of the keys to the right sibling */
7975 x = mc->mc_ki[mc->mc_top] - split_indx;
7976 ksize = mc->mc_db->md_pad;
7977 split = LEAF2KEY(mp, split_indx, ksize);
7978 rsize = (nkeys - split_indx) * ksize;
7979 lsize = (nkeys - split_indx) * sizeof(indx_t);
7980 mp->mp_lower -= lsize;
7981 rp->mp_lower += lsize;
7982 mp->mp_upper += rsize - lsize;
7983 rp->mp_upper -= rsize - lsize;
7984 sepkey.mv_size = ksize;
7985 if (newindx == split_indx) {
7986 sepkey.mv_data = newkey->mv_data;
7988 sepkey.mv_data = split;
/* New key lands on the left page: shift tail right and insert. */
7991 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
7992 memcpy(rp->mp_ptrs, split, rsize);
7993 sepkey.mv_data = rp->mp_ptrs;
7994 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
7995 memcpy(ins, newkey->mv_data, ksize);
7996 mp->mp_lower += sizeof(indx_t);
7997 mp->mp_upper -= ksize - sizeof(indx_t);
/* New key lands on the right page at offset x. */
8000 memcpy(rp->mp_ptrs, split, x * ksize);
8001 ins = LEAF2KEY(rp, x, ksize);
8002 memcpy(ins, newkey->mv_data, ksize);
8003 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
8004 rp->mp_lower += sizeof(indx_t);
8005 rp->mp_upper -= ksize - sizeof(indx_t);
8006 mc->mc_ki[mc->mc_top] = x;
8007 mc->mc_pg[mc->mc_top] = rp;
/* Variable-size node split: build the new layout in a scratch copy. */
8010 int psize, nsize, k;
8011 /* Maximum free space in an empty page */
8012 pmax = env->me_psize - PAGEHDRSZ;
8014 nsize = mdb_leaf_size(env, newkey, newdata);
8016 nsize = mdb_branch_size(env, newkey);
8017 nsize = EVEN(nsize);
8019 /* grab a page to hold a temporary copy */
8020 copy = mdb_page_malloc(mc->mc_txn, 1);
8025 copy->mp_pgno = mp->mp_pgno;
8026 copy->mp_flags = mp->mp_flags;
8027 copy->mp_lower = (PAGEHDRSZ-PAGEBASE);
8028 copy->mp_upper = env->me_psize - PAGEBASE;
8030 /* prepare to insert */
/* Slot 0 (value 0) marks where the new node will be inserted. */
8031 for (i=0, j=0; i<nkeys; i++) {
8033 copy->mp_ptrs[j++] = 0;
8035 copy->mp_ptrs[j++] = mp->mp_ptrs[i];
8038 /* When items are relatively large the split point needs
8039 * to be checked, because being off-by-one will make the
8040 * difference between success or failure in mdb_node_add.
8042 * It's also relevant if a page happens to be laid out
8043 * such that one half of its nodes are all "small" and
8044 * the other half of its nodes are "large." If the new
8045 * item is also "large" and falls on the half with
8046 * "large" nodes, it also may not fit.
8048 * As a final tweak, if the new item goes on the last
8049 * spot on the page (and thus, onto the new page), bias
8050 * the split so the new page is emptier than the old page.
8051 * This yields better packing during sequential inserts.
8053 if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
8054 /* Find split point */
8056 if (newindx <= split_indx || newindx >= nkeys) {
8058 k = newindx >= nkeys ? nkeys : split_indx+2;
/* Walk candidate split points, accumulating node sizes until the
 * half-page capacity pmax would be exceeded. */
8063 for (; i!=k; i+=j) {
8068 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
8069 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
8071 if (F_ISSET(node->mn_flags, F_BIGDATA))
8072 psize += sizeof(pgno_t);
8074 psize += NODEDSZ(node);
8076 psize = EVEN(psize);
8078 if (psize > pmax || i == k-j) {
8079 split_indx = i + (j<0);
8084 if (split_indx == newindx) {
8085 sepkey.mv_size = newkey->mv_size;
8086 sepkey.mv_data = newkey->mv_data;
8088 node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
8089 sepkey.mv_size = node->mn_ksize;
8090 sepkey.mv_data = NODEKEY(node);
8095 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
8097 /* Copy separator key to the parent.
/* If the parent cannot hold the separator, recursively split it. */
8099 if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
8103 rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);
/* Parent split grew the stack (root split); realign mc's levels. */
8108 if (mn.mc_snum == mc->mc_snum) {
8109 mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top];
8110 mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top];
8111 mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop];
8112 mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop];
8117 /* Right page might now have changed parent.
8118 * Check if left page also changed parent.
8120 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
8121 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
8122 for (i=0; i<ptop; i++) {
8123 mc->mc_pg[i] = mn.mc_pg[i];
8124 mc->mc_ki[i] = mn.mc_ki[i];
8126 mc->mc_pg[ptop] = mn.mc_pg[ptop];
8127 if (mn.mc_ki[ptop]) {
8128 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
8130 /* find right page's left sibling */
8131 mc->mc_ki[ptop] = mn.mc_ki[ptop];
8132 mdb_cursor_sibling(mc, 0);
/* Parent had room: just add the separator pointing at the right page. */
8137 rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
8140 mc->mc_flags ^= C_SPLITTING;
8141 if (rc != MDB_SUCCESS) {
8144 if (nflags & MDB_APPEND) {
8145 mc->mc_pg[mc->mc_top] = rp;
8146 mc->mc_ki[mc->mc_top] = 0;
8147 rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
8150 for (i=0; i<mc->mc_top; i++)
8151 mc->mc_ki[i] = mn.mc_ki[i];
8152 } else if (!IS_LEAF2(mp)) {
/* Redistribute nodes from the scratch copy across mp (via `copy` as
 * the build target) and rp, inserting the new item at its slot. */
8154 mc->mc_pg[mc->mc_top] = rp;
8159 rkey.mv_data = newkey->mv_data;
8160 rkey.mv_size = newkey->mv_size;
8166 /* Update index for the new key. */
8167 mc->mc_ki[mc->mc_top] = j;
8169 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
8170 rkey.mv_data = NODEKEY(node);
8171 rkey.mv_size = node->mn_ksize;
8173 xdata.mv_data = NODEDATA(node);
8174 xdata.mv_size = NODEDSZ(node);
8177 pgno = NODEPGNO(node);
8178 flags = node->mn_flags;
8181 if (!IS_LEAF(mp) && j == 0) {
8182 /* First branch index doesn't need key data. */
8186 rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
8192 mc->mc_pg[mc->mc_top] = copy;
8197 } while (i != split_indx);
/* Install the rebuilt left half back into mp from the scratch copy. */
8199 nkeys = NUMKEYS(copy);
8200 for (i=0; i<nkeys; i++)
8201 mp->mp_ptrs[i] = copy->mp_ptrs[i];
8202 mp->mp_lower = copy->mp_lower;
8203 mp->mp_upper = copy->mp_upper;
8204 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
8205 env->me_psize - copy->mp_upper - PAGEBASE);
8207 /* reset back to original page */
8208 if (newindx < split_indx) {
8209 mc->mc_pg[mc->mc_top] = mp;
8210 if (nflags & MDB_RESERVE) {
/* MDB_RESERVE: hand the caller a pointer into the page to fill later. */
8211 node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
8212 if (!(node->mn_flags & F_BIGDATA))
8213 newdata->mv_data = NODEDATA(node);
8216 mc->mc_pg[mc->mc_top] = rp;
8218 /* Make sure mc_ki is still valid.
8220 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
8221 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
8222 for (i=0; i<=ptop; i++) {
8223 mc->mc_pg[i] = mn.mc_pg[i];
8224 mc->mc_ki[i] = mn.mc_ki[i];
8231 /* Adjust other cursors pointing to mp */
8232 MDB_cursor *m2, *m3;
8233 MDB_dbi dbi = mc->mc_dbi;
8234 int fixup = NUMKEYS(mp);
8236 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8237 if (mc->mc_flags & C_SUB)
8238 m3 = &m2->mc_xcursor->mx_cursor;
8243 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
8245 if (m3->mc_flags & C_SPLITTING)
/* Root split: push every tracked cursor's stack down one level. */
8250 for (k=m3->mc_top; k>=0; k--) {
8251 m3->mc_ki[k+1] = m3->mc_ki[k];
8252 m3->mc_pg[k+1] = m3->mc_pg[k];
8254 if (m3->mc_ki[0] >= split_indx) {
8259 m3->mc_pg[0] = mc->mc_pg[0];
8263 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
8264 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
8265 m3->mc_ki[mc->mc_top]++;
8266 if (m3->mc_ki[mc->mc_top] >= fixup) {
/* Cursor's item migrated to the right page: re-point and re-index. */
8267 m3->mc_pg[mc->mc_top] = rp;
8268 m3->mc_ki[mc->mc_top] -= fixup;
8269 m3->mc_ki[ptop] = mn.mc_ki[ptop];
8271 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
8272 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
8277 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
8280 if (copy) /* tmp page */
8281 mdb_page_free(env, copy);
8283 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
/* Public put entry point: validate handle and flags, then perform the
 * store through a stack cursor.  NOTE(review): some original lines
 * (txn-state checks, braces) are elided in this excerpt. */
8288 mdb_put(MDB_txn *txn, MDB_dbi dbi,
8289 MDB_val *key, MDB_val *data, unsigned int flags)
8294 if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
/* Reject any flag bits outside the documented mdb_put() set. */
8297 if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
8300 mdb_cursor_init(&mc, txn, dbi, &mx);
8301 return mdb_cursor_put(&mc, key, data, flags);
/* Size of each of the two write buffers used by the compacting copy. */
8305 #define MDB_WBUF (1024*1024)
8308 /** State needed for a compacting copy. */
/* Shared between the walking thread and the writer thread; the mutex and
 * condition variable coordinate the double-buffer handoff.
 * NOTE(review): several struct members are elided in this excerpt
 * (the visible uses include mc_wbuf[], mc_wlen[], mc_over[], mc_olen[],
 * mc_fd, mc_toggle, mc_status, mc_txn, mc_env). */
8309 typedef struct mdb_copy {
8310 pthread_mutex_t mc_mutex;
8311 pthread_cond_t mc_cond;
8318 pgno_t mc_next_pgno;
/* Handoff flag: 1 = buffer ready, negative = shut down (see copythr). */
8321 volatile int mc_new;
8326 /** Dedicated writer thread for compacting copy. */
/* Loops under my->mc_mutex: waits for a filled buffer, writes it to
 * my->mc_fd, then signals the producer.  A negative mc_new value is the
 * shutdown request.  NOTE(review): some original lines (loop headers,
 * error handling, braces) are elided in this excerpt. */
8327 static THREAD_RET ESECT
8328 mdb_env_copythr(void *arg)
8332 int toggle = 0, wsize, rc;
8335 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8338 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8341 pthread_mutex_lock(&my->mc_mutex);
8343 pthread_cond_signal(&my->mc_cond);
8346 pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8347 if (my->mc_new < 0) {
8352 wsize = my->mc_wlen[toggle];
8353 ptr = my->mc_wbuf[toggle];
8356 DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
/* len > 0: partial write — presumably the loop retries the remainder;
 * the retry lines are elided here. TODO confirm against full source. */
8360 } else if (len > 0) {
8374 /* If there's an overflow page tail, write it too */
8375 if (my->mc_olen[toggle]) {
8376 wsize = my->mc_olen[toggle];
8377 ptr = my->mc_over[toggle];
8378 my->mc_olen[toggle] = 0;
8381 my->mc_wlen[toggle] = 0;
8383 pthread_cond_signal(&my->mc_cond);
8385 pthread_cond_signal(&my->mc_cond);
8386 pthread_mutex_unlock(&my->mc_mutex);
8387 return (THREAD_RET)0;
8391 /** Tell the writer thread there's a buffer ready to write */
/* Flip to the other buffer after waiting for the writer to finish the
 * previous one; returns the writer's sticky error status if it failed.
 * NOTE(review): some lines (the mc_new assignment, return) are elided. */
8393 mdb_env_cthr_toggle(mdb_copy *my, int st)
8395 int toggle = my->mc_toggle ^ 1;
8396 pthread_mutex_lock(&my->mc_mutex);
8397 if (my->mc_status) {
8398 pthread_mutex_unlock(&my->mc_mutex);
8399 return my->mc_status;
/* Wait until the writer has consumed the pending buffer (mc_new == 1). */
8401 while (my->mc_new == 1)
8402 pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8404 my->mc_toggle = toggle;
8405 pthread_cond_signal(&my->mc_cond);
8406 pthread_mutex_unlock(&my->mc_mutex);
8410 /** Depth-first tree traversal for compacting copy. */
/* Walks the B-tree rooted at *pg, streaming pages into the double
 * buffers with renumbered page numbers (my->mc_next_pgno) and recursing
 * into sub-databases and overflow pages.  NOTE(review): many original
 * lines (error checks, braces, declarations) are elided here. */
8412 mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
8415 MDB_txn *txn = my->mc_txn;
8417 MDB_page *mo, *mp, *leaf;
8422 /* Empty DB, nothing to do */
8423 if (*pg == P_INVALID)
8430 rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL);
8433 rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
8437 /* Make cursor pages writable */
/* One page-sized slot per stack level, plus one extra slot for `leaf`. */
8438 buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
8442 for (i=0; i<mc.mc_top; i++) {
8443 mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
8444 mc.mc_pg[i] = (MDB_page *)ptr;
8445 ptr += my->mc_env->me_psize;
8448 /* This is writable space for a leaf page. Usually not needed. */
8449 leaf = (MDB_page *)ptr;
8451 toggle = my->mc_toggle;
8452 while (mc.mc_snum > 0) {
8454 mp = mc.mc_pg[mc.mc_top];
8458 if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
8459 for (i=0; i<n; i++) {
8460 ni = NODEPTR(mp, i);
8461 if (ni->mn_flags & F_BIGDATA) {
8465 /* Need writable leaf */
/* Copy-on-write the leaf so its overflow pgno can be rewritten. */
8467 mc.mc_pg[mc.mc_top] = leaf;
8468 mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8470 ni = NODEPTR(mp, i);
8473 memcpy(&pg, NODEDATA(ni), sizeof(pg));
8474 rc = mdb_page_get(txn, pg, &omp, NULL);
8477 if (my->mc_wlen[toggle] >= MDB_WBUF) {
8478 rc = mdb_env_cthr_toggle(my, 1);
8481 toggle = my->mc_toggle;
8483 mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8484 memcpy(mo, omp, my->mc_env->me_psize);
8485 mo->mp_pgno = my->mc_next_pgno;
8486 my->mc_next_pgno += omp->mp_pages;
8487 my->mc_wlen[toggle] += my->mc_env->me_psize;
8488 if (omp->mp_pages > 1) {
/* Multi-page overflow: hand the tail to the writer as-is. */
8489 my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
8490 my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
8491 rc = mdb_env_cthr_toggle(my, 1);
8494 toggle = my->mc_toggle;
/* Patch the node's data to the renumbered overflow page. */
8496 memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t));
8497 } else if (ni->mn_flags & F_SUBDATA) {
8500 /* Need writable leaf */
8502 mc.mc_pg[mc.mc_top] = leaf;
8503 mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8505 ni = NODEPTR(mp, i);
/* Recurse into the sub-DB, then write back its renumbered root. */
8508 memcpy(&db, NODEDATA(ni), sizeof(db));
8509 my->mc_toggle = toggle;
8510 rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
8513 toggle = my->mc_toggle;
8514 memcpy(NODEDATA(ni), &db, sizeof(db));
8519 mc.mc_ki[mc.mc_top]++;
8520 if (mc.mc_ki[mc.mc_top] < n) {
8523 ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
8525 rc = mdb_page_get(txn, pg, &mp, NULL);
8530 mc.mc_ki[mc.mc_top] = 0;
8531 if (IS_BRANCH(mp)) {
8532 /* Whenever we advance to a sibling branch page,
8533 * we must proceed all the way down to its first leaf.
8535 mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
8538 mc.mc_pg[mc.mc_top] = mp;
/* Current page fully processed: emit it with its new page number. */
8542 if (my->mc_wlen[toggle] >= MDB_WBUF) {
8543 rc = mdb_env_cthr_toggle(my, 1);
8546 toggle = my->mc_toggle;
8548 mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8549 mdb_page_copy(mo, mp, my->mc_env->me_psize);
8550 mo->mp_pgno = my->mc_next_pgno++;
8551 my->mc_wlen[toggle] += my->mc_env->me_psize;
8553 /* Update parent if there is one */
8554 ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
8555 SETPGNO(ni, mo->mp_pgno);
8556 mdb_cursor_pop(&mc);
8558 /* Otherwise we're done */
8568 /** Copy environment with compaction. */
/* Builds fresh meta pages, spawns a writer thread, and walks the main DB
 * with mdb_env_cwalk() so the output file contains no free pages.
 * NOTE(review): many lines (declarations, error paths, cleanup labels)
 * are elided in this excerpt; Win32 and pthreads variants interleave. */
8570 mdb_env_copyfd1(MDB_env *env, HANDLE fd)
8575 MDB_txn *txn = NULL;
8580 my.mc_mutex = CreateMutex(NULL, FALSE, NULL);
8581 my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL);
8582 my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
8583 if (my.mc_wbuf[0] == NULL)
8586 pthread_mutex_init(&my.mc_mutex, NULL);
8587 pthread_cond_init(&my.mc_cond, NULL);
8588 #ifdef HAVE_MEMALIGN
8589 my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
8590 if (my.mc_wbuf[0] == NULL)
8593 rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2);
8598 memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
/* One allocation split into the two MDB_WBUF-sized halves. */
8599 my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
/* Pages 0 and 1 are the meta pages; data starts at page 2. */
8604 my.mc_next_pgno = 2;
8610 THREAD_CREATE(thr, mdb_env_copythr, &my);
8612 rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
/* Construct both meta pages at the start of buffer 0. */
8616 mp = (MDB_page *)my.mc_wbuf[0];
8617 memset(mp, 0, 2*env->me_psize);
8619 mp->mp_flags = P_META;
8620 mm = (MDB_meta *)METADATA(mp);
8621 mdb_env_init_meta0(env, mm);
8622 mm->mm_address = env->me_metas[0]->mm_address;
8624 mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
8626 mp->mp_flags = P_META;
8627 *(MDB_meta *)METADATA(mp) = *mm;
8628 mm = (MDB_meta *)METADATA(mp);
8630 /* Count the number of free pages, subtract from lastpg to find
8631 * number of active pages
8634 MDB_ID freecount = 0;
8637 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
8638 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
8639 freecount += *(MDB_ID *)data.mv_data;
/* FreeDB's own pages won't be copied either; count them as free too. */
8640 freecount += txn->mt_dbs[0].md_branch_pages +
8641 txn->mt_dbs[0].md_leaf_pages +
8642 txn->mt_dbs[0].md_overflow_pages;
8644 /* Set metapage 1 */
8645 mm->mm_last_pg = txn->mt_next_pgno - freecount - 1;
8646 mm->mm_dbs[1] = txn->mt_dbs[1];
8647 if (mm->mm_last_pg > 1) {
/* In the compacted file the main DB root is the last page written. */
8648 mm->mm_dbs[1].md_root = mm->mm_last_pg;
8651 mm->mm_dbs[1].md_root = P_INVALID;
8654 my.mc_wlen[0] = env->me_psize * 2;
8656 pthread_mutex_lock(&my.mc_mutex);
8658 pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8660 pthread_mutex_unlock(&my.mc_mutex);
8660 rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0);
8661 if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle])
8662 rc = mdb_env_cthr_toggle(&my, 1);
/* -1 tells the writer thread to flush and exit. */
8663 mdb_env_cthr_toggle(&my, -1);
8664 pthread_mutex_lock(&my.mc_mutex);
8666 pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8667 pthread_mutex_unlock(&my.mc_mutex);
8672 CloseHandle(my.mc_cond);
8673 CloseHandle(my.mc_mutex);
8674 _aligned_free(my.mc_wbuf[0]);
8676 pthread_cond_destroy(&my.mc_cond);
8677 pthread_mutex_destroy(&my.mc_mutex);
8678 free(my.mc_wbuf[0]);
8683 /** Copy environment as-is. */
/* Straight byte copy of the environment file: snapshot the meta pages
 * under the writer mutex, then stream the remaining data pages.
 * NOTE(review): many lines (write loops, error paths) are elided. */
8685 mdb_env_copyfd0(MDB_env *env, HANDLE fd)
8687 MDB_txn *txn = NULL;
8693 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8697 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8700 /* Do the lock/unlock of the reader mutex before starting the
8701 * write txn. Otherwise other read txns could block writers.
8703 rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
8708 /* We must start the actual read txn after blocking writers */
8709 mdb_txn_reset0(txn, "reset-stage1");
8711 /* Temporarily block writers until we snapshot the meta pages */
8714 rc = mdb_txn_renew0(txn);
8716 UNLOCK_MUTEX_W(env);
/* First write: the two meta pages. */
8721 wsize = env->me_psize * 2;
8725 DO_WRITE(rc, fd, ptr, w2, len);
8729 } else if (len > 0) {
8735 /* Non-blocking or async handles are not supported */
8741 UNLOCK_MUTEX_W(env);
/* Then copy all used data pages up to the txn's next_pgno. */
8746 w2 = txn->mt_next_pgno * env->me_psize;
8749 if ((rc = mdb_fsize(env->me_fd, &fsize)))
8756 if (wsize > MAX_WRITE)
8760 DO_WRITE(rc, fd, ptr, w2, len);
8764 } else if (len > 0) {
/* Dispatch a copy-to-fd request: MDB_CP_COMPACT selects the compacting
 * variant, otherwise a plain byte copy is performed. */
8781 mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
8783 if (flags & MDB_CP_COMPACT)
8784 return mdb_env_copyfd1(env, fd);
8786 return mdb_env_copyfd0(env, fd);
/* Legacy entry point: copy with no flags (non-compacting). */
8790 mdb_env_copyfd(MDB_env *env, HANDLE fd)
8792 return mdb_env_copyfd2(env, fd, 0);
/* Copy the environment to a new file at `path`: build the target file
 * name (appending DATANAME unless MDB_NOSUBDIR), create it exclusively
 * with caching disabled, and delegate to mdb_env_copyfd2().
 * NOTE(review): some lines (declarations, braces, returns) are elided. */
8796 mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
8800 HANDLE newfd = INVALID_HANDLE_VALUE;
8802 if (env->me_flags & MDB_NOSUBDIR) {
8803 lpath = (char *)path;
8806 len += sizeof(DATANAME);
8807 lpath = malloc(len);
8810 sprintf(lpath, "%s" DATANAME, path);
8813 /* The destination path must exist, but the destination file must not.
8814 * We don't want the OS to cache the writes, since the source data is
8815 * already in the OS cache.
8818 newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
8819 FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
/* O_EXCL guarantees we never clobber an existing database file. */
8821 newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
8823 if (newfd == INVALID_HANDLE_VALUE) {
/* O_DIRECT requires page-aligned writes, hence the psize check. */
8828 if (env->me_psize >= env->me_os_psize) {
8830 /* Set O_DIRECT if the file system supports it */
8831 if ((rc = fcntl(newfd, F_GETFL)) != -1)
8832 (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
8834 #ifdef F_NOCACHE /* __APPLE__ */
8835 rc = fcntl(newfd, F_NOCACHE, 1);
8843 rc = mdb_env_copyfd2(env, newfd, flags);
8846 if (!(env->me_flags & MDB_NOSUBDIR))
/* Preserve the copy's result code unless close() itself fails. */
8848 if (newfd != INVALID_HANDLE_VALUE)
8849 if (close(newfd) < 0 && rc == MDB_SUCCESS)
/* Legacy entry point: copy to a path with no flags (non-compacting). */
8856 mdb_env_copy(MDB_env *env, const char *path)
8858 return mdb_env_copy2(env, path, 0);
/* Set or clear runtime-changeable environment flags; flags outside the
 * CHANGEABLE mask are rejected.  NOTE(review): the onoff branch
 * structure and return are elided in this excerpt. */
8862 mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
8864 if ((flag & CHANGEABLE) != flag)
8867 env->me_flags |= flag;
8869 env->me_flags &= ~flag;
/* Return the environment's current flag word through *arg. */
8874 mdb_env_get_flags(MDB_env *env, unsigned int *arg)
8879 *arg = env->me_flags;
/* Attach an opaque application context pointer to the environment. */
8884 mdb_env_set_userctx(MDB_env *env, void *ctx)
8888 env->me_userctx = ctx;
/* Retrieve the application context pointer; NULL-safe on env. */
8893 mdb_env_get_userctx(MDB_env *env)
8895 return env ? env->me_userctx : NULL;
/* Install a user callback invoked on internal assertion failures. */
8899 mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
8904 env->me_assert_func = func;
/* Return the path the environment was opened with through *arg. */
8910 mdb_env_get_path(MDB_env *env, const char **arg)
8915 *arg = env->me_path;
/* Return the environment's main file descriptor through *arg.
 * NOTE(review): the function body is elided in this excerpt. */
8920 mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
8929 /** Common code for #mdb_stat() and #mdb_env_stat().
8930 * @param[in] env the environment to operate in.
8931 * @param[in] db the #MDB_db record containing the stats to return.
8932 * @param[out] arg the address of an #MDB_stat structure to receive the stats.
8933 * @return 0, this function always succeeds.
/* Plain field copy from the MDB_db record into the caller's MDB_stat. */
8936 mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
8938 arg->ms_psize = env->me_psize;
8939 arg->ms_depth = db->md_depth;
8940 arg->ms_branch_pages = db->md_branch_pages;
8941 arg->ms_leaf_pages = db->md_leaf_pages;
8942 arg->ms_overflow_pages = db->md_overflow_pages;
8943 arg->ms_entries = db->md_entries;
/* Report stats for the main DB from the most recent meta page. */
8949 mdb_env_stat(MDB_env *env, MDB_stat *arg)
8953 if (env == NULL || arg == NULL)
/* Pick whichever of the two meta pages holds the newest txn. */
8956 toggle = mdb_env_pick_meta(env);
8958 return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
/* Fill *arg with environment-level info taken from the newest meta page
 * and the env handle itself. */
8962 mdb_env_info(MDB_env *env, MDB_envinfo *arg)
8966 if (env == NULL || arg == NULL)
8969 toggle = mdb_env_pick_meta(env);
8970 arg->me_mapaddr = env->me_metas[toggle]->mm_address;
8971 arg->me_mapsize = env->me_mapsize;
8972 arg->me_maxreaders = env->me_maxreaders;
8974 /* me_numreaders may be zero if this process never used any readers. Use
8975 * the shared numreader count if it exists.
8977 arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders;
8979 arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg;
8980 arg->me_last_txnid = env->me_metas[toggle]->mm_txnid;
8984 /** Set the default comparison functions for a database.
8985 * Called immediately after a database is opened to set the defaults.
8986 * The user can then override them with #mdb_set_compare() or
8987 * #mdb_set_dupsort().
8988 * @param[in] txn A transaction handle returned by #mdb_txn_begin()
8989 * @param[in] dbi A database handle returned by #mdb_dbi_open()
8992 mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
8994 uint16_t f = txn->mt_dbs[dbi].md_flags;
/* Key comparator: reverse byte order, native integer, or plain memcmp-style,
 * driven by the DB's persistent flags. */
8996 txn->mt_dbxs[dbi].md_cmp =
8997 (f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
8998 (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn;
/* Duplicate-data comparator: only set for MDB_DUPSORT DBs. Integer dups
 * use the full-word compare when fixed-size, the generic int compare
 * otherwise; non-integer dups mirror the key comparator choice. */
9000 txn->mt_dbxs[dbi].md_dcmp =
9001 !(f & MDB_DUPSORT) ? 0 :
9002 ((f & MDB_INTEGERDUP)
9003 ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint)
9004 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
/* Open (or create, with MDB_CREATE) a named database within txn,
 * returning its handle in *dbi. A named DB is stored as an F_SUBDATA
 * record in the main DB keyed by its name.
 * NOTE(review): this extract is missing many lines (returns, declarations,
 * closing braces); comments below describe only what is visible. */
9007 int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
9013 int rc, dbflag, exact;
9014 unsigned int unused = 0, seq;
/* Lazily install default comparators for the free-list DB. */
9017 if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
9018 mdb_default_cmp(txn, FREE_DBI);
/* Reject unknown flag bits and transactions already in an error state. */
9021 if ((flags & VALID_FLAGS) != flags)
9023 if (txn->mt_flags & MDB_TXN_ERROR)
9029 if (flags & PERSISTENT_FLAGS) {
9030 uint16_t f2 = flags & PERSISTENT_FLAGS;
9031 /* make sure flag changes get committed */
9032 if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
9033 txn->mt_dbs[MAIN_DBI].md_flags |= f2;
9034 txn->mt_flags |= MDB_TXN_DIRTY;
9037 mdb_default_cmp(txn, MAIN_DBI);
9041 if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
9042 mdb_default_cmp(txn, MAIN_DBI);
9045 /* Is the DB already open? */
/* Scan existing handle slots (0/1 are FREE_DBI/MAIN_DBI): remember the
 * first vacated slot for reuse and match on the stored name. */
9047 for (i=2; i<txn->mt_numdbs; i++) {
9048 if (!txn->mt_dbxs[i].md_name.mv_size) {
9049 /* Remember this free slot */
9050 if (!unused) unused = i;
9053 if (len == txn->mt_dbxs[i].md_name.mv_size &&
9054 !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
9060 /* If no free slot and max hit, fail */
9061 if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
9062 return MDB_DBS_FULL;
9064 /* Cannot mix named databases with some mainDB flags */
9065 if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))
9066 return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;
9068 /* Find the DB info */
9069 dbflag = DB_NEW|DB_VALID;
/* Look the name up in the main DB with a cursor. */
9072 key.mv_data = (void *)name;
9073 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
9074 rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
9075 if (rc == MDB_SUCCESS) {
9076 /* make sure this is actually a DB */
9077 MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
9078 if (!(node->mn_flags & F_SUBDATA))
9079 return MDB_INCOMPATIBLE;
9080 } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
9081 /* Create if requested */
/* Write a fresh, empty MDB_db record (root = P_INVALID) under the name. */
9082 data.mv_size = sizeof(MDB_db);
9083 data.mv_data = &dummy;
9084 memset(&dummy, 0, sizeof(dummy));
9085 dummy.md_root = P_INVALID;
9086 dummy.md_flags = flags & PERSISTENT_FLAGS;
9087 rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
9091 /* OK, got info, add to table */
/* Populate the handle slot: duplicated name, per-slot sequence number
 * (bumped so stale handles can be detected), and the on-disk DB record. */
9092 if (rc == MDB_SUCCESS) {
9093 unsigned int slot = unused ? unused : txn->mt_numdbs;
9094 txn->mt_dbxs[slot].md_name.mv_data = strdup(name);
9095 txn->mt_dbxs[slot].md_name.mv_size = len;
9096 txn->mt_dbxs[slot].md_rel = NULL;
9097 txn->mt_dbflags[slot] = dbflag;
9098 /* txn-> and env-> are the same in read txns, use
9099 * tmp variable to avoid undefined assignment
9101 seq = ++txn->mt_env->me_dbiseqs[slot];
9102 txn->mt_dbiseqs[slot] = seq;
9104 memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
9106 mdb_default_cmp(txn, slot);
/* Public entry: report statistics for one DB handle within txn.
 * A stale handle is refreshed by cursor initialization before reporting. */
9115 int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
9117 if (!arg || !TXN_DBI_EXIST(txn, dbi))
9120 if (txn->mt_flags & MDB_TXN_ERROR)
9123 if (txn->mt_dbflags[dbi] & DB_STALE) {
9126 /* Stale, must read the DB's root. cursor_init does it for us. */
9127 mdb_cursor_init(&mc, txn, dbi, &mx);
9129 return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
/* Release a named-DB handle slot: free its name, clear its flags, and
 * bump the slot's sequence number so open transactions can detect that
 * the handle changed. Reserved handles (<= MAIN_DBI) and out-of-range
 * handles are ignored. */
9132 void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
9135 if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs)
9137 ptr = env->me_dbxs[dbi].md_name.mv_data;
9138 /* If there was no name, this was already closed */
9140 env->me_dbxs[dbi].md_name.mv_data = NULL;
9141 env->me_dbxs[dbi].md_name.mv_size = 0;
9142 env->me_dbflags[dbi] = 0;
9143 env->me_dbiseqs[dbi]++;
/* Return (via *flags) the persistent flag bits of the given DB handle. */
9148 int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
9150 /* We could return the flags for the FREE_DBI too but what's the point? */
9151 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9153 *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
9157 /** Add all the DB's pages to the free list.
9158 * @param[in] mc Cursor on the DB to free.
9159 * @param[in] subs non-Zero to check for sub-DBs in this DB.
9160 * @return 0 on success, non-zero on failure.
/* Walk the DB's tree and add every page it owns to txn->mt_free_pgs:
 * overflow chains behind F_BIGDATA nodes, sub-DB trees (when subs is
 * non-zero, via recursion), and the tree's own branch/leaf pages.
 * NOTE(review): several lines (declarations, returns, brace closures)
 * are missing from this extract; comments describe only what is visible. */
9163 mdb_drop0(MDB_cursor *mc, int subs)
/* Position the cursor on the leftmost leaf. */
9167 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
9168 if (rc == MDB_SUCCESS) {
9169 MDB_txn *txn = mc->mc_txn;
9174 /* LEAF2 pages have no nodes, cannot have sub-DBs */
9175 if (IS_LEAF2(mc->mc_pg[mc->mc_top]))
/* Keep a copy of the initial (leftmost) path so we can restart each level. */
9178 mdb_cursor_copy(mc, &mx)
9179 while (mc->mc_snum > 0) {
9180 MDB_page *mp = mc->mc_pg[mc->mc_top];
9181 unsigned n = NUMKEYS(mp);
/* First pass over the page's nodes: free overflow chains and sub-DBs. */
9183 for (i=0; i<n; i++) {
9184 ni = NODEPTR(mp, i);
9185 if (ni->mn_flags & F_BIGDATA) {
9188 memcpy(&pg, NODEDATA(ni), sizeof(pg));
9189 rc = mdb_page_get(txn, pg, &omp, NULL);
9192 mdb_cassert(mc, IS_OVERFLOW(omp));
9193 rc = mdb_midl_append_range(&txn->mt_free_pgs,
9197 } else if (subs && (ni->mn_flags & F_SUBDATA)) {
9198 mdb_xcursor_init1(mc, ni);
/* Recurse into the sub-DB; subs=0 stops nested recursion. */
9199 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
/* Second pass: reserve space, then append the child page numbers. */
9205 if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)
9207 for (i=0; i<n; i++) {
9209 ni = NODEPTR(mp, i);
9212 mdb_midl_xappend(txn->mt_free_pgs, pg);
/* Advance to the right sibling at this level. */
9217 mc->mc_ki[mc->mc_top] = i;
9218 rc = mdb_cursor_sibling(mc, 1);
9220 if (rc != MDB_NOTFOUND)
9222 /* no more siblings, go back to beginning
9223 * of previous level.
/* Restore the saved leftmost path for the level above. */
9227 for (i=1; i<mc->mc_snum; i++) {
9229 mc->mc_pg[i] = mx.mc_pg[i];
/* Finally free the root page itself. */
9234 rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
/* On failure, poison the transaction; MDB_NOTFOUND (empty DB) is benign. */
9237 txn->mt_flags |= MDB_TXN_ERROR;
9238 } else if (rc == MDB_NOTFOUND) {
/* Empty a database (del == 0) or delete it from the environment
 * (del == 1). Requires a write transaction and a valid, unchanged
 * handle. NOTE(review): some lines (returns, braces) are missing from
 * this extract. */
9244 int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
9246 MDB_cursor *mc, *m2;
/* Validate del (only 0 or 1), the handle, write access, and that the
 * handle was not reopened since this txn started. */
9249 if ((unsigned)del > 1 || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9252 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
9255 if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi))
9258 rc = mdb_cursor_open(txn, dbi, &mc);
/* Free all of this DB's pages; include sub-DBs if it is DUPSORT. */
9262 rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
9263 /* Invalidate the dropped DB's cursors */
9264 for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
9265 m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
9269 /* Can't delete the main DB */
9270 if (del && dbi > MAIN_DBI) {
/* Remove the DB's record from the main DB and retire the handle. */
9271 rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, 0);
9273 txn->mt_dbflags[dbi] = DB_STALE;
9274 mdb_dbi_close(txn->mt_env, dbi);
9276 txn->mt_flags |= MDB_TXN_ERROR;
9279 /* reset the DB record, mark it dirty */
9280 txn->mt_dbflags[dbi] |= DB_DIRTY;
9281 txn->mt_dbs[dbi].md_depth = 0;
9282 txn->mt_dbs[dbi].md_branch_pages = 0;
9283 txn->mt_dbs[dbi].md_leaf_pages = 0;
9284 txn->mt_dbs[dbi].md_overflow_pages = 0;
9285 txn->mt_dbs[dbi].md_entries = 0;
9286 txn->mt_dbs[dbi].md_root = P_INVALID;
9288 txn->mt_flags |= MDB_TXN_DIRTY;
9291 mdb_cursor_close(mc);
/* Override the key comparison function for a DB handle. */
9295 int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
9297 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9300 txn->mt_dbxs[dbi].md_cmp = cmp;
/* Override the duplicate-data comparison function for a DB handle. */
9304 int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
9306 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9309 txn->mt_dbxs[dbi].md_dcmp = cmp;
/* Set the relocation function for a DB handle. */
9313 int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
9315 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9318 txn->mt_dbxs[dbi].md_rel = rel;
/* Set the context pointer passed to the relocation function. */
9322 int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
9324 if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi))
9327 txn->mt_dbxs[dbi].md_relctx = ctx;
/* Return the maximum key size supported by this environment. */
9332 mdb_env_get_maxkeysize(MDB_env *env)
9334 return ENV_MAXKEY(env);
/* Dump the reader lock table, one formatted line per active slot, by
 * invoking func(line, ctx). Emits a header before the first entry and
 * placeholder messages when there is no lock region or no active reader.
 * NOTE(review): loop braces and some control lines are missing from
 * this extract. */
9338 mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
9340 unsigned int i, rdrs;
9343 int rc = 0, first = 1;
9347 if (!env->me_txns) {
9348 return func("(no reader locks)\n", ctx);
9350 rdrs = env->me_txns->mti_numreaders;
9351 mr = env->me_txns->mti_readers;
9352 for (i=0; i<rdrs; i++) {
9354 txnid_t txnid = mr[i].mr_txnid;
/* txnid == (txnid_t)-1 means the slot holds no open snapshot: print "-". */
9355 sprintf(buf, txnid == (txnid_t)-1 ?
9356 "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n",
9357 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid);
9360 rc = func("    pid     thread     txnid\n", ctx);
9364 rc = func(buf, ctx);
9370 rc = func("(no active readers)\n", ctx);
9375 /** Insert pid into list if not already present.
9376 * return -1 if already present.
/* Insert pid into the sorted list ids (ids[0] holds the count), keeping
 * it sorted; duplicates are rejected (header above says return -1).
 * NOTE(review): the branch bodies and return statements of the search
 * are missing from this extract. */
9379 mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
9381 /* binary search of pid in list */
9383 unsigned cursor = 1;
9385 unsigned n = ids[0];
9388 unsigned pivot = n >> 1;
9389 cursor = base + pivot + 1;
9390 val = pid - ids[cursor];
9395 } else if ( val > 0 ) {
9400 /* found, so it's a duplicate */
/* Not found: shift larger entries up to make room at the insert point. */
9409 for (n = ids[0]; n > cursor; n--)
9416 mdb_reader_check(MDB_env *env, int *dead)
9418 unsigned int i, j, rdrs;
9420 MDB_PID_T *pids, pid;
9429 rdrs = env->me_txns->mti_numreaders;
9430 pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
9434 mr = env->me_txns->mti_readers;
9435 for (i=0; i<rdrs; i++) {
9436 if (mr[i].mr_pid && mr[i].mr_pid != env->me_pid) {
9438 if (mdb_pid_insert(pids, pid) == 0) {
9439 if (!mdb_reader_pid(env, Pidcheck, pid)) {
9441 /* Recheck, a new process may have reused pid */
9442 if (!mdb_reader_pid(env, Pidcheck, pid)) {
9443 for (j=i; j<rdrs; j++)
9444 if (mr[j].mr_pid == pid) {
9445 DPRINTF(("clear stale reader pid %u txn %"Z"d",
9446 (unsigned) pid, mr[j].mr_txnid));
9451 UNLOCK_MUTEX_R(env);