* database can grow quickly. Write transactions prevent
* other write transactions, since writes are serialized.
*
- * ...when several processes can use a database concurrently:
- *
* - Avoid suspending a process with active transactions. These
- * would then be "long-lived" as above.
+ * would then be "long-lived" as above. Also, read transactions
+ * suspended when writers commit could sometimes see wrong data.
+ *
+ * ...when several processes can use a database concurrently:
*
* - Avoid aborting a process with an active transaction.
* The transaction becomes "long-lived" as above until the lockfile
* Values do not overlap Database Flags.
* @{
*/
- /** mmap at a fixed address */
+ /** mmap at a fixed address (experimental) */
#define MDB_FIXEDMAP 0x01
/** no environment directory */
#define MDB_NOSUBDIR 0x4000
#define MDB_NOMETASYNC 0x40000
/** use writable mmap */
#define MDB_WRITEMAP 0x80000
- /** use asynchronous msync */
+ /** use asynchronous msync when #MDB_WRITEMAP is used */
#define MDB_MAPASYNC 0x100000
/** @} */
* under that directory. With this option, \b path is used as-is for
* the database main data file. The database lock file is the \b path
* with "-lock" appended.
- * <li>#MDB_NOSYNC
- * Don't perform a synchronous flush after committing a transaction. This means
- * transactions will exhibit the ACI (atomicity, consistency, and isolation)
- * properties, but not D (durability); that is database integrity will be
- * maintained but it is possible some number of the most recently committed
- * transactions may be undone after a system crash. The number of transactions
- * at risk is governed by how often the system flushes dirty buffers to disk
- * and how often #mdb_env_sync() is called. This flag may be changed
- * at any time using #mdb_env_set_flags().
- * <li>#MDB_NOMETASYNC
- * Don't perform a synchronous flush of the meta page after committing
- * a transaction. This is similar to the #MDB_NOSYNC case, but safer
- * because the transaction data is still flushed. The meta page for any
- * transaction N will be flushed by the data flush of transaction N+1.
- * In case of a system crash, the last committed transaction may be
- * lost. This flag may be changed at any time using #mdb_env_set_flags().
* <li>#MDB_RDONLY
- * Open the environment in read-only mode. No write operations will be allowed.
+ * Open the environment in read-only mode. No write operations will be
+ * allowed. MDB will still modify the lock file - except on read-only
+ * filesystems, where MDB does not use locks.
+ * <li>#MDB_WRITEMAP
+ * Use a writable memory map unless #MDB_RDONLY is set. This is faster
+ * and uses fewer mallocs, but loses protection from application bugs
+ * like wild pointer writes and other bad updates into the database.
+ * Incompatible with nested transactions.
+ * <li>#MDB_NOMETASYNC
+ * Flush system buffers to disk only once per transaction; omit the
+ * metadata flush. Defer that until the system flushes files to disk,
+ * or the next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization
+ * maintains database integrity, but a system crash may undo the last
+ * committed transaction. I.e. it preserves the ACI (atomicity,
+ * consistency, isolation) but not D (durability) database property.
+ * This flag may be changed at any time using #mdb_env_set_flags().
+ * <li>#MDB_NOSYNC
+ * Don't flush system buffers to disk when committing a transaction.
+ * This optimization means a system crash can corrupt the database or
+ * lose the last transactions if buffers are not yet flushed to disk.
+ * The risk is governed by how often the system flushes dirty buffers
+ * to disk and how often #mdb_env_sync() is called. However, if the
+ * filesystem preserves write order and the #MDB_WRITEMAP flag is not
+ * used, transactions exhibit ACI (atomicity, consistency, isolation)
+ * properties and only lose D (durability). I.e. database integrity
+ * is maintained, but a system crash may undo the final transactions.
+ * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no
+ * hint for when to write transactions to disk, unless #mdb_env_sync()
+ * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable.
+ * This flag may be changed at any time using #mdb_env_set_flags().
+ * <li>#MDB_MAPASYNC
+ * When using #MDB_WRITEMAP, use asynchronous flushes to disk.
+ * As with #MDB_NOSYNC, a system crash can then corrupt the
+ * database or lose the last transactions. Calling #mdb_env_sync()
+ * ensures on-disk database integrity until the next commit.
+ * This flag may be changed at any time using #mdb_env_set_flags().
* </ul>
* @param[in] mode The UNIX permissions to set on created files. This parameter
* is ignored on Windows.
* Data is always written to disk when #mdb_txn_commit() is called,
* but the operating system may keep it buffered. MDB always flushes
* the OS buffers upon commit as well, unless the environment was
- * opened with #MDB_NOSYNC.
+ * opened with #MDB_NOSYNC or, in part, #MDB_NOMETASYNC.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] force If non-zero, force a synchronous flush. Otherwise
* if the environment has the #MDB_NOSYNC flag set the flushes
* by the given transaction. Only one thread should call this function;
* it is not mutex-protected in a read-only transaction.
* To use named databases (with name != NULL), #mdb_env_set_maxdbs()
- * must be called before opening the enviorment.
+ * must be called before opening the environment.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] name The name of the database to open. If only a single
* database is needed in the environment, this value may be NULL.
*
* This call is not mutex protected. Handles should only be closed by
* a single thread, and only if no other threads are going to reference
- * the database handle any further.
+ * the database handle or one of its cursors any further.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] dbi A database handle returned by #mdb_dbi_open()
*/
*/
#define P_INVALID (~(pgno_t)0)
- /** Test if a flag \b f is set in a flag word \b w. */
+ /** Test if the flags \b f are set in a flag word \b w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))
/** Used for offsets within a single page.
* slot's address is saved in thread-specific data so that subsequent read
* transactions started by the same thread need no further locking to proceed.
*
+ * No reader table is used if the database is on a read-only filesystem.
+ *
* Since the database uses multi-version concurrency control, readers don't
* actually need any locking. This table is used to keep track of which
* readers are using data from which old transactions, so that we'll know
*/
MDB_IDL mt_free_pgs;
union {
- MDB_ID2L dirty_list; /**< modified pages */
- MDB_reader *reader; /**< this thread's slot in the reader table */
+ MDB_ID2L dirty_list; /**< for write txns: modified pages */
+ MDB_reader *reader; /**< this thread's reader table slot or NULL */
} mt_u;
/** Array of records for each DB known in the environment. */
MDB_dbx *mt_dbxs;
#define DB_DIRTY 0x01 /**< DB was written in this txn */
#define DB_STALE 0x02 /**< DB record is older than txnID */
/** @} */
- /** Array of cursors for each DB */
+ /** In write txns, array of cursors for each DB */
MDB_cursor **mt_cursors;
/** Array of flags for each DB */
unsigned char *mt_dbflags;
pid_t me_pid; /**< process ID of this env */
char *me_path; /**< path to the DB files */
char *me_map; /**< the memory map of the data file */
- MDB_txninfo *me_txns; /**< the memory map of the lock file */
+ MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
MDB_txn *me_txn; /**< current write transaction */
size_t me_mapsize; /**< size of the data memory map */
txnid_t me_pgfirst; /**< ID of first old page record we used */
txnid_t me_pglast; /**< ID of last old page record we used */
MDB_dbx *me_dbxs; /**< array of static DB info */
- uint16_t *me_dbflags; /**< array of DB flags */
+ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
MDB_oldpages *me_pghead; /**< list of old page records */
MDB_oldpages *me_pgfree; /**< list of page records to free */
pthread_key_t me_txkey; /**< thread-key for readers */