#endif
#if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
+# undef MDB_DEBUG
+# define MDB_DEBUG 0
# define DPRINTF (void) /* Vararg macros may be unsupported */
#elif MDB_DEBUG
static int mdb_debug;
* slot's address is saved in thread-specific data so that subsequent read
* transactions started by the same thread need no further locking to proceed.
*
+ * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
+ *
* No reader table is used if the database is on a read-only filesystem.
*
* Since the database uses multi-version concurrency control, readers don't
pgno_t md_root; /**< the root page of this tree */
} MDB_db;
+ /** mdb_dbi_open flags */
+#define PERSISTENT_FLAGS 0x7fff
+#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
+ MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
+
/** Handle for the DB used to track free pages. */
#define FREE_DBI 0
/** Handle for the default DB. */
*/
#define DB_DIRTY 0x01 /**< DB was written in this txn */
#define DB_STALE 0x02 /**< DB record is older than txnID */
+#define DB_NEW 0x04 /**< DB handle opened in this txn */
+#define DB_VALID 0x08 /**< DB handle is valid */
+#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
/** @} */
/** In write txns, array of cursors for each DB */
MDB_cursor **mt_cursors;
HANDLE me_mfd; /**< just for writing the meta pages */
/** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
- /** Read-only Filesystem. Allow read access, no locking. */
-#define MDB_ROFS 0x40000000U
/** Some fields are initialized. */
#define MDB_ENV_ACTIVE 0x20000000U
+ /** me_txkey is set */
+#define MDB_ENV_TXKEY 0x10000000U
uint32_t me_flags; /**< @ref mdb_env */
unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
unsigned int me_maxreaders; /**< size of the reader table */
MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
/** IDL of pages that became unused in a write txn */
MDB_IDL me_free_pgs;
- /** ID2L of pages that were written during a write txn */
- MDB_ID2 me_dirty_list[MDB_IDL_UM_SIZE];
+ /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
+ MDB_ID2L me_dirty_list;
/** Max number of freelist items that can fit in a single overflow page */
unsigned int me_maxfree_1pg;
/** Max size of a node on a page */
"MDB_PAGE_FULL: Internal error - page has no more space",
"MDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
"MDB_INCOMPATIBLE: Database flags changed or would change",
+ "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
};
char *
{
MDB_env *env = txn->mt_env;
unsigned int i;
+ uint16_t x;
int rc;
/* Setup db info */
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
if (txn->mt_flags & MDB_TXN_RDONLY) {
- if (env->me_flags & MDB_ROFS) {
+ if (!env->me_txns) {
i = mdb_env_pick_meta(env);
txn->mt_txnid = env->me_metas[i]->mm_txnid;
txn->mt_u.reader = NULL;
} else {
- MDB_reader *r = pthread_getspecific(env->me_txkey);
- if (!r) {
+ MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
+ pthread_getspecific(env->me_txkey);
+ if (r) {
+ if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
+ return MDB_BAD_RSLOT;
+ } else {
pid_t pid = env->me_pid;
pthread_t tid = pthread_self();
env->me_numreaders = env->me_txns->mti_numreaders;
UNLOCK_MUTEX_R(env);
r = &env->me_txns->mti_readers[i];
- if ((rc = pthread_setspecific(env->me_txkey, r)) != 0) {
+ if (!(env->me_flags & MDB_NOTLS) &&
+ (rc = pthread_setspecific(env->me_txkey, r)) != 0) {
env->me_txns->mti_readers[i].mr_pid = 0;
return rc;
}
/* Copy the DB info and flags */
memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db));
- for (i=2; i<txn->mt_numdbs; i++)
- txn->mt_dbs[i].md_flags = env->me_dbflags[i];
- txn->mt_dbflags[0] = txn->mt_dbflags[1] = 0;
- if (txn->mt_numdbs > 2)
- memset(txn->mt_dbflags+2, DB_STALE, txn->mt_numdbs-2);
+ for (i=2; i<txn->mt_numdbs; i++) {
+ x = env->me_dbflags[i];
+ txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
+ txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0;
+ }
+ txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID;
if (env->me_maxpg < txn->mt_next_pgno) {
mdb_txn_reset0(txn);
{
int rc;
- if (! (txn && (txn->mt_flags & MDB_TXN_RDONLY)))
+ if (!txn || txn->mt_numdbs || !(txn->mt_flags & MDB_TXN_RDONLY))
return EINVAL;
if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
txn->mt_env = env;
if (parent) {
+ unsigned int i;
txn->mt_free_pgs = mdb_midl_alloc();
if (!txn->mt_free_pgs) {
free(txn);
txn->mt_numdbs = parent->mt_numdbs;
txn->mt_dbxs = parent->mt_dbxs;
memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
- memcpy(txn->mt_dbflags, parent->mt_dbflags, txn->mt_numdbs);
+ /* Copy parent's mt_dbflags, but clear DB_NEW */
+ for (i=0; i<txn->mt_numdbs; i++)
+ txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
rc = 0;
ntxn = (MDB_ntxn *)txn;
ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
}
/** Common code for #mdb_txn_reset() and #mdb_txn_abort().
+ * May be called twice for readonly txns: First reset it, then abort.
* @param[in] txn the transaction handle to reset
*/
static void
mdb_txn_reset0(MDB_txn *txn)
{
MDB_env *env = txn->mt_env;
+ unsigned int i;
+
+ /* Close any DBI handles opened in this txn */
+ for (i=2; i<txn->mt_numdbs; i++) {
+ if (txn->mt_dbflags[i] & DB_NEW) {
+ char *ptr = env->me_dbxs[i].md_name.mv_data;
+ env->me_dbxs[i].md_name.mv_data = NULL;
+ env->me_dbxs[i].md_name.mv_size = 0;
+ free(ptr);
+ }
+ }
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
- if (!(env->me_flags & MDB_ROFS))
+ if (txn->mt_u.reader) {
txn->mt_u.reader->mr_txnid = (txnid_t)-1;
+ if (!(env->me_flags & MDB_NOTLS))
+ txn->mt_u.reader = NULL; /* txn does not own reader */
+ }
+ txn->mt_numdbs = 0; /* mark txn as reset, do not close DBs again */
} else {
MDB_page *dp;
- unsigned int i;
/* close(free) all cursors */
for (i=0; i<txn->mt_numdbs; i++) {
txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
(void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
+ /* This call is only valid for read-only txns */
+ if (!(txn->mt_flags & MDB_TXN_RDONLY))
+ return;
+
mdb_txn_reset0(txn);
}
mdb_txn_abort(txn->mt_child);
mdb_txn_reset0(txn);
+ /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */
+ if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader)
+ txn->mt_u.reader->mr_pid = 0;
+
free(txn);
}
env = txn->mt_env;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
- if (txn->mt_numdbs > env->me_numdbs) {
- /* update the DB flags */
- MDB_dbi i;
- for (i = env->me_numdbs; i<txn->mt_numdbs; i++)
- env->me_dbflags[i] = txn->mt_dbs[i].md_flags;
- env->me_numdbs = i;
+ /* update the DB flags */
+ for (i = 2; i<txn->mt_numdbs; i++) {
+ if (txn->mt_dbflags[i] & DB_NEW)
+ env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
}
+ if (txn->mt_numdbs > env->me_numdbs)
+ env->me_numdbs = txn->mt_numdbs;
+ txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */
mdb_txn_abort(txn);
return MDB_SUCCESS;
}
/* Update parent's DB table. */
memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
- memcpy(parent->mt_dbflags, txn->mt_dbflags, txn->mt_numdbs);
txn->mt_parent->mt_numdbs = txn->mt_numdbs;
+ txn->mt_parent->mt_dbflags[0] = txn->mt_dbflags[0];
+ txn->mt_parent->mt_dbflags[1] = txn->mt_dbflags[1];
+ for (i=2; i<txn->mt_numdbs; i++) {
+ /* preserve parent's DB_NEW status */
+ x = txn->mt_parent->mt_dbflags[i] & DB_NEW;
+ txn->mt_parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
+ }
dst = txn->mt_parent->mt_u.dirty_list;
src = txn->mt_u.dirty_list;
done:
env->me_pglast = 0;
env->me_txn = NULL;
- if (txn->mt_numdbs > env->me_numdbs) {
- /* update the DB flags */
- MDB_dbi i;
- for (i = env->me_numdbs; i<txn->mt_numdbs; i++)
- env->me_dbflags[i] = txn->mt_dbs[i].md_flags;
- env->me_numdbs = i;
+ /* update the DB flags */
+ for (i = 2; i<txn->mt_numdbs; i++) {
+ if (txn->mt_dbflags[i] & DB_NEW)
+ env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
}
+ if (txn->mt_numdbs > env->me_numdbs)
+ env->me_numdbs = txn->mt_numdbs;
UNLOCK_MUTEX_W(env);
free(txn);
if (!e)
return ENOMEM;
- e->me_free_pgs = mdb_midl_alloc();
- if (!e->me_free_pgs) {
- free(e);
- return ENOMEM;
- }
e->me_maxreaders = DEFAULT_READERS;
- e->me_maxdbs = 2;
+ e->me_maxdbs = e->me_numdbs = 2;
e->me_fd = INVALID_HANDLE_VALUE;
e->me_lfd = INVALID_HANDLE_VALUE;
e->me_mfd = INVALID_HANDLE_VALUE;
* @param[in] lpath The pathname of the file used for the lock region.
* @param[in] mode The Unix permissions for the file, if we create it.
- * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive
+ * @param[in,out] excl Caller passes -1; on return holds the resulting file lock type: -1 none, 0 shared, 1 exclusive
* @return 0 on success, non-zero on failure.
*/
static int
mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
{
+#ifdef _WIN32
+# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
+#else
+# define MDB_ERRCODE_ROFS EROFS
+#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */
+# define MDB_CLOEXEC O_CLOEXEC
+#else
+ int fdflags;
+# define MDB_CLOEXEC 0
+#endif
+#endif
int rc;
off_t size, rsize;
- *excl = -1;
-
#ifdef _WIN32
- if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
+ env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
- FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
+ FILE_ATTRIBUTE_NORMAL, NULL);
+#else
+ env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode);
+#endif
+ if (env->me_lfd == INVALID_HANDLE_VALUE) {
rc = ErrCode();
- if (rc == ERROR_WRITE_PROTECT && (env->me_flags & MDB_RDONLY)) {
- env->me_flags |= MDB_ROFS;
+ if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
return MDB_SUCCESS;
}
goto fail_errno;
}
- /* Try to get exclusive lock. If we succeed, then
- * nobody is using the lock region and we should initialize it.
- */
- if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
- size = GetFileSize(env->me_lfd, NULL);
-
-#else
-#if !(O_CLOEXEC)
- {
- int fdflags;
- if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) {
- rc = ErrCode();
- if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
- env->me_flags |= MDB_ROFS;
- return MDB_SUCCESS;
- }
- goto fail_errno;
- }
- /* Lose record locks when exec*() */
- if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
+#if ! ((MDB_CLOEXEC) || defined(_WIN32))
+ /* Lose record locks when exec*() */
+ if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
fcntl(env->me_lfd, F_SETFD, fdflags);
- }
-#else /* O_CLOEXEC on Linux: Open file and set FD_CLOEXEC atomically */
- if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode)) == -1) {
- rc = ErrCode();
- if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
- env->me_flags |= MDB_ROFS;
- return MDB_SUCCESS;
+#endif
+
+ if (!(env->me_flags & MDB_NOTLS)) {
+ rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
+ if (rc)
+ goto fail;
+ env->me_flags |= MDB_ENV_TXKEY;
+#ifdef _WIN32
+ /* Windows TLS callbacks need help finding their TLS info. */
+ if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
+ rc = MDB_TLS_FULL;
+ goto fail;
}
- goto fail_errno;
- }
+ mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
#endif
+ }
/* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it.
*/
if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
+#ifdef _WIN32
+ size = GetFileSize(env->me_lfd, NULL);
+#else
size = lseek(env->me_lfd, 0, SEEK_END);
#endif
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
* environment and re-opening it with the new flags.
*/
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
-#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP)
+#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS)
int
-mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
+mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
{
- int oflags, rc, len, excl;
+ int oflags, rc, len, excl = -1;
char *lpath, *dpath;
if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
sprintf(dpath, "%s" DATANAME, path);
}
+ rc = MDB_SUCCESS;
flags |= env->me_flags;
- /* silently ignore WRITEMAP if we're only getting read access */
- if (F_ISSET(flags, MDB_RDONLY|MDB_WRITEMAP))
- flags ^= MDB_WRITEMAP;
+ if (flags & MDB_RDONLY) {
+ /* silently ignore WRITEMAP when we're only getting read access */
+ flags &= ~MDB_WRITEMAP;
+ } else {
+ if (!((env->me_free_pgs = mdb_midl_alloc()) &&
+ (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2)))))
+ rc = ENOMEM;
+ }
env->me_flags = flags |= MDB_ENV_ACTIVE;
+ if (rc)
+ goto leave;
+
+ env->me_path = strdup(path);
+ env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
+ env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
+ if (!(env->me_dbxs && env->me_path && env->me_dbflags)) {
+ rc = ENOMEM;
+ goto leave;
+ }
rc = mdb_env_setup_locks(env, lpath, mode, &excl);
if (rc)
}
}
DPRINTF("opened dbenv %p", (void *) env);
- rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
- if (rc)
- goto leave;
- env->me_numdbs = 2; /* this notes that me_txkey was set */
-#ifdef _WIN32
- /* Windows TLS callbacks need help finding their TLS info. */
- if (mdb_tls_nkeys < MAX_TLS_KEYS)
- mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
- else {
- rc = MDB_TLS_FULL;
- goto leave;
- }
-#endif
if (excl > 0) {
rc = mdb_env_share_locks(env, &excl);
- if (rc)
- goto leave;
}
- env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
- env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
- env->me_path = strdup(path);
- if (!env->me_dbxs || !env->me_dbflags || !env->me_path)
- rc = ENOMEM;
}
leave:
free(env->me_dbflags);
free(env->me_dbxs);
free(env->me_path);
+ free(env->me_dirty_list);
+ if (env->me_free_pgs)
+ mdb_midl_free(env->me_free_pgs);
- if (env->me_numdbs) {
+ if (env->me_flags & MDB_ENV_TXKEY) {
pthread_key_delete(env->me_txkey);
#ifdef _WIN32
/* Delete our key from the global list */
close(env->me_lfd);
}
- env->me_flags &= ~MDB_ENV_ACTIVE;
+ env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
}
int
if (rc)
goto leave;
- if (!(env->me_flags & MDB_ROFS)) {
+ if (env->me_txns) {
/* We must start the actual read txn after blocking writers */
mdb_txn_reset0(txn);
rc = write(newfd, env->me_map, wsize);
rc = (rc == (int)wsize) ? MDB_SUCCESS : ErrCode();
#endif
- if (! (env->me_flags & MDB_ROFS))
+ if (env->me_txns)
UNLOCK_MUTEX_W(env);
if (rc)
mdb_env_close(MDB_env *env)
{
MDB_page *dp;
+ int i;
if (env == NULL)
return;
+ for (i = env->me_numdbs; --i > MAIN_DBI; )
+ free(env->me_dbxs[i].md_name.mv_data);
+
VGMEMP_DESTROY(env);
while ((dp = env->me_dpages) != NULL) {
VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
}
mdb_env_close0(env, 0);
- mdb_midl_free(env->me_free_pgs);
free(env);
}
/* The txn may not know this DBI, or another process may
* have dropped and recreated the DB with other flags.
*/
- if (mc->mc_db->md_flags != flags)
+ if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
return MDB_INCOMPATIBLE;
memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
}
if (flags & MDB_PS_MODIFY)
dbflag = DB_DIRTY;
- *mc->mc_dbflag = dbflag;
+ *mc->mc_dbflag &= ~DB_STALE;
+ *mc->mc_dbflag |= dbflag;
}
}
root = mc->mc_db->md_root;
assert(data);
DPRINTF("===> get db %u key [%s]", dbi, DKEY(key));
- if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
+ if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
return rc;
mdb_cursor_push(mc, mp);
+ if (!move_right)
+ mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;
return MDB_SUCCESS;
}
}
assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
- mc->mc_flags |= C_INITIALIZED|C_EOF;
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
}
+ mc->mc_flags |= C_INITIALIZED|C_EOF;
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
case MDB_PREV:
case MDB_PREV_DUP:
case MDB_PREV_NODUP:
- if (!(mc->mc_flags & C_INITIALIZED) || (mc->mc_flags & C_EOF)) {
+ if (!(mc->mc_flags & C_INITIALIZED)) {
rc = mdb_cursor_last(mc, key, data);
mc->mc_flags |= C_INITIALIZED;
mc->mc_ki[mc->mc_top]++;
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
if (rc)
return rc;
- *mc->mc_dbflag = DB_DIRTY;
+ *mc->mc_dbflag |= DB_DIRTY;
}
for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
rc = mdb_page_touch(mc);
mdb_cursor_push(mc, np);
mc->mc_db->md_root = np->mp_pgno;
mc->mc_db->md_depth++;
- *mc->mc_dbflag = DB_DIRTY;
+ *mc->mc_dbflag |= DB_DIRTY;
if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
== MDB_DUPFIXED)
np->mp_flags |= P_LEAF2;
rc = MDB_NOTFOUND;
mc->mc_ki[mc->mc_top]++;
} else {
- rc = 0;
+ /* new key is <= last key */
+ rc = MDB_KEYEXIST;
}
}
} else {
}
DPRINTF("Sub-db %u for db %u root page %zu", mx->mx_cursor.mc_dbi, mc->mc_dbi,
mx->mx_db.md_root);
- mx->mx_dbflag = (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY)) ?
- DB_DIRTY : 0;
+ mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ?
+ DB_DIRTY : 0);
mx->mx_dbx.md_name.mv_data = NODEKEY(node);
mx->mx_dbx.md_name.mv_size = node->mn_ksize;
#if UINT_MAX < SIZE_MAX
MDB_xcursor *mx = NULL;
size_t size = sizeof(MDB_cursor);
- if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs)
+ if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
/* Allow read access to the freelist */
mdb_rebalance(MDB_cursor *mc)
{
MDB_node *node;
- int rc, minkeys;
- unsigned int ptop;
+ int rc;
+ unsigned int ptop, minkeys;
MDB_cursor mn;
+ minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top]));
#if MDB_DEBUG
{
pgno_t pgno;
}
#endif
- if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD) {
+ if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD &&
+ NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
#if MDB_DEBUG
pgno_t pgno;
COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
if (mc->mc_snum < 2) {
MDB_page *mp = mc->mc_pg[0];
+ if (IS_SUBP(mp)) {
+ DPUTS("Can't rebalance a subpage, ignoring");
+ return MDB_SUCCESS;
+ }
if (NUMKEYS(mp) == 0) {
DPUTS("tree is completely empty");
mc->mc_db->md_root = P_INVALID;
if (m3->mc_snum < mc->mc_snum) continue;
if (m3->mc_pg[0] == mp) {
m3->mc_pg[0] = mc->mc_pg[0];
+ m3->mc_snum = 1;
+ m3->mc_top = 0;
}
}
}
DPRINTF("====> delete db %u key [%s]", dbi, DKEY(key));
- if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
+ if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
assert(key != NULL);
assert(data != NULL);
- if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
+ if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
return EINVAL;
}
- if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND)) != flags)
+ if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
return EINVAL;
mdb_cursor_init(&mc, txn, dbi, &mx);
: ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
}
-#define PERSISTENT_FLAGS 0xffff
-#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
- MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
{
MDB_val key, data;
MDB_dbi i;
MDB_cursor mc;
- uint16_t mdflags;
int rc, dbflag, exact;
unsigned int unused = 0;
size_t len;
return MDB_DBS_FULL;
/* Find the DB info */
- dbflag = 0;
+ dbflag = DB_NEW|DB_VALID;
exact = 0;
key.mv_size = len;
key.mv_data = (void *)name;
dummy.md_root = P_INVALID;
dummy.md_flags = flags & PERSISTENT_FLAGS;
rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
- dbflag = DB_DIRTY;
+ dbflag |= DB_DIRTY;
}
/* OK, got info, add to table */
txn->mt_dbflags[slot] = dbflag;
memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
*dbi = slot;
- txn->mt_env->me_dbflags[slot] = mdflags = txn->mt_dbs[slot].md_flags;
+ txn->mt_env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags;
mdb_default_cmp(txn, slot);
if (!unused) {
txn->mt_numdbs++;
- txn->mt_env->me_numdbs++;
- }
- /* Open the DB in parent txns as well */
- while ((txn = txn->mt_parent) != NULL) {
- txn->mt_dbflags[slot] = DB_STALE;
- txn->mt_dbs[slot].md_flags = mdflags;
- if (!unused)
- txn->mt_numdbs++;
}
}
void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
{
char *ptr;
- if (dbi <= MAIN_DBI || dbi >= env->me_numdbs)
+ if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs)
return;
ptr = env->me_dbxs[dbi].md_name.mv_data;
env->me_dbxs[dbi].md_name.mv_data = NULL;
unsigned int i;
/* LEAF2 pages have no nodes, cannot have sub-DBs */
- if (!subs || IS_LEAF2(mc->mc_pg[mc->mc_top]))
+ if (IS_LEAF2(mc->mc_pg[mc->mc_top]))
mdb_cursor_pop(mc);
mdb_cursor_copy(mc, &mx);
if (IS_LEAF(mc->mc_pg[mc->mc_top])) {
for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) {
ni = NODEPTR(mc->mc_pg[mc->mc_top], i);
- if (ni->mn_flags & F_SUBDATA) {
+ if (ni->mn_flags & F_BIGDATA) {
+ int j, ovpages = OVPAGES(NODEDSZ(ni), mc->mc_txn->mt_env->me_psize);
+ pgno_t pg;
+ memcpy(&pg, NODEDATA(ni), sizeof(pg));
+ for (j=0; j<ovpages; j++) {
+ mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
+ pg++;
+ }
+ } else if (subs && (ni->mn_flags & F_SUBDATA)) {
mdb_xcursor_init1(mc, ni);
rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
if (rc)
}
if (!mc->mc_top)
break;
+ mc->mc_ki[mc->mc_top] = i;
rc = mdb_cursor_sibling(mc, 1);
if (rc) {
/* no more siblings, go back to beginning
* of previous level.
*/
mdb_cursor_pop(mc);
- for (i=1; i<mc->mc_top; i++)
+ mc->mc_ki[0] = 0;
+ for (i=1; i<mc->mc_snum; i++) {
+ mc->mc_ki[i] = 0;
mc->mc_pg[i] = mx.mc_pg[i];
+ }
}
}
/* free it */
MDB_cursor *mc;
int rc;
- if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1)
+ if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
txn->mt_dbs[dbi].md_entries = 0;
txn->mt_dbs[dbi].md_root = P_INVALID;
- if (!txn->mt_u.dirty_list[0].mid) {
- MDB_cursor m2;
- MDB_val key, data;
- /* make sure we have at least one dirty page in this txn
- * otherwise these changes will be ignored.
- */
- key.mv_size = sizeof(txnid_t);
- key.mv_data = &txn->mt_txnid;
- data.mv_size = sizeof(MDB_ID);
- data.mv_data = txn->mt_free_pgs;
- mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
- rc = mdb_cursor_put(&m2, &key, &data, 0);
- }
+ txn->mt_flags |= MDB_TXN_DIRTY;
}
leave:
mdb_cursor_close(mc);
int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
- if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
+ if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
txn->mt_dbxs[dbi].md_cmp = cmp;
int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
- if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
+ if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
txn->mt_dbxs[dbi].md_dcmp = cmp;
int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
{
- if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
+ if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
txn->mt_dbxs[dbi].md_rel = rel;
int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
{
- if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
+ if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
return EINVAL;
txn->mt_dbxs[dbi].md_relctx = ctx;