#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
#define close(fd) (CloseHandle(fd) ? 0 : -1)
#define munmap(ptr,len) UnmapViewOfFile(ptr)
+#define Z "I"
#else
+#define Z "z"
+
#ifdef MDB_USE_POSIX_SEM
#define LOCK_MUTEX_R(env) mdb_sem_wait((env)->me_rmutex)
/** The list of pages that became unused during this transaction.
*/
MDB_IDL mt_free_pgs;
- /** The list of dirty pages we temporarily wrote to disk
+ /** The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full.
*/
MDB_IDL mt_spill_pgs;
union {
- MDB_ID2L dirty_list; /**< for write txns: modified pages */
- MDB_reader *reader; /**< this thread's reader table slot or NULL */
+ /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
+ MDB_ID2L dirty_list;
+ /** For read txns: This thread/txn's reader table slot, or NULL. */
+ MDB_reader *reader;
} mt_u;
/** Array of records for each DB known in the environment. */
MDB_dbx *mt_dbxs;
#define MDB_ENV_ACTIVE 0x20000000U
/** me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U
+ /** Have liveness lock in reader table */
+#define MDB_LIVE_READER 0x08000000U
uint32_t me_flags; /**< @ref mdb_env */
unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
unsigned int me_maxreaders; /**< size of the reader table */
/** Max size of a node on a page */
unsigned int me_nodemax;
#ifdef _WIN32
+ int me_pidquery; /**< Used in OpenProcess */
HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */
HANDLE me_wmutex;
#elif defined(MDB_USE_POSIX_SEM)
DKBUF;
nkeys = NUMKEYS(mp);
- fprintf(stderr, "Page %zu numkeys %d\n", mp->mp_pgno, nkeys);
+ fprintf(stderr, "Page %"Z"u numkeys %d\n", mp->mp_pgno, nkeys);
for (i=0; i<nkeys; i++) {
node = NODEPTR(mp, i);
key.mv_size = node->mn_ksize;
key.mv_data = node->mn_data;
nsize = NODESIZE + NODEKSZ(node) + sizeof(indx_t);
if (IS_BRANCH(mp)) {
- fprintf(stderr, "key %d: page %zu, %s\n", i, NODEPGNO(node),
+ fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node),
DKEY(&key));
} else {
if (F_ISSET(node->mn_flags, F_BIGDATA))
return txn->mt_dbxs[dbi].md_dcmp(a, b);
}
-/** Allocate a page.
+/** Allocate memory for a page.
* Re-use old malloc'd pages first for singletons, otherwise just malloc.
*/
static MDB_page *
txn->mt_dirty_room--;
}
-/** Allocate pages for writing.
+/** Allocate page numbers and memory for writing. Maintain me_pglast,
+ * me_pghead and mt_next_pgno.
+ *
* If there are free pages available from older transactions, they
- * will be re-used first. Otherwise a new page will be allocated.
+ * are re-used first. Otherwise allocate a new page at mt_next_pgno.
+ * Do not modify the freeDB, just merge freeDB records into me_pghead[]
+ * and move me_pglast to say which records were consumed. Only this
+ * function can create me_pghead and move me_pglast/mt_next_pgno.
* @param[in] mc cursor A cursor handle identifying the transaction and
* database for which we are allocating.
* @param[in] num the number of pages to allocate.
mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
if (last) {
op = MDB_SET_RANGE;
- key.mv_data = &last; /* will loop up last+1 */
+ key.mv_data = &last; /* will look up last+1 */
key.mv_size = sizeof(last);
}
if (Paranoid && mc->mc_dbi == FREE_DBI)
}
env->me_pglast = last;
#if MDB_DEBUG > 1
- DPRINTF("IDL read txn %zu root %zu num %u",
+ DPRINTF("IDL read txn %"Z"u root %"Z"u num %u",
last, txn->mt_dbs[FREE_DBI].md_root, i);
for (k = i; k; k--)
- DPRINTF("IDL %zu", idl[k]);
+ DPRINTF("IDL %"Z"u", idl[k]);
#endif
/* Merge in descending sorted order */
j = mop_len;
(rc = mdb_page_alloc(mc, 1, &np)))
return rc;
pgno = np->mp_pgno;
- DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi,mp->mp_pgno,pgno);
+ DPRINTF("touched db %u page %"Z"u -> %"Z"u", mc->mc_dbi,mp->mp_pgno,pgno);
assert(mp->mp_pgno != pgno);
mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
/* Update the parent page, if any, to point to the new page */
}
mc = bk;
}
+ /* Only malloced cursors are permanently tracked. */
free(mc);
}
cursors[i] = NULL;
static void
mdb_txn_reset0(MDB_txn *txn, const char *act);
+/** Operation selector for #mdb_reader_pid().
+ * On Windows the enumerators are plain tags. Elsewhere they are equal to
+ * the fcntl() commands F_SETLK and F_GETLK respectively.
+ */
+enum Pidlock_op {
+#ifdef _WIN32
+	Pidset, Pidcheck
+#else
+	Pidset = F_SETLK, Pidcheck = F_GETLK
+#endif
+};
+
+/** Set or check a pid lock. Set returns 0 on success.
+ * Check returns 0 if the lock exists (meaning the process is alive)
+ * and non-zero otherwise.
+ *
+ * On Windows Pidset is a no-op, we merely check for the existence
+ * of the process with the given pid. On POSIX we use a single byte
+ * lock on the lockfile, set at an offset equal to the pid.
+ * @param[in] env the environment; supplies the lockfile descriptor
+ *	(POSIX) or the OpenProcess access mask (Windows)
+ * @param[in] op #Pidset to take the lock, #Pidcheck to test for it
+ * @param[in] pid the process id the lock stands for
+ * @return 0 on success / process alive, non-zero otherwise
+ */
+static int
+mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid)
+{
+#ifdef _WIN32
+	HANDLE h;
+	DWORD err;
+
+	/* Nothing to set on Windows: liveness is probed directly. */
+	if (op != Pidcheck)
+		return 0;
+
+	h = OpenProcess(env->me_pidquery, FALSE, pid);
+	if (h) {
+		CloseHandle(h);
+		return 0;
+	}
+	err = GetLastError();
+	/* Access denied means the process exists but we may not open it.
+	 * Report it as alive: wrongly declaring a live reader dead would
+	 * let its reader slots be cleared underneath it.
+	 */
+	if (err == ERROR_ACCESS_DENIED)
+		return 0;
+	return (int)err;
+#else
+	int rc;
+	struct flock lock_info;
+
+	memset(&lock_info, 0, sizeof(lock_info));
+	lock_info.l_type = F_WRLCK;
+	lock_info.l_whence = SEEK_SET;
+	lock_info.l_start = pid;	/* one byte at offset == pid */
+	lock_info.l_len = 1;
+
+	/* Retry while the call is interrupted by a signal. */
+	while ((rc = fcntl(env->me_lfd, op, &lock_info)) &&
+		(rc = ErrCode()) == EINTR)
+		;
+
+	/* F_GETLK succeeded but found no conflicting lock: nobody holds
+	 * the byte for this pid, so report "no lock".
+	 */
+	if (op == Pidcheck && rc == 0 && lock_info.l_type == F_UNLCK)
+		rc = -1;
+	return rc;
+#endif
+}
+
/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
* @param[in] txn the transaction handle to initialize
* @return 0 on success, non-zero on failure.
pid_t pid = env->me_pid;
pthread_t tid = pthread_self();
+ if (!(env->me_flags & MDB_LIVE_READER)) {
+ rc = mdb_reader_pid(env, Pidset, pid);
+ if (rc) {
+ UNLOCK_MUTEX_R(env);
+ return rc;
+ }
+ env->me_flags |= MDB_LIVE_READER;
+ }
+
LOCK_MUTEX_R(env);
for (i=0; i<env->me_txns->mti_numreaders; i++)
if (env->me_txns->mti_readers[i].mr_pid == 0)
rc = mdb_txn_renew0(txn);
if (rc == MDB_SUCCESS) {
- DPRINTF("renew txn %zu%c %p on mdbenv %p, root page %zu",
+ DPRINTF("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
(void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
}
free(txn);
else {
*ret = txn;
- DPRINTF("begin txn %zu%c %p on mdbenv %p, root page %zu",
+ DPRINTF("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
}
/** Common code for #mdb_txn_reset() and #mdb_txn_abort().
* May be called twice for readonly txns: First reset it, then abort.
* @param[in] txn the transaction handle to reset
+ * @param[in] act why the transaction is being reset
*/
static void
mdb_txn_reset0(MDB_txn *txn, const char *act)
/* Close any DBI handles opened in this txn */
mdb_dbis_update(txn, 0);
- DPRINTF("%s txn %zu%c %p on mdbenv %p, root page %zu",
+ DPRINTF("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root);
#if MDB_DEBUG > 1
{
unsigned int i = free_pgs[0];
- DPRINTF("IDL write txn %zu root %zu num %u",
+ DPRINTF("IDL write txn %"Z"u root %"Z"u num %u",
txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i);
for (; i; i--)
- DPRINTF("IDL %zu", free_pgs[i]);
+ DPRINTF("IDL %"Z"u", free_pgs[i]);
}
#endif
continue;
* the write offset, to at least save the overhead of a Seek
* system call.
*/
- DPRINTF("committing page %zu", pgno);
+ DPRINTF("committing page %"Z"u", pgno);
memset(&ov, 0, sizeof(ov));
ov.Offset = pos & 0xffffffff;
ov.OffsetHigh = pos >> 16 >> 16;
wpos = pos;
wsize = 0;
}
- DPRINTF("committing page %zu", pgno);
+ DPRINTF("committing page %"Z"u", pgno);
next_pos = pos + size;
iov[n].iov_len = size;
iov[n].iov_base = (char *)dp;
if (!txn->mt_u.dirty_list[0].mid && !(txn->mt_flags & MDB_TXN_DIRTY))
goto done;
- DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu",
+ DPRINTF("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root);
/* Update DB root pointers */
p = (MDB_page *)&pbuf;
if (!F_ISSET(p->mp_flags, P_META)) {
- DPRINTF("page %zu not a meta page", p->mp_pgno);
+ DPRINTF("page %"Z"u not a meta page", p->mp_pgno);
return MDB_INVALID;
}
MDB_page *p, *q;
int rc;
unsigned int psize;
+#ifdef _WIN32
+ DWORD len;
+ OVERLAPPED ov;
+ memset(&ov, 0, sizeof(ov));
+#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
+ ov.Offset = pos; \
+ rc = WriteFile(fd, ptr, size, &len, &ov); } while(0)
+#else
+ int len;
+#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
+ len = pwrite(fd, ptr, size, pos); \
+ rc = (len >= 0); } while(0)
+#endif
DPUTS("writing new meta page");
q->mp_flags = P_META;
*(MDB_meta *)METADATA(q) = *meta;
-#ifdef _WIN32
- {
- DWORD len;
- OVERLAPPED ov;
- memset(&ov, 0, sizeof(ov));
- rc = WriteFile(env->me_fd, p, psize * 2, &len, &ov);
- rc = rc ? (len == psize * 2 ? MDB_SUCCESS : EIO) : ErrCode();
- }
-#else
- rc = pwrite(env->me_fd, p, psize * 2, 0);
- rc = (rc == (int)psize * 2) ? MDB_SUCCESS : rc < 0 ? ErrCode() : EIO;
-#endif
+ DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0);
+ if (!rc)
+ rc = ErrCode();
+ else if ((unsigned) len == psize * 2)
+ rc = MDB_SUCCESS;
+ else
+ rc = ENOSPC;
free(p);
return rc;
}
assert(txn->mt_env != NULL);
toggle = !txn->mt_toggle;
- DPRINTF("writing meta page %d for root page %zu",
+ DPRINTF("writing meta page %d for root page %"Z"u",
toggle, txn->mt_dbs[MAIN_DBI].md_root);
env = txn->mt_env;
LONG sizelo, sizehi;
sizelo = env->me_mapsize & 0xffffffff;
sizehi = env->me_mapsize >> 16 >> 16; /* only needed on Win64 */
+
+ /* See if we should use QueryLimited */
+ rc = GetVersion();
+ if ((rc & 0xff) > 5)
+ env->me_pidquery = PROCESS_QUERY_LIMITED_INFORMATION;
+ else
+ env->me_pidquery = PROCESS_QUERY_INFORMATION;
+
/* Windows won't create mappings for zero length files.
* Just allocate the maxsize right now.
*/
env->me_metas[0]->mm_version, env->me_psize);
DPRINTF("using meta page %d", toggle);
DPRINTF("depth: %u", db->md_depth);
- DPRINTF("entries: %zu", db->md_entries);
- DPRINTF("branch pages: %zu", db->md_branch_pages);
- DPRINTF("leaf pages: %zu", db->md_leaf_pages);
- DPRINTF("overflow pages: %zu", db->md_overflow_pages);
- DPRINTF("root: %zu", db->md_root);
+ DPRINTF("entries: %"Z"u", db->md_entries);
+ DPRINTF("branch pages: %"Z"u", db->md_branch_pages);
+ DPRINTF("leaf pages: %"Z"u", db->md_leaf_pages);
+ DPRINTF("overflow pages: %"Z"u", db->md_overflow_pages);
+ DPRINTF("root: %"Z"u", db->md_root);
}
#endif
#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
- * @param[in] str string to hash
+ * @param[in] val value to hash
* @param[in] hval initial value for hash
* @return 64 bit hash
*
return hval;
}
-/** Hash the string and output the hash in hex.
+/** Hash the string and output the encoded hash.
+ * This uses modified RFC1924 Ascii85 encoding to accommodate systems with
+ * very short name limits. We don't care about the encoding being reversible,
+ * we just want to preserve as many bits of the input as possible in a
+ * small printable string.
* @param[in] str string to hash
- * @param[out] hexbuf an array of 17 chars to hold the hash
+ * @param[out] encbuf an array of 11 chars to hold the hash
*/
+static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
+
+/** Encode a value as five characters taken from #mdb_a85, least
+ * significant base-85 digit first. No terminating NUL is written, and
+ * only the five lowest base-85 digits of the value are represented.
+ * @param[in] l the value to encode
+ * @param[out] out receives exactly 5 characters
+ */
static void
-mdb_hash_hex(MDB_val *val, char *hexbuf)
+mdb_pack85(unsigned long l, char *out)
{
int i;
- mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
- for (i=0; i<8; i++) {
- hexbuf += sprintf(hexbuf, "%02x", (unsigned int)h & 0xff);
- h >>= 8;
+
+ for (i=0; i<5; i++) {
+ *out++ = mdb_a85[l % 85];
+ l /= 85;
}
}
+
+/** Hash a value and write the hash as a 10 character printable string.
+ * The 64 bit hash is split into its low and high 32 bit halves and each
+ * half is encoded as five characters by #mdb_pack85().
+ *
+ * The halves are extracted with mask and shift rather than by reading
+ * the hash through an unsigned long pointer: where unsigned long is
+ * 64 bits wide, indexing a second unsigned long would read past the
+ * end of the hash.
+ * @param[in] val value to hash
+ * @param[out] encbuf an array of 11 chars to hold the encoded hash
+ *	(10 characters plus the terminating NUL)
+ */
+static void
+mdb_hash_enc(MDB_val *val, char *encbuf)
+{
+	mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
+
+	mdb_pack85((unsigned long)(h & 0xffffffffUL), encbuf);
+	mdb_pack85((unsigned long)(h >> 32), encbuf+5);
+	encbuf[10] = '\0';
+}
#endif
/** Open and/or initialize the lock region for the environment.
DWORD nlow;
} idbuf;
MDB_val val;
- char hexbuf[17];
+ char encbuf[11];
if (!mdb_sec_inited) {
InitializeSecurityDescriptor(&mdb_null_sd,
idbuf.nlow = stbuf.nFileIndexLow;
val.mv_data = &idbuf;
val.mv_size = sizeof(idbuf);
- mdb_hash_hex(&val, hexbuf);
- sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", hexbuf);
- sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", hexbuf);
+ mdb_hash_enc(&val, encbuf);
+ sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
+ sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
if (!env->me_rmutex) goto fail_errno;
env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
ino_t ino;
} idbuf;
MDB_val val;
- char hexbuf[17];
+ char encbuf[11];
+#if defined(__NetBSD__)
+#define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */
+#endif
if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
idbuf.dev = stbuf.st_dev;
idbuf.ino = stbuf.st_ino;
val.mv_data = &idbuf;
val.mv_size = sizeof(idbuf);
- mdb_hash_hex(&val, hexbuf);
- sprintf(env->me_txns->mti_rmname, "/MDBr%s", hexbuf);
- sprintf(env->me_txns->mti_wmname, "/MDBw%s", hexbuf);
+ mdb_hash_enc(&val, encbuf);
+#ifdef MDB_SHORT_SEMNAMES
+ encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */
+#endif
+ sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf);
+ sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf);
/* Clean up after a previous run, if needed: Try to
* remove both semaphores before doing anything else.
*/
int rc;
size_t wsize;
char *ptr;
+#ifdef _WIN32
+ DWORD len, w2;
+#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
+#else
+ ssize_t len;
+ size_t w2;
+#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
+#endif
/* Do the lock/unlock of the reader mutex before starting the
* write txn. Otherwise other read txns could block writers.
}
wsize = env->me_psize * 2;
-#ifdef _WIN32
- {
- DWORD len;
- rc = WriteFile(fd, env->me_map, wsize, &len, NULL);
- rc = rc ? (len == wsize ? MDB_SUCCESS : EIO) : ErrCode();
+ ptr = env->me_map;
+ w2 = wsize;
+ while (w2 > 0) {
+ DO_WRITE(rc, fd, ptr, w2, len);
+ if (!rc) {
+ rc = ErrCode();
+ break;
+ } else if (len > 0) {
+ rc = MDB_SUCCESS;
+ ptr += len;
+ w2 -= len;
+ continue;
+ } else {
+ /* Non-blocking or async handles are not supported */
+ rc = EIO;
+ break;
+ }
}
-#else
- rc = write(fd, env->me_map, wsize);
- rc = rc == (int)wsize ? MDB_SUCCESS : rc < 0 ? ErrCode() : EIO;
-#endif
if (env->me_txns)
UNLOCK_MUTEX_W(env);
if (rc)
goto leave;
- ptr = env->me_map + wsize;
wsize = txn->mt_next_pgno * env->me_psize - wsize;
-#ifdef _WIN32
while (wsize > 0) {
- DWORD len, w2;
if (wsize > MAX_WRITE)
w2 = MAX_WRITE;
else
w2 = wsize;
- rc = WriteFile(fd, ptr, w2, &len, NULL);
- rc = rc ? (len == w2 ? MDB_SUCCESS : EIO) : ErrCode();
- if (rc) break;
- wsize -= w2;
- ptr += w2;
- }
-#else
- while (wsize > 0) {
- size_t w2;
- ssize_t wres;
- if (wsize > MAX_WRITE)
- w2 = MAX_WRITE;
- else
- w2 = wsize;
- wres = write(fd, ptr, w2);
- rc = wres == (ssize_t)w2 ? MDB_SUCCESS : wres < 0 ? ErrCode() : EIO;
- if (rc) break;
- wsize -= wres;
- ptr += wres;
+ DO_WRITE(rc, fd, ptr, w2, len);
+ if (!rc) {
+ rc = ErrCode();
+ break;
+ } else if (len > 0) {
+ rc = MDB_SUCCESS;
+ ptr += len;
+ wsize -= len;
+ continue;
+ } else {
+ rc = EIO;
+ break;
+ }
}
-#endif
leave:
mdb_txn_abort(txn);
{
pgno_t pgno;
COPY_PGNO(pgno, mp->mp_pgno);
- DPRINTF("searching %u keys in %s %spage %zu",
+ DPRINTF("searching %u keys in %s %spage %"Z"u",
nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
pgno);
}
DPRINTF("found leaf index %u [%s], rc = %i",
i, DKEY(&nodekey), rc);
else
- DPRINTF("found branch index %u [%s -> %zu], rc = %i",
+ DPRINTF("found branch index %u [%s -> %"Z"u], rc = %i",
i, DKEY(&nodekey), NODEPGNO(node), rc);
#endif
if (rc == 0)
if (mc->mc_snum)
mc->mc_top--;
- DPRINTF("popped page %zu off db %u cursor %p", top->mp_pgno,
+ DPRINTF("popped page %"Z"u off db %u cursor %p", top->mp_pgno,
mc->mc_dbi, (void *) mc);
}
}
static int
mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
{
- DPRINTF("pushing page %zu on db %u cursor %p", mp->mp_pgno,
+ DPRINTF("pushing page %"Z"u on db %u cursor %p", mp->mp_pgno,
mc->mc_dbi, (void *) mc);
if (mc->mc_snum >= CURSOR_STACK) {
level = 0;
p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno);
} else {
- DPRINTF("page %zu not found", pgno);
+ DPRINTF("page %"Z"u not found", pgno);
assert(p != NULL);
return MDB_PAGE_NOTFOUND;
}
MDB_node *node;
indx_t i;
- DPRINTF("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp));
+ DPRINTF("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp));
assert(NUMKEYS(mp) > 1);
- DPRINTF("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0)));
+ DPRINTF("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)));
if (key == NULL) /* Initialize cursor to first page. */
i = 0;
return MDB_CORRUPTED;
}
- DPRINTF("found leaf page %zu for key [%s]", mp->mp_pgno,
+ DPRINTF("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
key ? DKEY(key) : NULL);
mc->mc_flags |= C_INITIALIZED;
mc->mc_flags &= ~C_EOF;
mc->mc_snum = 1;
mc->mc_top = 0;
- DPRINTF("db %u root page %zu has flags 0x%X",
+ DPRINTF("db %u root page %"Z"u has flags 0x%X",
mc->mc_dbi, root, mc->mc_pg[0]->mp_flags);
if (flags & MDB_PS_MODIFY) {
MDB_env *env = txn->mt_env;
int rc;
- DPRINTF("free ov page %zu (%d)", pg, ovpages);
+ DPRINTF("free ov page %"Z"u (%d)", pg, ovpages);
/* If the page is dirty or on the spill list we just acquired it,
* so we should give it back to our current free list, if any.
* Not currently supported in nested txns.
data->mv_size = NODEDSZ(leaf);
memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) {
- DPRINTF("read overflow page %zu failed", pgno);
+ DPRINTF("read overflow page %"Z"u failed", pgno);
return rc;
}
data->mv_data = METADATA(omp);
}
mdb_cursor_pop(mc);
- DPRINTF("parent page is page %zu, index %u",
+ DPRINTF("parent page is page %"Z"u, index %u",
mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]);
if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
}
}
- DPRINTF("cursor_next: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
+ DPRINTF("cursor_next: top page is %"Z"u in cursor %p", mp->mp_pgno, (void *) mc);
if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
DPUTS("=====> move to next sibling page");
return rc;
}
mp = mc->mc_pg[mc->mc_top];
- DPRINTF("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
+ DPRINTF("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
} else
mc->mc_ki[mc->mc_top]++;
- DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
+ DPRINTF("==> cursor points to page %"Z"u with %u keys, key index %u",
mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
if (IS_LEAF2(mp)) {
}
}
- DPRINTF("cursor_prev: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
+ DPRINTF("cursor_prev: top page is %"Z"u in cursor %p", mp->mp_pgno, (void *) mc);
if (mc->mc_ki[mc->mc_top] == 0) {
DPUTS("=====> move to prev sibling page");
}
mp = mc->mc_pg[mc->mc_top];
mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
- DPRINTF("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
+ DPRINTF("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
} else
mc->mc_ki[mc->mc_top]--;
mc->mc_flags &= ~C_EOF;
- DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
+ DPRINTF("==> cursor points to page %"Z"u with %u keys, key index %u",
mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
if (IS_LEAF2(mp)) {
return EINVAL;
#endif
- DPRINTF("==> put db %u key [%s], size %zu, data size %zu",
+ DPRINTF("==> put db %u key [%s], size %"Z"u, data size %"Z"u",
mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size);
dkey.mv_size = 0;
if ((rc = mdb_page_alloc(mc, num, &np)))
return rc;
- DPRINTF("allocated new mpage %zu, page size %u",
+ DPRINTF("allocated new mpage %"Z"u, page size %u",
np->mp_pgno, mc->mc_txn->mt_env->me_psize);
np->mp_flags = flags | P_DIRTY;
np->mp_lower = PAGEHDRSZ;
assert(mp->mp_upper >= mp->mp_lower);
- DPRINTF("add to %s %spage %zu index %i, data size %zu key size %zu [%s]",
+ DPRINTF("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
IS_LEAF(mp) ? "leaf" : "branch",
IS_SUBP(mp) ? "sub-" : "",
mp->mp_pgno, indx, data ? data->mv_size : 0,
int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
int rc;
/* Put data on overflow page. */
- DPRINTF("data size is %zu, node would be %zu, put data on overflow page",
+ DPRINTF("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
data->mv_size, node_size+data->mv_size);
node_size += sizeof(pgno_t);
if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
return rc;
- DPRINTF("allocated overflow page %zu", ofp->mp_pgno);
+ DPRINTF("allocated overflow page %"Z"u", ofp->mp_pgno);
flags |= F_BIGDATA;
} else {
node_size += data->mv_size;
node_size += node_size & 1;
if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
- DPRINTF("not enough room in page %zu, got %u ptrs",
+ DPRINTF("not enough room in page %"Z"u, got %u ptrs",
mp->mp_pgno, NUMKEYS(mp));
DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
mp->mp_upper - mp->mp_lower);
- DPRINTF("node size = %zu", node_size);
+ DPRINTF("node size = %"Z"u", node_size);
return MDB_PAGE_FULL;
}
{
pgno_t pgno;
COPY_PGNO(pgno, mp->mp_pgno);
- DPRINTF("delete node %u on %s page %zu", indx,
+ DPRINTF("delete node %u on %s page %"Z"u", indx,
IS_LEAF(mp) ? "leaf" : "branch", pgno);
}
#endif
mx->mx_db.md_flags |= MDB_INTEGERKEY;
}
}
- DPRINTF("Sub-db %u for db %u root page %zu", mx->mx_cursor.mc_dbi, mc->mc_dbi,
+ DPRINTF("Sub-db %u for db %u root page %"Z"u", mx->mx_cursor.mc_dbi, mc->mc_dbi,
mx->mx_db.md_root);
mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ?
DB_DIRTY : 0);
char kbuf2[(MDB_MAXKEYSIZE*2+1)];
k2.mv_data = NODEKEY(node);
k2.mv_size = node->mn_ksize;
- DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu",
+ DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %"Z"u",
indx, ptr,
mdb_dkey(&k2, kbuf2),
DKEY(key),
return rc;
}
- DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu",
+ DPRINTF("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
csrc->mc_ki[csrc->mc_top],
DKEY(&key),
key.mv_size = NODEKSZ(srcnode);
key.mv_data = NODEKEY(srcnode);
}
- DPRINTF("update separator for source page %zu to [%s]",
+ DPRINTF("update separator for source page %"Z"u to [%s]",
csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key));
mdb_cursor_copy(csrc, &mn);
mn.mc_snum--;
key.mv_size = NODEKSZ(srcnode);
key.mv_data = NODEKEY(srcnode);
}
- DPRINTF("update separator for destination page %zu to [%s]",
+ DPRINTF("update separator for destination page %"Z"u to [%s]",
cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key));
mdb_cursor_copy(cdst, &mn);
mn.mc_snum--;
MDB_val key, data;
unsigned nkeys;
- DPRINTF("merging page %zu into %zu", csrc->mc_pg[csrc->mc_top]->mp_pgno,
+ DPRINTF("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno,
cdst->mc_pg[cdst->mc_top]->mp_pgno);
assert(csrc->mc_snum > 1); /* can't merge root page */
}
}
- DPRINTF("dst page %zu now has %u keys (%.1f%% filled)",
+ DPRINTF("dst page %"Z"u now has %u keys (%.1f%% filled)",
cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10);
/* Unlink the src page from parent and add to free list.
{
pgno_t pgno;
COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
- DPRINTF("rebalancing %s page %zu (has %u keys, %.1f%% full)",
+ DPRINTF("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)",
IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10);
}
#if MDB_DEBUG
pgno_t pgno;
COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
- DPRINTF("no need to rebalance page %zu, above fill threshold",
+ DPRINTF("no need to rebalance page %"Z"u, above fill threshold",
pgno);
#endif
return MDB_SUCCESS;
mc->mc_ki[mc->mc_top] = 0;
}
- DPRINTF("found neighbor page %zu (%u keys, %.1f%% full)",
+ DPRINTF("found neighbor page %"Z"u (%u keys, %.1f%% full)",
mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10);
/* If the neighbor page is above threshold and has enough keys,
mp = mc->mc_pg[mc->mc_top];
newindx = mc->mc_ki[mc->mc_top];
- DPRINTF("-----> splitting %s page %zu and adding [%s] at index %i",
+ DPRINTF("-----> splitting %s page %"Z"u and adding [%s] at index %i",
IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
DKEY(newkey), mc->mc_ki[mc->mc_top]);
/* Create a right sibling. */
if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
return rc;
- DPRINTF("new right sibling: page %zu", rp->mp_pgno);
+ DPRINTF("new right sibling: page %"Z"u", rp->mp_pgno);
if (mc->mc_snum < 2) {
if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
mc->mc_pg[0] = pp;
mc->mc_ki[0] = 0;
mc->mc_db->md_root = pp->mp_pgno;
- DPRINTF("root split! new root = %zu", pp->mp_pgno);
+ DPRINTF("root split! new root = %"Z"u", pp->mp_pgno);
mc->mc_db->md_depth++;
new_root = 1;
ptop = 0;
} else {
ptop = mc->mc_top-1;
- DPRINTF("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno);
+ DPRINTF("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno);
}
mc->mc_flags |= C_SPLITTING;
arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0;
arg->me_mapsize = env->me_mapsize;
arg->me_maxreaders = env->me_maxreaders;
- arg->me_numreaders = env->me_numreaders;
+
+ /* me_numreaders may be zero if this process never used any readers. Use
+ * the shared numreader count if it exists.
+ */
+ arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders;
+
arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg;
arg->me_last_txnid = env->me_metas[toggle]->mm_txnid;
return MDB_SUCCESS;
if (!env->me_txns) {
return func("(no reader locks)\n", ctx);
}
- rdrs = env->me_numreaders;
+ rdrs = env->me_txns->mti_numreaders;
mr = env->me_txns->mti_readers;
for (i=0; i<rdrs; i++) {
if (mr[i].mr_pid) {
int rc;
tid = mr[i].mr_tid;
if (mr[i].mr_txnid == (txnid_t)-1) {
- sprintf(buf, "%10d %zx -\n", mr[i].mr_pid, tid);
+ sprintf(buf, "%10d %"Z"x -\n", mr[i].mr_pid, tid);
} else {
- sprintf(buf, "%10d %zx %zu\n", mr[i].mr_pid, tid, mr[i].mr_txnid);
+ sprintf(buf, "%10d %"Z"x %"Z"u\n", mr[i].mr_pid, tid, mr[i].mr_txnid);
}
if (first) {
first = 0;
}
return 0;
}
+
+/** Insert a pid into a sorted pid list unless it is already there.
+ * @param[in,out] ids the list: ids[0] holds the number of entries and
+ *	ids[1] .. ids[ids[0]] hold the pids in ascending order. The array
+ *	must have room for one more entry.
+ * @param[in] pid the pid to insert
+ * @return 0 if the pid was inserted, -1 if it was already present
+ *	(the list is then left untouched).
+ */
+static int mdb_pid_insert(pid_t *ids, pid_t pid)
+{
+	unsigned lo = 1;			/* first slot still in play */
+	unsigned hi = (unsigned)ids[0] + 1;	/* one past the last such slot */
+	unsigned slot;
+
+	/* Binary search; on exit lo is where the pid belongs. */
+	while (lo < hi) {
+		unsigned mid = lo + ((hi - lo) >> 1);
+		int diff = pid - ids[mid];
+
+		if (diff == 0)
+			return -1;	/* duplicate */
+		if (diff < 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	/* Grow the list by one and shift the tail up to open the slot. */
+	ids[0]++;
+	slot = ids[0];
+	while (slot > lo) {
+		ids[slot] = ids[slot-1];
+		slot--;
+	}
+	ids[lo] = pid;
+	return 0;
+}
+
+/** Clear reader table slots that belong to processes which no longer
+ * hold their pid lock.
+ * @param[in] env the environment handle
+ * @param[out] dead if non-NULL, receives the number of slots cleared
+ * @return EINVAL if env is NULL, ENOMEM if the scratch pid list cannot
+ *	be allocated, otherwise MDB_SUCCESS (also when the environment has
+ *	no reader table).
+ */
+int mdb_reader_check(MDB_env *env, int *dead)
+{
+	unsigned int slot, scan, nslots;
+	MDB_reader *table;
+	pid_t *seen, owner;
+	int cleared = 0;
+
+	if (!env)
+		return EINVAL;
+	if (dead)
+		*dead = 0;
+	if (!env->me_txns)
+		return MDB_SUCCESS;
+
+	nslots = env->me_txns->mti_numreaders;
+	/* Sorted list of pids already examined; element 0 is the count. */
+	seen = malloc((nslots+1) * sizeof(pid_t));
+	if (!seen)
+		return ENOMEM;
+	seen[0] = 0;
+	table = env->me_txns->mti_readers;
+
+	for (slot = 0; slot < nslots; slot++) {
+		/* Skip free slots and slots owned by this process. */
+		if (!table[slot].mr_pid || table[slot].mr_pid == env->me_pid)
+			continue;
+		owner = table[slot].mr_pid;
+
+		/* Each foreign pid is examined only once. */
+		if (mdb_pid_insert(seen, owner) != 0)
+			continue;
+
+		/* Zero means the pid lock exists, i.e. the owner is alive. */
+		if (!mdb_reader_pid(env, Pidcheck, owner))
+			continue;
+
+		/* Check again while holding the reader mutex before
+		 * modifying the table.
+		 */
+		LOCK_MUTEX_R(env);
+		if (mdb_reader_pid(env, Pidcheck, owner)) {
+			for (scan = slot; scan < nslots; scan++) {
+				if (table[scan].mr_pid == owner) {
+					table[scan].mr_pid = 0;
+					cleared++;
+				}
+			}
+		}
+		UNLOCK_MUTEX_R(env);
+	}
+
+	free(seen);
+	if (dead)
+		*dead = cleared;
+	return MDB_SUCCESS;
+}
/** @} */