* BerkeleyDB API, but much simplified.
*/
/*
- * Copyright 2011-2017 Howard Chu, Symas Corp.
+ * Copyright 2011-2018 Howard Chu, Symas Corp.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* the full size. These APIs are defined in <wdm.h> and <ntifs.h>
* but those headers are meant for driver-level development and
* conflict with the regular user-level headers, so we explicitly
- * declare them here. Using these APIs also means we must link to
- * ntdll.dll, which is not linked by default in user code.
+ * declare them here. We get pointers to these functions from
+ * NTDLL.DLL at runtime, to avoid buildtime dependencies on any
+ * NTDLL import libraries.
*/
-NTSTATUS WINAPI
-NtCreateSection(OUT PHANDLE sh, IN ACCESS_MASK acc,
+/* Function type of NtCreateSection. The calling convention is written
+ * inside the parentheses, next to the type name.
+ * NOTE(review): MSVC is reported to warn (C4229) and ignore a modifier
+ * placed before a parenthesized declarator - confirm on the target toolchain.
+ */
+typedef NTSTATUS (WINAPI NtCreateSectionFunc)
+ (OUT PHANDLE sh, IN ACCESS_MASK acc,
IN void * oa OPTIONAL,
IN PLARGE_INTEGER ms OPTIONAL,
IN ULONG pp, IN ULONG aa, IN HANDLE fh OPTIONAL);
+/* Resolved at runtime with GetProcAddress(); NULL until then. */
+static NtCreateSectionFunc *NtCreateSection;
+
typedef enum _SECTION_INHERIT {
ViewShare = 1,
ViewUnmap = 2
} SECTION_INHERIT;
-NTSTATUS WINAPI
-NtMapViewOfSection(IN PHANDLE sh, IN HANDLE ph,
+/* Function type of NtMapViewOfSection. The calling convention is written
+ * inside the parentheses, next to the type name.
+ * NOTE(review): MSVC is reported to warn (C4229) and ignore a modifier
+ * placed before a parenthesized declarator - confirm on the target toolchain.
+ */
+typedef NTSTATUS (WINAPI NtMapViewOfSectionFunc)
+ (IN PHANDLE sh, IN HANDLE ph,
IN OUT PVOID *addr, IN ULONG_PTR zbits,
IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL,
IN OUT PSIZE_T vs, IN SECTION_INHERIT ih,
IN ULONG at, IN ULONG pp);
-NTSTATUS WINAPI
-NtClose(HANDLE h);
+/* Resolved at runtime with GetProcAddress(); NULL until then. */
+static NtMapViewOfSectionFunc *NtMapViewOfSection;
+
+/* Function type of NtClose. The calling convention is written inside the
+ * parentheses, next to the type name.
+ * NOTE(review): MSVC is reported to warn (C4229) and ignore a modifier
+ * placed before a parenthesized declarator - confirm on the target toolchain.
+ */
+typedef NTSTATUS (WINAPI NtCloseFunc)(HANDLE h);
+
+/* Resolved at runtime with GetProcAddress(); NULL until then. */
+static NtCloseFunc *NtClose;
/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
* as int64 which is wrong. MSVC doesn't define it at all, so just
/* Most platforms have posix_memalign, older may only have memalign */
#define HAVE_MEMALIGN 1
#include <malloc.h>
+/* On Solaris, we need the POSIX sigwait function */
+#if defined (__sun)
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
#endif
#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
unsigned char mx_dbflag;
} MDB_xcursor;
- /** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */
+ /** Check if there is an inited xcursor */
#define XCURSOR_INITED(mc) \
((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
- /** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed
+ /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed
* when the node which contains the sub-page may have moved. Called
- * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top].
+ * with leaf page \b mp = mc->mc_pg[\b top].
*/
-#define XCURSOR_REFRESH(mc, mp, ki) do { \
+#define XCURSOR_REFRESH(mc, top, mp) do { \
MDB_page *xr_pg = (mp); \
- MDB_node *xr_node = NODEPTR(xr_pg, ki); \
+ MDB_node *xr_node; \
+ if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \
+ xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \
if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
(mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
} while (0)
static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
pgno_t newpgno, unsigned int nflags);
-static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
+static int mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta);
static MDB_meta *mdb_env_pick_meta(const MDB_env *env);
static int mdb_env_write_meta(MDB_txn *txn);
#ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */
if (m2 == mc) continue;
if (m2->mc_pg[mc->mc_top] == mp) {
m2->mc_pg[mc->mc_top] = np;
- if (XCURSOR_INITED(m2) && IS_LEAF(np))
- XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]);
+ if (IS_LEAF(np))
+ XCURSOR_REFRESH(m2, mc->mc_top, np);
}
}
}
* we may be unable to return them to me_pghead.
*/
MDB_page *mp = txn->mt_loose_pgs;
+ MDB_ID2 *dl = txn->mt_u.dirty_list;
+ unsigned x;
if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
return rc;
- for (; mp; mp = NEXT_LOOSE_PAGE(mp))
+ while (mp) {
+		/* Fetch the link first: the page may be released below. */
+		MDB_page *next = NEXT_LOOSE_PAGE(mp);
mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
+		/* Must also remove the page from the dirty list: find its slot x. */
+		if (txn->mt_flags & MDB_TXN_WRITEMAP) {
+			/* Linear scan for the page number over the used slots. */
+			for (x=1; x<=dl[0].mid; x++)
+				if (dl[x].mid == mp->mp_pgno)
+					break;
+			mdb_tassert(txn, x <= dl[0].mid);
+		} else {
+			x = mdb_mid2l_search(dl, mp->mp_pgno);
+			mdb_tassert(txn, dl[x].mid == mp->mp_pgno);
+			/* Release the page buffer only in this branch.
+			 * NOTE(review): under MDB_TXN_WRITEMAP the page is presumed to
+			 * live in the memory map, not in a separately allocated
+			 * buffer, so it must not be passed to mdb_dpage_free() -
+			 * confirm against the page allocator.
+			 */
+			mdb_dpage_free(env, mp);
+		}
+		/* Mark the slot as freed; freed slots are squashed out afterwards. */
+		dl[x].mptr = NULL;
+		mp = next;
+ }
+ {
+		/* Squash freed slots (mptr == NULL) out of the dirty list.
+		 * dl[0].mid holds the number of used slots; slots are 1-based.
+		 */
+		unsigned y;
+		/* Find the first freed slot. The bound is tested before dl[y]
+		 * is read so the scan never touches a slot past dl[0].mid.
+		 */
+		for (y=1; y <= dl[0].mid && dl[y].mptr; y++);
+		if (y <= dl[0].mid) {
+			/* x = next slot to fill, y = next candidate to keep */
+			for(x=y, y++;;) {
+				while (y <= dl[0].mid && !dl[y].mptr) y++;
+				if (y > dl[0].mid) break;
+				dl[x++] = dl[y++];
+			}
+			dl[0].mid = x-1;
+		} else {
+			/* all slots freed */
+			/* NOTE(review): this branch is taken when the scan above found
+			 * no NULL slot in 1..dl[0].mid; the list is emptied here as
+			 * before - confirm that is the intended meaning.
+			 */
+			dl[0].mid = 0;
+		}
+ }
txn->mt_loose_pgs = NULL;
txn->mt_loose_count = 0;
}
/** Read the environment parameters of a DB environment before
* mapping it into memory.
* @param[in] env the environment handle
+ * @param[in] prev whether to read the backup meta page
* @param[out] meta address of where to store the meta information
* @return 0 on success, non-zero on failure.
*/
static int ESECT
-mdb_env_read_header(MDB_env *env, MDB_meta *meta)
+mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta)
{
MDB_metabuf pbuf;
MDB_page *p;
return MDB_VERSION_MISMATCH;
}
- if (off == 0 || m->mm_txnid > meta->mm_txnid)
+ if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid))
*meta = *m;
}
return 0;
/** Further setup required for opening an LMDB environment
*/
static int ESECT
-mdb_env_open2(MDB_env *env)
+mdb_env_open2(MDB_env *env, int prev)
{
unsigned int flags = env->me_flags;
int i, newenv = 0, rc;
env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION;
else
env->me_pidquery = PROCESS_QUERY_INFORMATION;
+ /* Grab functions we need from NTDLL.
+  * NtCreateSection is assigned last and doubles as the "already done"
+  * flag, so a failed lookup is retried on the next call.
+  */
+ if (!NtCreateSection) {
+		/* Use the explicit ANSI entry point: the module name is a narrow
+		 * string literal, and the generic GetModuleHandle macro maps to
+		 * the wide-character variant when UNICODE is defined.
+		 */
+		HMODULE h = GetModuleHandleA("NTDLL.DLL");
+		if (!h)
+			return MDB_PROBLEM;
+		NtClose = (NtCloseFunc *)GetProcAddress(h, "NtClose");
+		if (!NtClose)
+			return MDB_PROBLEM;
+		NtMapViewOfSection = (NtMapViewOfSectionFunc *)GetProcAddress(h, "NtMapViewOfSection");
+		if (!NtMapViewOfSection)
+			return MDB_PROBLEM;
+		NtCreateSection = (NtCreateSectionFunc *)GetProcAddress(h, "NtCreateSection");
+		if (!NtCreateSection)
+			return MDB_PROBLEM;
+ }
#endif /* _WIN32 */
#ifdef BROKEN_FDATASYNC
}
#endif
- if ((i = mdb_env_read_header(env, &meta)) != 0) {
+ if ((i = mdb_env_read_header(env, prev, &meta)) != 0) {
if (i != ENOENT)
return i;
DPUTS("new mdbenv");
*/
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \
- MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
+ MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVMETA)
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags"
goto leave;
}
- if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
+ if ((rc = mdb_env_open2(env, flags & MDB_PREVMETA)) == MDB_SUCCESS) {
if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) {
/* Synchronous fd for meta writes. Needed even with
* MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
if (env->me_fd != INVALID_HANDLE_VALUE)
(void) close(env->me_fd);
if (env->me_txns) {
- MDB_PID_T pid = env->me_pid;
+ MDB_PID_T pid = getpid();
/* Clearing readers is done in this function because
* me_txkey with its destructor must be disabled first.
*
if (rc)
return rc;
}
+#ifdef MDB_VL32
+ if (mc->mc_ovpg == mp)
+ mc->mc_ovpg = NULL;
+#endif
mc->mc_db->md_overflow_pages -= ovpages;
return 0;
}
rc = MDB_INCOMPATIBLE;
break;
}
+ if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) {
+ mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
+ rc = MDB_NOTFOUND;
+ break;
+ }
{
MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
} else {
memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
olddata.mv_size - fp->mp_upper - PAGEBASE);
+ memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0]));
for (i=0; i<NUMKEYS(fp); i++)
- mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
+ mp->mp_ptrs[i] += offset;
}
}
if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
m3->mc_ki[i]++;
}
- if (XCURSOR_INITED(m3))
- XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]);
+ XCURSOR_REFRESH(m3, i, mp);
}
}
}
MDB_xcursor *mx = mc->mc_xcursor;
unsigned i = mc->mc_top;
MDB_page *mp = mc->mc_pg[i];
- int nkeys = NUMKEYS(mp);
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
if (m2->mc_pg[i] == mp) {
if (m2->mc_ki[i] == mc->mc_ki[i]) {
mdb_xcursor_init2(m2, mx, new_dupdata);
- } else if (!insert_key && m2->mc_ki[i] < nkeys) {
- XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]);
+ } else if (!insert_key) {
+ XCURSOR_REFRESH(m2, i, mp);
}
}
}
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
if (!(m2->mc_flags & C_INITIALIZED)) continue;
if (m2->mc_pg[mc->mc_top] == mp) {
- MDB_node *n2 = leaf;
- if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) {
- n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]);
- if (n2->mn_flags & F_SUBDATA) continue;
- }
- m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2);
+ XCURSOR_REFRESH(m2, mc->mc_top, mp);
}
}
}
m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
m3->mc_ki[csrc->mc_top-1]++;
}
- if (XCURSOR_INITED(m3) && IS_LEAF(mps))
- XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
+ if (IS_LEAF(mps))
+ XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
}
} else
/* Adding on the right, bump others down */
} else {
m3->mc_ki[csrc->mc_top]--;
}
- if (XCURSOR_INITED(m3) && IS_LEAF(mps))
- XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
+ if (IS_LEAF(mps))
+ XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
}
}
}
m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
m3->mc_ki[top-1]--;
}
- if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
- XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]);
+ if (IS_LEAF(psrc))
+ XCURSOR_REFRESH(m3, top, m3->mc_pg[top]);
}
}
{
} else if (m3->mc_ki[mc->mc_top] > ki) {
m3->mc_ki[mc->mc_top]--;
}
- if (XCURSOR_INITED(m3))
- XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
+ XCURSOR_REFRESH(m3, mc->mc_top, mp);
}
}
}
}
if (mc->mc_db->md_flags & MDB_DUPSORT) {
MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
- /* If this node is a fake page, it needs to be reinited
- * because its data has moved. But just reset mc_pg[0]
- * if the xcursor is already live.
+ /* If this node has dupdata, it may need to be reinited
+ * because its data has moved.
+ * If the xcursor was not initd it must be reinited.
+ * Else if node points to a subDB, nothing is needed.
+ * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset.
*/
- if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) {
- if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)
- m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
- else
+ if (node->mn_flags & F_DUPDATA) {
+ if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
+ if (!(node->mn_flags & F_SUBDATA))
+ m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
+ } else {
mdb_xcursor_init1(m3, node);
+ m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
+ }
}
}
}
m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
m3->mc_ki[ptop]++;
}
- if (XCURSOR_INITED(m3) && IS_LEAF(mp))
- XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
+ if (IS_LEAF(mp))
+ XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]);
}
}
DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
return MDB_INCOMPATIBLE;
- } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) {
- return rc;
+ } else {
+ if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE))
+ return rc;
+ if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
+ return EACCES;
}
/* Done here so we cannot fail after creating a new DB */