git.sur5r.net Git - openldap/blobdiff - libraries/liblmdb/mdb.c
ITS#8756 remove loose pg from dirty list in freelist_save
[openldap] / libraries / liblmdb / mdb.c
index 4a364f7fdb437c095e41057920a3c635d664c06c..63872c92bff93943dcfa360e772e264c694f29a0 100644 (file)
@@ -5,7 +5,7 @@
  *     BerkeleyDB API, but much simplified.
  */
 /*
- * Copyright 2011-2017 Howard Chu, Symas Corp.
+ * Copyright 2011-2018 Howard Chu, Symas Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * the full size. These APIs are defined in <wdm.h> and <ntifs.h>
  * but those headers are meant for driver-level development and
  * conflict with the regular user-level headers, so we explicitly
- * declare them here. Using these APIs also means we must link to
- * ntdll.dll, which is not linked by default in user code.
+ * declare them here. We get pointers to these functions from
+ * NTDLL.DLL at runtime, to avoid buildtime dependencies on any
+ * NTDLL import libraries.
  */
-NTSTATUS WINAPI
-NtCreateSection(OUT PHANDLE sh, IN ACCESS_MASK acc,
+typedef NTSTATUS WINAPI (NtCreateSectionFunc)
+  (OUT PHANDLE sh, IN ACCESS_MASK acc,
   IN void * oa OPTIONAL,
   IN PLARGE_INTEGER ms OPTIONAL,
   IN ULONG pp, IN ULONG aa, IN HANDLE fh OPTIONAL);
 
+static NtCreateSectionFunc *NtCreateSection;
+
 typedef enum _SECTION_INHERIT {
        ViewShare = 1,
        ViewUnmap = 2
 } SECTION_INHERIT;
 
-NTSTATUS WINAPI
-NtMapViewOfSection(IN PHANDLE sh, IN HANDLE ph,
+typedef NTSTATUS WINAPI (NtMapViewOfSectionFunc)
+  (IN PHANDLE sh, IN HANDLE ph,
   IN OUT PVOID *addr, IN ULONG_PTR zbits,
   IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL,
   IN OUT PSIZE_T vs, IN SECTION_INHERIT ih,
   IN ULONG at, IN ULONG pp);
 
-NTSTATUS WINAPI
-NtClose(HANDLE h);
+static NtMapViewOfSectionFunc *NtMapViewOfSection;
+
+typedef NTSTATUS WINAPI (NtCloseFunc)(HANDLE h);
+
+static NtCloseFunc *NtClose;
 
 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
  *  as int64 which is wrong. MSVC doesn't define it at all, so just
@@ -142,6 +148,10 @@ typedef SSIZE_T    ssize_t;
 /* Most platforms have posix_memalign, older may only have memalign */
 #define HAVE_MEMALIGN  1
 #include <malloc.h>
+/* On Solaris, we need the POSIX sigwait function */
+#if defined (__sun)
+# define _POSIX_PTHREAD_SEMANTICS      1
+#endif
 #endif
 
 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
@@ -1424,17 +1434,19 @@ typedef struct MDB_xcursor {
        unsigned char mx_dbflag;
 } MDB_xcursor;
 
-       /** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */
+       /** Check if there is an inited xcursor */
 #define XCURSOR_INITED(mc) \
        ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
 
-       /** Update sub-page pointer, if any, in \b mc->mc_xcursor.  Needed
+       /** Update the xcursor's sub-page pointer, if any, in \b mc.  Needed
         *      when the node which contains the sub-page may have moved.  Called
-        *      with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top].
+        *      with leaf page \b mp = mc->mc_pg[\b top].
         */
-#define XCURSOR_REFRESH(mc, mp, ki) do { \
+#define XCURSOR_REFRESH(mc, top, mp) do { \
        MDB_page *xr_pg = (mp); \
-       MDB_node *xr_node = NODEPTR(xr_pg, ki); \
+       MDB_node *xr_node; \
+       if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \
+       xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \
        if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
                (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
 } while (0)
@@ -1582,7 +1594,7 @@ static int        mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
 static int     mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
                                pgno_t newpgno, unsigned int nflags);
 
-static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
+static int  mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta);
 static MDB_meta *mdb_env_pick_meta(const MDB_env *env);
 static int  mdb_env_write_meta(MDB_txn *txn);
 #ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */
@@ -2773,8 +2785,8 @@ done:
                        if (m2 == mc) continue;
                        if (m2->mc_pg[mc->mc_top] == mp) {
                                m2->mc_pg[mc->mc_top] = np;
-                               if (XCURSOR_INITED(m2) && IS_LEAF(np))
-                                       XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]);
+                               if (IS_LEAF(np))
+                                       XCURSOR_REFRESH(m2, mc->mc_top, np);
                        }
                }
        }
@@ -3432,10 +3444,41 @@ mdb_freelist_save(MDB_txn *txn)
                 * we may be unable to return them to me_pghead.
                 */
                MDB_page *mp = txn->mt_loose_pgs;
+               MDB_ID2 *dl = txn->mt_u.dirty_list;
+               unsigned x;
                if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
                        return rc;
-               for (; mp; mp = NEXT_LOOSE_PAGE(mp))
+               for (; mp; mp = NEXT_LOOSE_PAGE(mp)) {
                        mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
+                       /* must also remove from dirty list */
+                       if (txn->mt_flags & MDB_TXN_WRITEMAP) {
+                               for (x=1; x<=dl[0].mid; x++)
+                                       if (dl[x].mid == mp->mp_pgno)
+                                               break;
+                               mdb_tassert(txn, x <= dl[0].mid);
+                       } else {
+                               x = mdb_mid2l_search(dl, mp->mp_pgno);
+                               mdb_tassert(txn, dl[x].mid == mp->mp_pgno);
+                       }
+                       dl[x].mptr = NULL;
+                       mdb_dpage_free(env, mp);
+               }
+               {
+                       /* squash freed slots out of the dirty list */
+                       unsigned y;
+                       for (y=1; dl[y].mptr && y <= dl[0].mid; y++);
+                       if (y <= dl[0].mid) {
+                               for(x=y, y++;;) {
+                                       while (!dl[y].mptr && y <= dl[0].mid) y++;
+                                       if (y > dl[0].mid) break;
+                                       dl[x++] = dl[y++];
+                               }
+                               dl[0].mid = x-1;
+                       } else {
+                               /* all slots freed */
+                               dl[0].mid = 0;
+                       }
+               }
                txn->mt_loose_pgs = NULL;
                txn->mt_loose_count = 0;
        }
@@ -3985,11 +4028,12 @@ fail:
 /** Read the environment parameters of a DB environment before
  * mapping it into memory.
  * @param[in] env the environment handle
+ * @param[in] prev whether to read the backup meta page
  * @param[out] meta address of where to store the meta information
  * @return 0 on success, non-zero on failure.
  */
 static int ESECT
-mdb_env_read_header(MDB_env *env, MDB_meta *meta)
+mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta)
 {
        MDB_metabuf     pbuf;
        MDB_page        *p;
@@ -4040,7 +4084,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
                        return MDB_VERSION_MISMATCH;
                }
 
-               if (off == 0 || m->mm_txnid > meta->mm_txnid)
+               if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid))
                        *meta = *m;
        }
        return 0;
@@ -4670,7 +4714,7 @@ mdb_fopen(const MDB_env *env, MDB_name *fname,
 /** Further setup required for opening an LMDB environment
  */
 static int ESECT
-mdb_env_open2(MDB_env *env)
+mdb_env_open2(MDB_env *env, int prev)
 {
        unsigned int flags = env->me_flags;
        int i, newenv = 0, rc;
@@ -4683,6 +4727,21 @@ mdb_env_open2(MDB_env *env)
                env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION;
        else
                env->me_pidquery = PROCESS_QUERY_INFORMATION;
+       /* Grab functions we need from NTDLL */
+       if (!NtCreateSection) {
+               HMODULE h = GetModuleHandle("NTDLL.DLL");
+               if (!h)
+                       return MDB_PROBLEM;
+               NtClose = (NtCloseFunc *)GetProcAddress(h, "NtClose");
+               if (!NtClose)
+                       return MDB_PROBLEM;
+               NtMapViewOfSection = (NtMapViewOfSectionFunc *)GetProcAddress(h, "NtMapViewOfSection");
+               if (!NtMapViewOfSection)
+                       return MDB_PROBLEM;
+               NtCreateSection = (NtCreateSectionFunc *)GetProcAddress(h, "NtCreateSection");
+               if (!NtCreateSection)
+                       return MDB_PROBLEM;
+       }
 #endif /* _WIN32 */
 
 #ifdef BROKEN_FDATASYNC
@@ -4733,7 +4792,7 @@ mdb_env_open2(MDB_env *env)
        }
 #endif
 
-       if ((i = mdb_env_read_header(env, &meta)) != 0) {
+       if ((i = mdb_env_read_header(env, prev, &meta)) != 0) {
                if (i != ENOENT)
                        return i;
                DPUTS("new mdbenv");
@@ -5332,7 +5391,7 @@ fail:
         */
 #define        CHANGEABLE      (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
 #define        CHANGELESS      (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \
-       MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
+       MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVMETA)
 
 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
 # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
@@ -5432,7 +5491,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
                        goto leave;
        }
 
-       if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
+       if ((rc = mdb_env_open2(env, flags & MDB_PREVMETA)) == MDB_SUCCESS) {
                if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) {
                        /* Synchronous fd for meta writes. Needed even with
                         * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
@@ -5546,7 +5605,7 @@ mdb_env_close0(MDB_env *env, int excl)
        if (env->me_fd != INVALID_HANDLE_VALUE)
                (void) close(env->me_fd);
        if (env->me_txns) {
-               MDB_PID_T pid = env->me_pid;
+               MDB_PID_T pid = getpid();
                /* Clearing readers is done in this function because
                 * me_txkey with its destructor must be disabled first.
                 *
@@ -6517,6 +6576,10 @@ release:
                if (rc)
                        return rc;
        }
+#ifdef MDB_VL32
+       if (mc->mc_ovpg == mp)
+               mc->mc_ovpg = NULL;
+#endif
        mc->mc_db->md_overflow_pages -= ovpages;
        return 0;
 }
@@ -7268,6 +7331,11 @@ fetchm:
                        rc = MDB_INCOMPATIBLE;
                        break;
                }
+               if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) {
+                       mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
+                       rc = MDB_NOTFOUND;
+                       break;
+               }
                {
                        MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
                        if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
@@ -7632,8 +7700,9 @@ prep_subDB:
                                } else {
                                        memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
                                                olddata.mv_size - fp->mp_upper - PAGEBASE);
+                                       memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0]));
                                        for (i=0; i<NUMKEYS(fp); i++)
-                                               mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
+                                               mp->mp_ptrs[i] += offset;
                                }
                        }
 
@@ -7762,8 +7831,7 @@ new_sub:
                                if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
                                        m3->mc_ki[i]++;
                                }
-                               if (XCURSOR_INITED(m3))
-                                       XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]);
+                               XCURSOR_REFRESH(m3, i, mp);
                        }
                }
        }
@@ -7805,7 +7873,6 @@ put_sub:
                                MDB_xcursor *mx = mc->mc_xcursor;
                                unsigned i = mc->mc_top;
                                MDB_page *mp = mc->mc_pg[i];
-                               int nkeys = NUMKEYS(mp);
 
                                for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
                                        if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
@@ -7813,8 +7880,8 @@ put_sub:
                                        if (m2->mc_pg[i] == mp) {
                                                if (m2->mc_ki[i] == mc->mc_ki[i]) {
                                                        mdb_xcursor_init2(m2, mx, new_dupdata);
-                                               } else if (!insert_key && m2->mc_ki[i] < nkeys) {
-                                                       XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]);
+                                               } else if (!insert_key) {
+                                                       XCURSOR_REFRESH(m2, i, mp);
                                                }
                                        }
                                }
@@ -7919,12 +7986,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
                                                if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
                                                if (!(m2->mc_flags & C_INITIALIZED)) continue;
                                                if (m2->mc_pg[mc->mc_top] == mp) {
-                                                       MDB_node *n2 = leaf;
-                                                       if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) {
-                                                               n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]);
-                                                               if (n2->mn_flags & F_SUBDATA) continue;
-                                                       }
-                                                       m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2);
+                                                       XCURSOR_REFRESH(m2, mc->mc_top, mp);
                                                }
                                        }
                                }
@@ -8776,8 +8838,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft)
                                        m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
                                        m3->mc_ki[csrc->mc_top-1]++;
                                }
-                               if (XCURSOR_INITED(m3) && IS_LEAF(mps))
-                                       XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
+                               if (IS_LEAF(mps))
+                                       XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
                        }
                } else
                /* Adding on the right, bump others down */
@@ -8798,8 +8860,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft)
                                        } else {
                                                m3->mc_ki[csrc->mc_top]--;
                                        }
-                                       if (XCURSOR_INITED(m3) && IS_LEAF(mps))
-                                               XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
+                                       if (IS_LEAF(mps))
+                                               XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
                                }
                        }
                }
@@ -9000,8 +9062,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
                                m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
                                m3->mc_ki[top-1]--;
                        }
-                       if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
-                               XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]);
+                       if (IS_LEAF(psrc))
+                               XCURSOR_REFRESH(m3, top, m3->mc_pg[top]);
                }
        }
        {
@@ -9264,8 +9326,7 @@ mdb_cursor_del0(MDB_cursor *mc)
                                } else if (m3->mc_ki[mc->mc_top] > ki) {
                                        m3->mc_ki[mc->mc_top]--;
                                }
-                               if (XCURSOR_INITED(m3))
-                                       XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
+                               XCURSOR_REFRESH(m3, mc->mc_top, mp);
                        }
                }
        }
@@ -9302,15 +9363,20 @@ mdb_cursor_del0(MDB_cursor *mc)
                                        }
                                        if (mc->mc_db->md_flags & MDB_DUPSORT) {
                                                MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
-                                               /* If this node is a fake page, it needs to be reinited
-                                                * because its data has moved. But just reset mc_pg[0]
-                                                * if the xcursor is already live.
+                                               /* If this node has dupdata, it may need to be reinited
+                                                * because its data has moved.
+                                                * If the xcursor was not initd it must be reinited.
+                                                * Else if node points to a subDB, nothing is needed.
+                                                * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset.
                                                 */
-                                               if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) {
-                                                       if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)
-                                                               m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
-                                                       else
+                                               if (node->mn_flags & F_DUPDATA) {
+                                                       if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
+                                                               if (!(node->mn_flags & F_SUBDATA))
+                                                                       m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
+                                                       } else {
                                                                mdb_xcursor_init1(m3, node);
+                                                               m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
+                                                       }
                                                }
                                        }
                                }
@@ -9797,8 +9863,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
                                m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
                                m3->mc_ki[ptop]++;
                        }
-                       if (XCURSOR_INITED(m3) && IS_LEAF(mp))
-                               XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
+                       if (IS_LEAF(mp))
+                               XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]);
                }
        }
        DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
@@ -10604,8 +10670,11 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
                MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
                if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
                        return MDB_INCOMPATIBLE;
-       } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) {
-               return rc;
+       } else {
+               if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE))
+                       return rc;
+               if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
+                       return EACCES;
        }
 
        /* Done here so we cannot fail after creating a new DB */