X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblmdb%2Fmdb.c;h=1edc282b814ca220ca3b4e15a458b1e8320edcce;hb=64bd9f6abc740ba6c5f1e124be37293921501bdf;hp=b87099e487f48f9b33155fc2167b7095dfae270b;hpb=00aae125bea7bc0af11d08cf32b50a2bb1757143;p=openldap diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index b87099e487..1edc282b81 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2015 Howard Chu, Symas Corp. + * Copyright 2011-2016 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -198,7 +198,7 @@ typedef SSIZE_T ssize_t; #define ESECT #endif -#ifdef _MSC_VER +#ifdef _WIN32 #define CALL_CONV WINAPI #else #define CALL_CONV @@ -256,8 +256,9 @@ typedef SSIZE_T ssize_t; # define MDB_USE_ROBUST 0 # else # define MDB_USE_ROBUST 1 -/* glibc < 2.10 only provided _np API */ -# if defined(__GLIBC__) && GLIBC_VER < 0x02000a +/* glibc < 2.12 only provided _np API */ +# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ + (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) @@ -1271,7 +1272,7 @@ typedef struct MDB_ntxn { #endif /** max bytes to write in one call */ -#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) +#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) /** Check \b txn and \b dbi arguments to a function */ #define TXN_DBI_EXIST(txn, dbi, validity) \ @@ -1417,8 +1418,9 @@ mdb_strerror(int err) * This works as long as no function between the call to mdb_strerror * and the actual use of the message uses more than 4K of stack. */ - char pad[4096]; - char buf[1024], *ptr = buf; +#define MSGSIZE 1024 +#define PADSIZE 4096 + char buf[MSGSIZE+PADSIZE], *ptr = buf; #endif int i; if (!err) @@ -1450,7 +1452,7 @@ mdb_strerror(int err) buf[0] = 0; FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, 0, ptr, sizeof(buf), (va_list *)pad); + NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE); return ptr; #else return strerror(err); @@ -4470,7 +4472,9 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) #ifdef _WIN32 wchar_t *wlpath; - utf8_to_utf16(lpath, -1, &wlpath, NULL); + rc = utf8_to_utf16(lpath, -1, &wlpath, NULL); + if (rc) + return rc; env->me_lfd = CreateFileW(wlpath, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); @@ -4617,15 +4621,25 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) #else /* MDB_USE_POSIX_MUTEX: */ pthread_mutexattr_t mattr; - if ((rc = pthread_mutexattr_init(&mattr)) - || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) + /* Solaris needs this before initing a robust mutex. Otherwise + * it may skip the init and return EBUSY "seems someone already + * inited" or EINVAL "it was inited differently". + */ + memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); + memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); + + if ((rc = pthread_mutexattr_init(&mattr))) + goto fail; + + rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); #ifdef MDB_ROBUST_SUPPORTED - || (rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST)) + if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); #endif - || (rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr)) - || (rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr))) - goto fail; + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); pthread_mutexattr_destroy(&mattr); + if (rc) + goto fail; #endif /* _WIN32 || MDB_USE_POSIX_SEM */ env->me_txns->mti_magic = MDB_MAGIC; @@ -4758,7 +4772,9 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode len = OPEN_ALWAYS; } mode = FILE_ATTRIBUTE_NORMAL; - utf8_to_utf16(dpath, -1, &wpath, NULL); + rc = utf8_to_utf16(dpath, -1, &wpath, NULL); + if (rc) + goto leave; env->me_fd = CreateFileW(wpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, mode, NULL); free(wpath); @@ -4790,7 +4806,9 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode */ #ifdef _WIN32 len = OPEN_EXISTING; - utf8_to_utf16(dpath, -1, &wpath, NULL); + rc = utf8_to_utf16(dpath, -1, &wpath, NULL); + if (rc) + goto leave; env->me_mfd = CreateFileW(wpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, mode | FILE_FLAG_WRITE_THROUGH, NULL); @@ -5273,7 +5291,11 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) indx_t i; DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); - mdb_cassert(mc, NUMKEYS(mp) > 1); + /* Don't assert on branch pages in the FreeDB. We can get here + * while in the process of rebalancing a FreeDB branch page; we must + * let that proceed. ITS#8336 + */ + mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { @@ -5630,11 +5652,12 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - if (mc->mc_flags & C_EOF) { + if ((mc->mc_flags & C_EOF) || + ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP)) { return MDB_NOTFOUND; } - - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + if (!(mc->mc_flags & C_INITIALIZED)) + return mdb_cursor_first(mc, key, data); mp = mc->mc_pg[mc->mc_top]; @@ -5658,8 +5681,10 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); - if (mc->mc_flags & C_DEL) + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= C_DEL; goto skip; + } if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { DPUTS("=====> move to next sibling page"); @@ -5711,7 +5736,12 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + return rc; + mc->mc_ki[mc->mc_top]++; + } mp = mc->mc_pg[mc->mc_top]; @@ -5738,6 +5768,8 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); + mc->mc_flags &= ~(C_EOF|C_DEL); + if (mc->mc_ki[mc->mc_top] == 0) { DPUTS("=====> move to prev sibling page"); if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { @@ -5749,8 +5781,6 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } else mc->mc_ki[mc->mc_top]--; - mc->mc_flags &= ~C_EOF; - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); @@ -5961,8 +5991,8 @@ set1: if (op == MDB_GET_BOTH || rc > 0) return MDB_NOTFOUND; rc = 0; - *data = olddata; } + *data = olddata; } else { if (mc->mc_xcursor) @@ -6157,10 +6187,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_INCOMPATIBLE; break; } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); if (rc == MDB_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDB_cursor *mx; @@ -6178,21 +6205,11 @@ fetchm: case MDB_NEXT: case MDB_NEXT_DUP: case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, op); + rc = mdb_cursor_next(mc, key, data, op); break; case MDB_PREV: case MDB_PREV_DUP: case MDB_PREV_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) - break; - mc->mc_flags |= C_INITIALIZED; - mc->mc_ki[mc->mc_top]++; - } rc = mdb_cursor_prev(mc, key, data, op); break; case MDB_FIRST: @@ -6481,7 +6498,7 @@ more: #endif /* does data match? */ if (!dcmp(data, &olddata)) { - if (flags & MDB_NODUPDATA) + if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) return MDB_KEYEXIST; /* overwrite it */ goto current; @@ -6629,8 +6646,13 @@ current: /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdb_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the + * parent txn, in case the user peeks at MDB_RESERVEd + * or unused parts. Some users treat ovpages specially. + */ if (!(flags & MDB_RESERVE)) { - /* Copy end of page, adjusting alignment so + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); @@ -6689,7 +6711,6 @@ new_sub: MDB_dbi dbi = mc->mc_dbi; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; - int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -8196,8 +8217,6 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] == ki) { m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) - m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else if (m3->mc_ki[mc->mc_top] > ki) { m3->mc_ki[mc->mc_top]--; } @@ -8231,11 +8250,21 @@ mdb_cursor_del0(MDB_cursor *mc) continue; if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { + MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + if (node->mn_flags & F_DUPDATA) { + mdb_xcursor_init1(m3, node); + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } } } } @@ -8872,7 +8901,7 @@ mdb_env_cthr_toggle(mdb_copy *my, int st) static int ESECT mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { - MDB_cursor mc; + MDB_cursor mc = {0}; MDB_txn *txn = my->mc_txn; MDB_node *ni; MDB_page *mo, *mp, *leaf; @@ -8885,7 +8914,6 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) return MDB_SUCCESS; mc.mc_snum = 1; - mc.mc_top = 0; mc.mc_txn = txn; rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL); @@ -9148,7 +9176,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) MDB_txn *txn = NULL; mdb_mutexref_t wmutex = NULL; int rc; - size_t wsize; + size_t wsize, w3; char *ptr; #ifdef _WIN32 DWORD len, w2; @@ -9207,15 +9235,15 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) if (rc) goto leave; - w2 = txn->mt_next_pgno * env->me_psize; + w3 = txn->mt_next_pgno * env->me_psize; { size_t fsize = 0; if ((rc = mdb_fsize(env->me_fd, &fsize))) goto leave; - if (w2 > fsize) - w2 = fsize; + if (w3 > fsize) + w3 = fsize; } - wsize = w2 - wsize; + wsize = w3 - wsize; while (wsize > 0) { if (wsize > MAX_WRITE) w2 = MAX_WRITE; @@ -9282,7 +9310,9 @@ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) * already in the OS cache. */ #ifdef _WIN32 - utf8_to_utf16(lpath, -1, &wpath, NULL); + rc = utf8_to_utf16(lpath, -1, &wpath, NULL); + if (rc) + goto leave; newfd = CreateFileW(wpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); free(wpath); @@ -9477,6 +9507,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_db dummy; int rc, dbflag, exact; unsigned int unused = 0, seq; + char *namedup; size_t len; if (flags & ~VALID_FLAGS) @@ -9538,8 +9569,16 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) return MDB_INCOMPATIBLE; - } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { - /* Create if requested */ + } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { + return rc; + } + + /* Done here so we cannot fail after creating a new DB */ + if ((namedup = strdup(name)) == NULL) + return ENOMEM; + + if (rc) { + /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ data.mv_size = sizeof(MDB_db); data.mv_data = &dummy; memset(&dummy, 0, sizeof(dummy)); @@ -9549,10 +9588,12 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db dbflag |= DB_DIRTY; } - /* OK, got info, add to table */ - if (rc == MDB_SUCCESS) { + if (rc) { + free(namedup); + } else { + /* Got info, register DBI in this txn */ unsigned int slot = unused ? unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = strdup(name); + txn->mt_dbxs[slot].md_name.mv_data = namedup; txn->mt_dbxs[slot].md_name.mv_size = len; txn->mt_dbxs[slot].md_rel = NULL; txn->mt_dbflags[slot] = dbflag; @@ -9635,8 +9676,11 @@ mdb_drop0(MDB_cursor *mc, int subs) /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. * This also avoids any P_LEAF2 pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no overflow + * pages, omit scanning leaves. */ - if (mc->mc_flags & C_SUB) + if ((mc->mc_flags & C_SUB) || + (!subs && !mc->mc_db->md_overflow_pages)) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); @@ -9658,6 +9702,9 @@ mdb_drop0(MDB_cursor *mc, int subs) pg, omp->mp_pages); if (rc) goto done; + mc->mc_db->md_overflow_pages -= omp->mp_pages; + if (!mc->mc_db->md_overflow_pages && !subs) + break; } else if (subs && (ni->mn_flags & F_SUBDATA)) { mdb_xcursor_init1(mc, ni); rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); @@ -9665,6 +9712,8 @@ mdb_drop0(MDB_cursor *mc, int subs) goto done; } } + if (!subs && !mc->mc_db->md_overflow_pages) + goto pop; } else { if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) goto done; @@ -9686,6 +9735,7 @@ mdb_drop0(MDB_cursor *mc, int subs) /* no more siblings, go back to beginning * of previous level. */ +pop: mdb_cursor_pop(mc); mc->mc_ki[0] = 0; for (i=1; imc_snum; i++) { @@ -10003,6 +10053,8 @@ static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsi if (need == 0) return EINVAL; result = malloc(sizeof(wchar_t) * need); + if (!result) + return ENOMEM; MultiByteToWideChar(CP_UTF8, 0, src, srcsize, result, need); if (dstsize) *dstsize = need;