+ if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
+ MDB_cursor mc2;
+ mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
+ rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 1);
+ if (rc)
+ return rc;
+ *mc->mc_dbflag = DB_DIRTY;
+ }
+ for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
+ rc = mdb_page_touch(mc);
+ if (rc)
+ return rc;
+ }
+ mc->mc_top = mc->mc_snum-1;
+ return MDB_SUCCESS;
+}
+ /** Store a key/data item into the database through a cursor.
+  *
+  * Unless flags is exactly #MDB_CURRENT, the cursor is first positioned on
+  * key with mdb_cursor_set(), or a fresh root leaf page is created when the
+  * database has no root yet. The item is then overwritten in place,
+  * replaced (delete + add) or inserted. For #MDB_DUPSORT databases the data
+  * item is written as a key of the per-key sub-page or sub-database through
+  * a recursive call on the cursor's xcursor.
+  * @param[in] mc the cursor to write through.
+  * @param[in] key the key to store.
+  * @param[in,out] data the data to store. With #MDB_NOOVERWRITE and an
+  * existing key it is overwritten with the stored value; with #MDB_RESERVE
+  * on the in-place overwrite paths its mv_data is pointed at the storage
+  * instead of being copied from; with #MDB_MULTIPLE data[1].mv_size is read
+  * as the number of items and data[0].mv_size as the distance between items.
+  * @param[in] flags write options; the ones tested in this function are
+  * #MDB_CURRENT, #MDB_NOOVERWRITE, #MDB_NODUPDATA, #MDB_RESERVE, #MDB_APPEND
+  * and #MDB_MULTIPLE.
+  * @return #MDB_SUCCESS on success; EACCES for a read-only transaction;
+  * EINVAL for #MDB_CURRENT on an uninitialized cursor or for a key whose
+  * size differs from md_pad on a LEAF2 page; ENOMEM if a page cannot be
+  * allocated; #MDB_KEYEXIST for #MDB_NOOVERWRITE / #MDB_NODUPDATA
+  * collisions; otherwise the error code of the helper that failed.
+  */
+ int
+ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
+ unsigned int flags)
+{
+ MDB_node *leaf = NULL;
+ MDB_val xdata, *rdata, dkey; /* rdata is what gets stored in the node itself */
+ MDB_page *fp;
+ MDB_db dummy;
+ int do_sub = 0; /* set when the item must still be written into the dup sub-page/sub-DB */
+ unsigned int mcount = 0; /* number of items stored so far for MDB_MULTIPLE */
+ size_t nsize;
+ int rc, rc2;
+ MDB_pagebuf pbuf; /* scratch buffer in which a dup sub-page is built */
+ char dbuf[MAXKEYSIZE+1]; /* copy of the old data item while it is converted to a dup */
+ unsigned int nflags;
+ DKBUF;
+ /* Writing is refused in a read-only transaction. */
+ if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
+ return EACCES;
+
+ DPRINTF("==> put db %u key [%s], size %zu, data size %zu",
+ mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size);
+
+ dkey.mv_size = 0; /* stays 0 unless a single item is converted to a dup sub-page below */
+ /* Position the cursor. Afterwards rc is MDB_SUCCESS when an existing
+  * item is to be updated and MDB_NOTFOUND when a new one is inserted.
+  */
+ if (flags == MDB_CURRENT) {
+ if (!(mc->mc_flags & C_INITIALIZED))
+ return EINVAL;
+ rc = MDB_SUCCESS;
+ } else if (mc->mc_db->md_root == P_INVALID) {
+ MDB_page *np;
+ /* new database, write a root leaf page */
+ DPUTS("allocating new root leaf page");
+ if ((np = mdb_page_new(mc, P_LEAF, 1)) == NULL) {
+ return ENOMEM;
+ }
+ mc->mc_snum = 0;
+ mdb_cursor_push(mc, np);
+ mc->mc_db->md_root = np->mp_pgno;
+ mc->mc_db->md_depth++;
+ *mc->mc_dbflag = DB_DIRTY;
+ if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
+ == MDB_DUPFIXED)
+ np->mp_flags |= P_LEAF2; /* MDB_DUPFIXED set without MDB_DUPSORT */
+ mc->mc_flags |= C_INITIALIZED;
+ rc = MDB_NOTFOUND;
+ goto top; /* jumps past the mdb_cursor_touch() call below */
+ } else {
+ int exact = 0;
+ MDB_val d2;
+ rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
+ if ((flags & MDB_NOOVERWRITE) && rc == 0) {
+ DPRINTF("duplicate key [%s]", DKEY(key));
+ *data = d2; /* hand the existing value back to the caller */
+ return MDB_KEYEXIST;
+ }
+ if (rc && rc != MDB_NOTFOUND)
+ return rc;
+ }
+
+ /* Cursor is positioned, now make sure all pages are writable */
+ rc2 = mdb_cursor_touch(mc);
+ if (rc2)
+ return rc2;
+
+top:
+ /* The key already exists */
+ if (rc == MDB_SUCCESS) {
+ /* there's only a key anyway, so this is a no-op */
+ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
+ unsigned int ksize = mc->mc_db->md_pad;
+ if (key->mv_size != ksize)
+ return EINVAL;
+ if (flags == MDB_CURRENT) {
+ char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
+ memcpy(ptr, key->mv_data, ksize);
+ }
+ return MDB_SUCCESS;
+ }
+
+ leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+
+ /* DB has dups? */
+ if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
+ /* Was a single item before, must convert now */
+more: /* also re-entered by the MDB_MULTIPLE loop near the end of the function */
+ if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
+ /* Just overwrite the current item */
+ if (flags == MDB_CURRENT)
+ goto current;
+
+ dkey.mv_size = NODEDSZ(leaf);
+ dkey.mv_data = NODEDATA(leaf);
+#if UINT_MAX < SIZE_MAX
+ if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
+#ifdef MISALIGNED_OK
+ mc->mc_dbx->md_dcmp = mdb_cmp_long;
+#else
+ mc->mc_dbx->md_dcmp = mdb_cmp_cint;
+#endif
+#endif
+ /* if data matches, ignore it */
+ if (!mc->mc_dbx->md_dcmp(data, &dkey))
+ return (flags == MDB_NODUPDATA) ? MDB_KEYEXIST : MDB_SUCCESS;
+
+ /* create a fake page for the dup items; the old item is first
+  * copied to dbuf because its node is deleted below
+  */
+ memcpy(dbuf, dkey.mv_data, dkey.mv_size);
+ dkey.mv_data = dbuf;
+ fp = (MDB_page *)&pbuf;
+ fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
+ fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
+ fp->mp_lower = PAGEHDRSZ;
+ fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
+ if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+ fp->mp_flags |= P_LEAF2;
+ fp->mp_pad = data->mv_size;
+ } else {
+ fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
+ (dkey.mv_size & 1) + (data->mv_size & 1);
+ }
+ mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); /* re-added at new_sub holding the fake page */
+ do_sub = 1;
+ rdata = &xdata;
+ xdata.mv_size = fp->mp_upper; /* mp_upper was computed above as the total size of the fake page */
+ xdata.mv_data = fp;
+ flags |= F_DUPDATA;
+ goto new_sub;
+ }
+ if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
+ /* See if we need to convert from fake page to subDB */
+ MDB_page *mp;
+ unsigned int offset;
+ unsigned int i;
+
+ fp = NODEDATA(leaf);
+ if (flags == MDB_CURRENT) {
+ fp->mp_flags |= P_DIRTY;
+ COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
+ mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
+ flags |= F_DUPDATA;
+ goto put_sub;
+ }
+ /* offset: additional bytes the sub-page needs for the new item */
+ if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+ offset = fp->mp_pad;
+ } else {
+ offset = NODESIZE + sizeof(indx_t) + data->mv_size;
+ }
+ offset += offset & 1; /* round up to an even number of bytes */
+ if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
+ offset >= (mc->mc_txn->mt_env->me_psize - PAGEHDRSZ) /
+ MDB_MINKEYS) {
+ /* yes, convert it: the node will hold an MDB_db record (dummy)
+  * whose root is a newly allocated page
+  */
+ dummy.md_flags = 0;
+ if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+ dummy.md_pad = fp->mp_pad;
+ dummy.md_flags = MDB_DUPFIXED;
+ if (mc->mc_db->md_flags & MDB_INTEGERDUP)
+ dummy.md_flags |= MDB_INTEGERKEY;
+ }
+ dummy.md_depth = 1;
+ dummy.md_branch_pages = 0;
+ dummy.md_leaf_pages = 1;
+ dummy.md_overflow_pages = 0;
+ dummy.md_entries = NUMKEYS(fp);
+ rdata = &xdata;
+ xdata.mv_size = sizeof(MDB_db);
+ xdata.mv_data = &dummy;
+ mp = mdb_page_alloc(mc, 1);
+ if (!mp)
+ return ENOMEM;
+ offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
+ flags |= F_DUPDATA|F_SUBDATA;
+ dummy.md_root = mp->mp_pgno;
+ } else {
+ /* no, just grow it: build the enlarged sub-page in pbuf */
+ rdata = &xdata;
+ xdata.mv_size = NODEDSZ(leaf) + offset;
+ xdata.mv_data = &pbuf;
+ mp = (MDB_page *)&pbuf;
+ mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
+ flags |= F_DUPDATA;
+ }
+ /* Copy the old sub-page contents into mp, shifted up by offset */
+ mp->mp_flags = fp->mp_flags | P_DIRTY;
+ mp->mp_pad = fp->mp_pad;
+ mp->mp_lower = fp->mp_lower;
+ mp->mp_upper = fp->mp_upper + offset;
+ if (IS_LEAF2(fp)) {
+ memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
+ } else {
+ nsize = NODEDSZ(leaf) - fp->mp_upper; /* bytes stored from mp_upper to the end of the sub-page */
+ memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
+ for (i=0; i<NUMKEYS(fp); i++)
+ mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
+ }
+ mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
+ do_sub = 1;
+ goto new_sub;
+ }
+ /* data is on sub-DB, just store it */
+ flags |= F_DUPDATA|F_SUBDATA;
+ goto put_sub;
+ }
+current:
+ /* overflow page overwrites need special handling */
+ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
+ MDB_page *omp;
+ pgno_t pg;
+ int ovpages, dpages;
+
+ ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
+ dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
+ memcpy(&pg, NODEDATA(leaf), sizeof(pg));
+ mdb_page_get(mc->mc_txn, pg, &omp); /* NOTE(review): the result is not checked here - confirm omp is always valid */
+ /* Is the ov page writable and large enough? */
+ if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) {
+ /* yes, overwrite it. Note in this case we don't
+ * bother to try shrinking the node if the new data
+ * is smaller than the overflow threshold.
+ */
+ if (F_ISSET(flags, MDB_RESERVE))
+ data->mv_data = METADATA(omp);
+ else
+ memcpy(METADATA(omp), data->mv_data, data->mv_size);
+ goto done;
+ } else {
+ /* no, free ovpages */
+ int i;
+ mc->mc_db->md_overflow_pages -= ovpages;
+ for (i=0; i<ovpages; i++) {
+ DPRINTF("freed ov page %zu", pg);
+ mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
+ pg++;
+ }
+ }
+ } else if (NODEDSZ(leaf) == data->mv_size) {
+ /* same size, just replace it. Note that we could
+ * also reuse this node if the new data is smaller,
+ * but instead we opt to shrink the node in that case.
+ */
+ if (F_ISSET(flags, MDB_RESERVE))
+ data->mv_data = NODEDATA(leaf);
+ else
+ memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
+ goto done;
+ }
+ /* Different size: drop the old node and fall through to re-add it.
+  * The entry count is decremented here and incremented again after
+  * the add succeeds, unless MDB_CURRENT is set.
+  */
+ mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
+ mc->mc_db->md_entries--;
+ } else {
+ DPRINTF("inserting key at index %i", mc->mc_ki[mc->mc_top]);
+ }
+
+ rdata = data;
+ /* Add the node, splitting the leaf when it has fewer than nsize bytes left. */
+new_sub:
+ nflags = flags & NODE_ADD_FLAGS;
+ nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
+ if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
+ if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
+ nflags &= ~MDB_APPEND;
+ rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
+ } else {
+ /* There is room already in this leaf page. */
+ rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
+ if (rc == 0 && !do_sub) {
+ /* Adjust other cursors pointing to mp */
+ MDB_cursor *m2, *m3;
+ MDB_dbi dbi = mc->mc_dbi;
+ unsigned i = mc->mc_top;
+ MDB_page *mp = mc->mc_pg[i];
+
+ if (mc->mc_flags & C_SUB)
+ dbi--; /* presumably a C_SUB cursor's mc_dbi is one above its owner's - confirm */
+
+ for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
+ if (mc->mc_flags & C_SUB)
+ m3 = &m2->mc_xcursor->mx_cursor;
+ else
+ m3 = m2;
+ if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
+ if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
+ m3->mc_ki[i]++; /* the new node was inserted at or before this cursor's index */
+ }
+ }
+ }
+ }
+
+ if (rc != MDB_SUCCESS)
+ mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+ else {
+ /* Now store the actual data in the child DB. Note that we're
+ * storing the user data in the keys field, so there are strict
+ * size limits on dupdata. The actual data fields of the child
+ * DB are all zero size.
+ */
+ if (do_sub) {
+ int xflags;
+put_sub: /* jumped to directly when the dups already live in a sub-page or sub-DB */
+ xdata.mv_size = 0;
+ xdata.mv_data = "";
+ leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+ if (flags & MDB_CURRENT) {
+ xflags = MDB_CURRENT;
+ } else {
+ mdb_xcursor_init1(mc, leaf);
+ xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE : 0;
+ }
+ /* converted, write the original data first */
+ if (dkey.mv_size) {
+ rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
+ if (rc)
+ return rc;
+ {
+ /* Adjust other cursors pointing to mp */
+ MDB_cursor *m2;
+ unsigned i = mc->mc_top;
+ MDB_page *mp = mc->mc_pg[i];
+
+ for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
+ if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
+ if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
+ mdb_xcursor_init1(m2, leaf);
+ }
+ }
+ }
+ }
+ xflags |= (flags & MDB_APPEND);
+ rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
+ if (flags & F_SUBDATA) {
+ /* write the sub-DB's updated MDB_db record back into the node */
+ void *db = NODEDATA(leaf);
+ memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
+ }
+ }
+ /* sub-writes might have failed so check rc again.
+ * Don't increment count if we just replaced an existing item.
+ */
+ if (!rc && !(flags & MDB_CURRENT))
+ mc->mc_db->md_entries++;
+ if (flags & MDB_MULTIPLE) { /* NOTE(review): rc is not tested before looping - confirm a failed sub-write should continue */
+ mcount++;
+ if (mcount < data[1].mv_size) {
+ data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; /* step to the next item */
+ leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+ goto more;
+ }
+ }
+ }
+done:
+ return rc;
+}
+
+ /** Delete the item the cursor currently points to.
+  *
+  * For a node that holds duplicate data (F_DUPDATA) only the current
+  * duplicate is removed through the cursor's xcursor, unless flags is
+  * exactly #MDB_NODUPDATA or that was the last duplicate; in those cases
+  * the whole node is removed, after the pages of a sub-database
+  * (F_SUBDATA) have been released with mdb_drop0().
+  * @param[in] mc the cursor whose current item is deleted.
+  * @param[in] flags #MDB_NODUPDATA to delete all data items of the current
+  * key; any other value deletes only the current data item.
+  * @return EACCES for a read-only transaction, EINVAL if the cursor is
+  * not initialized, the error of the failing helper otherwise, or the
+  * result of mdb_cursor_del0() when the node itself is removed.
+  */
+ int
+ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
+{
+ 	MDB_node *leaf;
+ 	int rc;
+
+ 	if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
+ 		return EACCES;
+
+ 	/* The parentheses matter: `!mc->mc_flags & C_INITIALIZED` negates the
+ 	 * whole flag word first, so it only rejected a cursor with no flag
+ 	 * bits set at all.
+ 	 */
+ 	if (!(mc->mc_flags & C_INITIALIZED))
+ 		return EINVAL;
+
+ 	/* Make every page on the cursor's path writable before modifying it. */
+ 	rc = mdb_cursor_touch(mc);
+ 	if (rc)
+ 		return rc;
+
+ 	leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+
+ 	if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
+ 		if (flags != MDB_NODUPDATA) {
+ 			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
+ 				/* dups are in a sub-page stored in the node; point the sub-cursor at it */
+ 				mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
+ 			}
+ 			rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, 0);
+ 			/* Propagate a failed sub-delete instead of adjusting the
+ 			 * entry counts or removing the whole node.
+ 			 */
+ 			if (rc)
+ 				return rc;
+ 			/* If sub-DB still has entries, we're done */
+ 			if (mc->mc_xcursor->mx_db.md_entries) {
+ 				if (leaf->mn_flags & F_SUBDATA) {
+ 					/* update subDB info */
+ 					void *db = NODEDATA(leaf);
+ 					memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
+ 				} else {
+ 					/* shrink fake page */
+ 					mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+ 				}
+ 				mc->mc_db->md_entries--;
+ 				return rc;
+ 			}
+ 			/* otherwise fall thru and delete the sub-DB */
+ 		}
+
+ 		if (leaf->mn_flags & F_SUBDATA) {
+ 			/* add all the child DB's pages to the free list */
+ 			rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
+ 			if (rc != MDB_SUCCESS) {
+ 				/* Do not delete the node on top of a failed drop;
+ 				 * flag the transaction as mdb_cursor_put() does
+ 				 * for a failed write.
+ 				 */
+ 				mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+ 				return rc;
+ 			}
+ 			mc->mc_db->md_entries -=
+ 				mc->mc_xcursor->mx_db.md_entries;
+ 		}
+ 	}
+
+ 	return mdb_cursor_del0(mc, leaf);
+}
+
+/** Allocate and initialize new pages for a database.
+ * @param[in] mc a cursor on the database being added to.
+ * @param[in] flags flags defining what type of page is being allocated.
+ * @param[in] num the number of pages to allocate. This is usually 1,
+ * unless allocating overflow pages for a large record.
+ * @return Address of a page, or NULL on failure.
+ */
+static MDB_page *
+mdb_page_new(MDB_cursor *mc, uint32_t flags, int num)
+{
+ MDB_page *np;