-tdb1_incompatible_hash: uint64_t (const void *, size_t, uint64_t, void *)
-tdb_add_flag: void (struct tdb_context *, unsigned int)
-tdb_append: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data)
-tdb_chainlock: enum TDB_ERROR (struct tdb_context *, TDB_DATA)
-tdb_chainlock_read: enum TDB_ERROR (struct tdb_context *, TDB_DATA)
-tdb_chainunlock: void (struct tdb_context *, TDB_DATA)
-tdb_chainunlock_read: void (struct tdb_context *, TDB_DATA)
-tdb_check_: enum TDB_ERROR (struct tdb_context *, enum TDB_ERROR (*)(TDB_DATA, TDB_DATA, void *), void *)
-tdb_close: int (struct tdb_context *)
-tdb_delete: enum TDB_ERROR (struct tdb_context *, struct tdb_data)
-tdb_error: enum TDB_ERROR (struct tdb_context *)
-tdb_errorstr: const char *(enum TDB_ERROR)
-tdb_exists: bool (struct tdb_context *, TDB_DATA)
-tdb_fd: int (const struct tdb_context *)
-tdb_fetch: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data *)
-tdb_firstkey: enum TDB_ERROR (struct tdb_context *, struct tdb_data *)
-tdb_foreach_: void (int (*)(struct tdb_context *, void *), void *)
-tdb_get_attribute: enum TDB_ERROR (struct tdb_context *, union tdb_attribute *)
-tdb_get_flags: unsigned int (struct tdb_context *)
-tdb_get_seqnum: int64_t (struct tdb_context *)
-tdb_lockall: enum TDB_ERROR (struct tdb_context *)
-tdb_lockall_read: enum TDB_ERROR (struct tdb_context *)
-tdb_name: const char *(const struct tdb_context *)
-tdb_nextkey: enum TDB_ERROR (struct tdb_context *, struct tdb_data *)
-tdb_open: struct tdb_context *(const char *, int, int, mode_t, union tdb_attribute *)
-tdb_parse_record_: enum TDB_ERROR (struct tdb_context *, TDB_DATA, enum TDB_ERROR (*)(TDB_DATA, TDB_DATA, void *), void *)
-tdb_remove_flag: void (struct tdb_context *, unsigned int)
-tdb_repack: enum TDB_ERROR (struct tdb_context *)
-tdb_set_attribute: enum TDB_ERROR (struct tdb_context *, const union tdb_attribute *)
-tdb_store: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data, int)
-tdb_summary: enum TDB_ERROR (struct tdb_context *, enum tdb_summary_flags, char **)
-tdb_transaction_cancel: void (struct tdb_context *)
-tdb_transaction_commit: enum TDB_ERROR (struct tdb_context *)
-tdb_transaction_prepare_commit: enum TDB_ERROR (struct tdb_context *)
-tdb_transaction_start: enum TDB_ERROR (struct tdb_context *)
-tdb_traverse_: int64_t (struct tdb_context *, int (*)(struct tdb_context *, TDB_DATA, TDB_DATA, void *), void *)
-tdb_unlockall: void (struct tdb_context *)
-tdb_unlockall_read: void (struct tdb_context *)
-tdb_unset_attribute: void (struct tdb_context *, enum tdb_attribute_type)
-tdb_wipe_all: enum TDB_ERROR (struct tdb_context *)
diff --git a/lib/tdb2/LICENSE b/lib/tdb2/LICENSE
deleted file mode 100644
index cca7fc278f..0000000000
--- a/lib/tdb2/LICENSE
+++ /dev/null
@@ -1,165 +0,0 @@
- Version 3, 29 June 2007
- Copyright (C) 2007 Free Software Foundation, Inc. <>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
- This version of the GNU Lesser General Public License incorporates
-the terms and conditions of version 3 of the GNU General Public
-License, supplemented by the additional permissions listed below.
- 0. Additional Definitions.
- As used herein, "this License" refers to version 3 of the GNU Lesser
-General Public License, and the "GNU GPL" refers to version 3 of the GNU
-General Public License.
- "The Library" refers to a covered work governed by this License,
-other than an Application or a Combined Work as defined below.
- An "Application" is any work that makes use of an interface provided
-by the Library, but which is not otherwise based on the Library.
-Defining a subclass of a class defined by the Library is deemed a mode
-of using an interface provided by the Library.
- A "Combined Work" is a work produced by combining or linking an
-Application with the Library. The particular version of the Library
-with which the Combined Work was made is also called the "Linked
- The "Minimal Corresponding Source" for a Combined Work means the
-Corresponding Source for the Combined Work, excluding any source code
-for portions of the Combined Work that, considered in isolation, are
-based on the Application, and not on the Linked Version.
- The "Corresponding Application Code" for a Combined Work means the
-object code and/or source code for the Application, including any data
-and utility programs needed for reproducing the Combined Work from the
-Application, but excluding the System Libraries of the Combined Work.
- 1. Exception to Section 3 of the GNU GPL.
- You may convey a covered work under sections 3 and 4 of this License
-without being bound by section 3 of the GNU GPL.
- 2. Conveying Modified Versions.
- If you modify a copy of the Library, and, in your modifications, a
-facility refers to a function or data to be supplied by an Application
-that uses the facility (other than as an argument passed when the
-facility is invoked), then you may convey a copy of the modified
- a) under this License, provided that you make a good faith effort to
- ensure that, in the event an Application does not supply the
- function or data, the facility still operates, and performs
- whatever part of its purpose remains meaningful, or
- b) under the GNU GPL, with none of the additional permissions of
- this License applicable to that copy.
- 3. Object Code Incorporating Material from Library Header Files.
- The object code form of an Application may incorporate material from
-a header file that is part of the Library. You may convey such object
-code under terms of your choice, provided that, if the incorporated
-material is not limited to numerical parameters, data structure
-layouts and accessors, or small macros, inline functions and templates
-(ten or fewer lines in length), you do both of the following:
- a) Give prominent notice with each copy of the object code that the
- Library is used in it and that the Library and its use are
- covered by this License.
- b) Accompany the object code with a copy of the GNU GPL and this license
- document.
- 4. Combined Works.
- You may convey a Combined Work under terms of your choice that,
-taken together, effectively do not restrict modification of the
-portions of the Library contained in the Combined Work and reverse
-engineering for debugging such modifications, if you also do each of
-the following:
- a) Give prominent notice with each copy of the Combined Work that
- the Library is used in it and that the Library and its use are
- covered by this License.
- b) Accompany the Combined Work with a copy of the GNU GPL and this license
- document.
- c) For a Combined Work that displays copyright notices during
- execution, include the copyright notice for the Library among
- these notices, as well as a reference directing the user to the
- copies of the GNU GPL and this license document.
- d) Do one of the following:
- 0) Convey the Minimal Corresponding Source under the terms of this
- License, and the Corresponding Application Code in a form
- suitable for, and under terms that permit, the user to
- recombine or relink the Application with a modified version of
- the Linked Version to produce a modified Combined Work, in the
- manner specified by section 6 of the GNU GPL for conveying
- Corresponding Source.
- 1) Use a suitable shared library mechanism for linking with the
- Library. A suitable mechanism is one that (a) uses at run time
- a copy of the Library already present on the user's computer
- system, and (b) will operate properly with a modified version
- of the Library that is interface-compatible with the Linked
- Version.
- e) Provide Installation Information, but only if you would otherwise
- be required to provide such information under section 6 of the
- GNU GPL, and only to the extent that such information is
- necessary to install and execute a modified version of the
- Combined Work produced by recombining or relinking the
- Application with a modified version of the Linked Version. (If
- you use option 4d0, the Installation Information must accompany
- the Minimal Corresponding Source and Corresponding Application
- Code. If you use option 4d1, you must provide the Installation
- Information in the manner specified by section 6 of the GNU GPL
- for conveying Corresponding Source.)
- 5. Combined Libraries.
- You may place library facilities that are a work based on the
-Library side by side in a single library together with other library
-facilities that are not Applications and are not covered by this
-License, and convey such a combined library under terms of your
-choice, if you do both of the following:
- a) Accompany the combined library with a copy of the same work based
- on the Library, uncombined with any other library facilities,
- conveyed under the terms of this License.
- b) Give prominent notice with the combined library that part of it
- is a work based on the Library, and explaining where to find the
- accompanying uncombined form of the same work.
- 6. Revised Versions of the GNU Lesser General Public License.
- The Free Software Foundation may publish revised and/or new versions
-of the GNU Lesser General Public License from time to time. Such new
-versions will be similar in spirit to the present version, but may
-differ in detail to address new problems or concerns.
- Each version is given a distinguishing version number. If the
-Library as you received it specifies that a certain numbered version
-of the GNU Lesser General Public License "or any later version"
-applies to it, you have the option of following the terms and
-conditions either of that published version or of any later version
-published by the Free Software Foundation. If the Library as you
-received it does not specify a version number of the GNU Lesser
-General Public License, you may choose any version of the GNU Lesser
-General Public License ever published by the Free Software Foundation.
- If the Library as you received it specifies that a proxy can decide
-whether future versions of the GNU Lesser General Public License shall
-apply, that proxy's public statement of acceptance of any version is
-permanent authorization for you to choose that version for the
diff --git a/lib/tdb2/Makefile b/lib/tdb2/Makefile
diff --git a/lib/tdb2/_info b/lib/tdb2/_info
deleted file mode 100644
index 37c0c29e99..0000000000
--- a/lib/tdb2/_info
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <string.h>
-#include <stdio.h>
- * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database
- *
- * The tdb2 module provides an efficient keyword data mapping (usually
- * within a file). It supports transactions, so the contents of the
- * database is reliable even across crashes.
- *
- * Example:
- * #include <ccan/tdb2/tdb2.h>
- * #include <ccan/str/str.h>
- * #include <ccan/err/err.h>
- * #include <stdio.h>
- *
- * static void usage(const char *argv0)
- * {
- * errx(1, "Usage: %s fetch <dbfile> <key>\n"
- * "OR %s store <dbfile> <key> <data>", argv0, argv0);
- * }
- *
- * int main(int argc, char *argv[])
- * {
- * struct tdb_context *tdb;
- * TDB_DATA key, value;
- * enum TDB_ERROR error;
- *
- * if (argc < 4)
- * usage(argv[0]);
- *
- * tdb = tdb_open(argv[2], TDB_DEFAULT, O_CREAT|O_RDWR,0600, NULL);
- * if (!tdb)
- * err(1, "Opening %s", argv[2]);
- *
- * key.dptr = (void *)argv[3];
- * key.dsize = strlen(argv[3]);
- *
- * if (streq(argv[1], "fetch")) {
- * if (argc != 4)
- * usage(argv[0]);
- * error = tdb_fetch(tdb, key, &value);
- * if (error)
- * errx(1, "fetch %s: %s",
- * argv[3], tdb_errorstr(error));
- * printf("%.*s\n", value.dsize, (char *)value.dptr);
- * free(value.dptr);
- * } else if (streq(argv[1], "store")) {
- * if (argc != 5)
- * usage(argv[0]);
- * value.dptr = (void *)argv[4];
- * value.dsize = strlen(argv[4]);
- * error = tdb_store(tdb, key, value, 0);
- * if (error)
- * errx(1, "store %s: %s",
- * argv[3], tdb_errorstr(error));
- * } else
- * usage(argv[0]);
- *
- * return 0;
- * }
- *
- * Maintainer: Rusty Russell <>
- *
- * Author: Rusty Russell
- *
- * License: LGPLv3 (or later)
- */
-int main(int argc, char *argv[])
- if (argc != 2)
- return 1;
- if (strcmp(argv[1], "depends") == 0) {
- printf("ccan/asprintf\n");
- printf("ccan/hash\n");
- printf("ccan/likely\n");
- printf("ccan/asearch\n");
- printf("ccan/compiler\n");
- printf("ccan/build_assert\n");
- printf("ccan/ilog\n");
- printf("ccan/failtest\n");
- printf("ccan/tally\n");
- printf("ccan/typesafe_cb\n");
- printf("ccan/cast\n");
- printf("ccan/endian\n");
- return 0;
- }
- return 1;
diff --git a/lib/tdb2/check.c b/lib/tdb2/check.c
deleted file mode 100644
index 4b589b6ee1..0000000000
--- a/lib/tdb2/check.c
+++ /dev/null
@@ -1,864 +0,0 @@
- /*
- Trivial Database 2: free list/block handling
- Copyright (C) Rusty Russell 2010
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/asearch/asearch.h>
-/* We keep an ordered array of offsets. */
-static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
- tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
- if (!new)
- return false;
- new[(*num)++] = off;
- *arr = new;
- return true;
-static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
- uint64_t *features, size_t *num_capabilities)
- uint64_t hash_test;
- struct tdb_header hdr;
- enum TDB_ERROR ecode;
- tdb_off_t off, next;
- ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* magic food should not be converted, so convert back. */
- tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
- hash_test = TDB_HASH_MAGIC;
- hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
- if (hdr.hash_test != hash_test) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "check: hash test %llu should be %llu",
- (long long)hdr.hash_test,
- (long long)hash_test);
- }
- if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "check: bad magic '%.*s'",
- (unsigned)sizeof(hdr.magic_food),
- hdr.magic_food);
- }
- /* Features which are used must be a subset of features offered. */
- if (hdr.features_used & ~hdr.features_offered) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "check: features used (0x%llx) which"
- " are not offered (0x%llx)",
- (long long)hdr.features_used,
- (long long)hdr.features_offered);
- }
- *features = hdr.features_offered;
- *recovery = hdr.recovery;
- if (*recovery) {
- if (*recovery < sizeof(hdr)
- || *recovery > tdb->file->map_size) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check:"
- " invalid recovery offset %zu",
- (size_t)*recovery);
- }
- }
- for (off = hdr.capabilities; off && ecode == TDB_SUCCESS; off = next) {
- const struct tdb_capability *cap;
- enum TDB_ERROR e;
- cap = tdb_access_read(tdb, off, sizeof(*cap), true);
- if (TDB_PTR_IS_ERR(cap)) {
- return TDB_PTR_ERR(cap);
- }
- /* All capabilities are unknown. */
- e = unknown_capability(tdb, "tdb_check", cap->type);
- next = cap->next;
- tdb_access_release(tdb, cap);
- if (e)
- return e;
- (*num_capabilities)++;
- }
- /* Don't check reserved: they *can* be used later. */
- return TDB_SUCCESS;
-static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
- tdb_off_t off, unsigned int group_bits,
- uint64_t hprefix,
- unsigned hprefix_bits,
- tdb_off_t used[],
- size_t num_used,
- size_t *num_found,
- enum TDB_ERROR (*check)(TDB_DATA,
- TDB_DATA, void *),
- void *data);
-static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
- tdb_off_t off,
- uint64_t hash,
- tdb_off_t used[],
- size_t num_used,
- size_t *num_found,
- enum TDB_ERROR (*check)(TDB_DATA,
- void *),
- void *data)
- struct tdb_used_record rec;
- enum TDB_ERROR ecode;
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Bad hash chain magic %llu",
- (long long)rec_magic(&rec));
- }
- if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check:"
- " Bad hash chain length %llu vs %zu",
- (long long)rec_data_length(&rec),
- sizeof(struct tdb_chain));
- }
- if (rec_key_length(&rec) != 0) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Bad hash chain key length %llu",
- (long long)rec_key_length(&rec));
- }
- if (rec_hash(&rec) != 0) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Bad hash chain hash value %llu",
- (long long)rec_hash(&rec));
- }
- off += sizeof(rec);
- ecode = check_hash_tree(tdb, off, 0, hash, 64,
- used, num_used, num_found, check, data);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
- if (TDB_OFF_IS_ERR(off)) {
- return TDB_OFF_TO_ERR(off);
- }
- if (off == 0)
- return TDB_SUCCESS;
- (*num_found)++;
- return check_hash_chain(tdb, off, hash, used, num_used, num_found,
- check, data);
-static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
- tdb_off_t off,
- uint64_t hprefix,
- unsigned hprefix_bits,
- tdb_off_t used[],
- size_t num_used,
- size_t *num_found,
- enum TDB_ERROR (*check)(TDB_DATA,
- void *),
- void *data)
- struct tdb_used_record rec;
- enum TDB_ERROR ecode;
- if (hprefix_bits >= 64)
- return check_hash_chain(tdb, off, hprefix, used, num_used,
- num_found, check, data);
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Bad hash table magic %llu",
- (long long)rec_magic(&rec));
- }
- if (rec_data_length(&rec)
- != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check:"
- " Bad hash table length %llu vs %llu",
- (long long)rec_data_length(&rec),
- (long long)sizeof(tdb_off_t)
- }
- if (rec_key_length(&rec) != 0) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Bad hash table key length %llu",
- (long long)rec_key_length(&rec));
- }
- if (rec_hash(&rec) != 0) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Bad hash table hash value %llu",
- (long long)rec_hash(&rec));
- }
- off += sizeof(rec);
- return check_hash_tree(tdb, off,
- hprefix, hprefix_bits,
- used, num_used, num_found, check, data);
-static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
- /* Can overflow an int. */
- return *a > *b ? 1
- : *a < *b ? -1
- : 0;
-static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
- *used += num;
- return (h >> (64 - *used)) & ((1U << num) - 1);
-static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
- tdb_off_t off, unsigned int group_bits,
- uint64_t hprefix,
- unsigned hprefix_bits,
- tdb_off_t used[],
- size_t num_used,
- size_t *num_found,
- enum TDB_ERROR (*check)(TDB_DATA,
- TDB_DATA, void *),
- void *data)
- unsigned int g, b;
- const tdb_off_t *hash;
- struct tdb_used_record rec;
- enum TDB_ERROR ecode;
- hash = tdb_access_read(tdb, off,
- sizeof(tdb_off_t)
- << (group_bits + TDB_HASH_GROUP_BITS),
- true);
- if (TDB_PTR_IS_ERR(hash)) {
- return TDB_PTR_ERR(hash);
- }
- for (g = 0; g < (1 << group_bits); g++) {
- const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
- for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
- unsigned int bucket, i, used_bits;
- uint64_t h;
- tdb_off_t *p;
- if (group[b] == 0)
- continue;
- off = group[b] & TDB_OFF_MASK;
- p = asearch(&off, used, num_used, off_cmp);
- if (!p) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: Invalid offset"
- " %llu in hash",
- (long long)off);
- goto fail;
- }
- /* Mark it invalid. */
- *p ^= 1;
- (*num_found)++;
- if (hprefix_bits == 64) {
- /* Chained entries are unordered. */
- if (is_subhash(group[b])) {
- ecode = TDB_ERR_CORRUPT;
- tdb_logerr(tdb, ecode,
- "tdb_check: Invalid chain"
- " entry subhash");
- goto fail;
- }
- h = hash_record(tdb, off);
- if (h != hprefix) {
- ecode = TDB_ERR_CORRUPT;
- tdb_logerr(tdb, ecode,
- "check: bad hash chain"
- " placement"
- " 0x%llx vs 0x%llx",
- (long long)h,
- (long long)hprefix);
- goto fail;
- }
- ecode = tdb_read_convert(tdb, off, &rec,
- sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- goto check;
- }
- if (is_subhash(group[b])) {
- uint64_t subprefix;
- subprefix = (hprefix
- << (group_bits + TDB_HASH_GROUP_BITS))
- + g * (1 << TDB_HASH_GROUP_BITS) + b;
- ecode = check_hash_record(tdb,
- group[b] & TDB_OFF_MASK,
- subprefix,
- hprefix_bits
- + group_bits
- used, num_used, num_found,
- check, data);
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- continue;
- }
- /* A normal entry */
- /* Does it belong here at all? */
- h = hash_record(tdb, off);
- used_bits = 0;
- if (get_bits(h, hprefix_bits, &used_bits) != hprefix
- && hprefix_bits) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "check: bad hash placement"
- " 0x%llx vs 0x%llx",
- (long long)h,
- (long long)hprefix);
- goto fail;
- }
- /* Does it belong in this group? */
- if (get_bits(h, group_bits, &used_bits) != g) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "check: bad group %llu"
- " vs %u",
- (long long)h, g);
- goto fail;
- }
- /* Are bucket bits correct? */
- bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
- if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
- != bucket) {
- used_bits -= TDB_HASH_GROUP_BITS;
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "check: bad bucket %u vs %u",
- (unsigned)get_bits(h,
- &used_bits),
- bucket);
- goto fail;
- }
- /* There must not be any zero entries between
- * the bucket it belongs in and this one! */
- for (i = bucket;
- i != b;
- i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
- if (group[i] == 0) {
- ecode = TDB_ERR_CORRUPT;
- tdb_logerr(tdb, ecode,
- "check: bad group placement"
- " %u vs %u",
- b, bucket);
- goto fail;
- }
- }
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- /* Bottom bits must match header. */
- if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: Bad hash magic"
- " at offset %llu"
- " (0x%llx vs 0x%llx)",
- (long long)off,
- (long long)h,
- (long long)rec_hash(&rec));
- goto fail;
- }
- check:
- if (check) {
- TDB_DATA k, d;
- const unsigned char *kptr;
- kptr = tdb_access_read(tdb,
- off + sizeof(rec),
- rec_key_length(&rec)
- + rec_data_length(&rec),
- false);
- if (TDB_PTR_IS_ERR(kptr)) {
- ecode = TDB_PTR_ERR(kptr);
- goto fail;
- }
- k = tdb_mkdata(kptr, rec_key_length(&rec));
- d = tdb_mkdata(kptr + k.dsize,
- rec_data_length(&rec));
- ecode = check(k, d, data);
- tdb_access_release(tdb, kptr);
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- }
- }
- }
- tdb_access_release(tdb, hash);
- return TDB_SUCCESS;
- tdb_access_release(tdb, hash);
- return ecode;
-static enum TDB_ERROR check_hash(struct tdb_context *tdb,
- tdb_off_t used[],
- size_t num_used, size_t num_other_used,
- enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
- void *data)
- /* Free tables and capabilities also show up as used. */
- size_t num_found = num_other_used;
- enum TDB_ERROR ecode;
- ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
- 0, 0, used, num_used, &num_found,
- check, data);
- if (ecode == TDB_SUCCESS) {
- if (num_found != num_used) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Not all entries"
- " are in hash");
- }
- }
- return ecode;
-static enum TDB_ERROR check_free(struct tdb_context *tdb,
- tdb_off_t off,
- const struct tdb_free_record *frec,
- tdb_off_t prev, unsigned int ftable,
- unsigned int bucket)
- enum TDB_ERROR ecode;
- if (frec_magic(frec) != TDB_FREE_MAGIC) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: offset %llu bad magic 0x%llx",
- (long long)off,
- (long long)frec->magic_and_prev);
- }
- if (frec_ftable(frec) != ftable) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: offset %llu bad freetable %u",
- (long long)off, frec_ftable(frec));
- }
- ecode = tdb->io->oob(tdb, off,
- frec_len(frec)
- + sizeof(struct tdb_used_record),
- false);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (size_to_bucket(frec_len(frec)) != bucket) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: offset %llu in wrong bucket"
- " (%u vs %u)",
- (long long)off,
- bucket, size_to_bucket(frec_len(frec)));
- }
- if (prev && prev != frec_prev(frec)) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: offset %llu bad prev"
- " (%llu vs %llu)",
- (long long)off,
- (long long)prev, (long long)frec_len(frec));
- }
- return TDB_SUCCESS;
-static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
- tdb_off_t ftable_off,
- unsigned ftable_num,
- tdb_off_t fr[],
- size_t num_free,
- size_t *num_found)
- struct tdb_freetable ft;
- tdb_off_t h;
- unsigned int i;
- enum TDB_ERROR ecode;
- ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
- || rec_key_length(&ft.hdr) != 0
- || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
- || rec_hash(&ft.hdr) != 0) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Invalid header on free table");
- }
- for (i = 0; i < TDB_FREE_BUCKETS; i++) {
- tdb_off_t off, prev = 0, *p, first = 0;
- struct tdb_free_record f;
- h = bucket_off(ftable_off, i);
- for (off = tdb_read_off(tdb, h); off; off = {
- if (TDB_OFF_IS_ERR(off)) {
- return TDB_OFF_TO_ERR(off);
- }
- if (!first) {
- off &= TDB_OFF_MASK;
- first = off;
- }
- ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- ecode = check_free(tdb, off, &f, prev, ftable_num, i);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* FIXME: Check hash bits */
- p = asearch(&off, fr, num_free, off_cmp);
- if (!p) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: Invalid offset"
- " %llu in free table",
- (long long)off);
- }
- /* Mark it invalid. */
- *p ^= 1;
- (*num_found)++;
- prev = off;
- }
- if (first) {
- /* Now we can check first back pointer. */
- ecode = tdb_read_convert(tdb, first, &f, sizeof(f));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- ecode = check_free(tdb, first, &f, prev, ftable_num, i);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- }
- }
- return TDB_SUCCESS;
-/* Slow, but should be very rare. */
-tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
- size_t len;
- enum TDB_ERROR ecode;
- for (len = 0; off + len < tdb->file->map_size; len++) {
- char c;
- ecode = tdb->io->tread(tdb, off, &c, 1);
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- if (c != 0 && c != 0x43)
- break;
- }
- return len;
-static enum TDB_ERROR check_linear(struct tdb_context *tdb,
- tdb_off_t **used, size_t *num_used,
- tdb_off_t **fr, size_t *num_free,
- uint64_t features, tdb_off_t recovery)
- tdb_off_t off;
- tdb_len_t len;
- enum TDB_ERROR ecode;
- bool found_recovery = false;
- for (off = sizeof(struct tdb_header);
- off < tdb->file->map_size;
- off += len) {
- union {
- struct tdb_used_record u;
- struct tdb_free_record f;
- struct tdb_recovery_record r;
- } rec;
- /* r is larger: only get that if we need to. */
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* If we crash after ftruncate, we can get zeroes or fill. */
- if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
- || rec.r.magic == 0x4343434343434343ULL) {
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (recovery == off) {
- found_recovery = true;
- len = sizeof(rec.r) + rec.r.max_len;
- } else {
- len = dead_space(tdb, off);
- if (TDB_OFF_IS_ERR(len)) {
- return TDB_OFF_TO_ERR(len);
- }
- if (len < sizeof(rec.r)) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: invalid"
- " dead space at %zu",
- (size_t)off);
- }
- tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
- "Dead space at %zu-%zu (of %zu)",
- (size_t)off, (size_t)(off + len),
- (size_t)tdb->file->map_size);
- }
- } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (recovery != off) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: unexpected"
- " recovery record at offset"
- " %zu",
- (size_t)off);
- }
- if (rec.r.len > rec.r.max_len) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: invalid recovery"
- " length %zu",
- (size_t)rec.r.len);
- }
- if (rec.r.eof > tdb->file->map_size) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: invalid old EOF"
- " %zu", (size_t)rec.r.eof);
- }
- found_recovery = true;
- len = sizeof(rec.r) + rec.r.max_len;
- } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
- len = sizeof(rec.u) + frec_len(&rec.f);
- if (off + len > tdb->file->map_size) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: free overlength"
- " %llu at offset %llu",
- (long long)len,
- (long long)off);
- }
- /* This record should be in free lists. */
- if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
- && !append(fr, num_free, off)) {
- return tdb_logerr(tdb, TDB_ERR_OOM,
- "tdb_check: tracking %zu'th"
- " free record.", *num_free);
- }
- } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
- || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
- || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
- || rec_magic(&rec.u) == TDB_FTABLE_MAGIC
- || rec_magic(&rec.u) == TDB_CAP_MAGIC) {
- uint64_t klen, dlen, extra;
- /* This record is used! */
- if (!append(used, num_used, off)) {
- return tdb_logerr(tdb, TDB_ERR_OOM,
- "tdb_check: tracking %zu'th"
- " used record.", *num_used);
- }
- klen = rec_key_length(&rec.u);
- dlen = rec_data_length(&rec.u);
- extra = rec_extra_padding(&rec.u);
- len = sizeof(rec.u) + klen + dlen + extra;
- if (off + len > tdb->file->map_size) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: used overlength"
- " %llu at offset %llu",
- (long long)len,
- (long long)off);
- }
- if (len < sizeof(rec.f)) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: too short record"
- " %llu at %llu",
- (long long)len,
- (long long)off);
- }
- /* Check that records have correct 0 at end (but may
- * not in future). */
- if (extra && !features
- && rec_magic(&rec.u) != TDB_CAP_MAGIC) {
- const char *p;
- char c;
- p = tdb_access_read(tdb, off + sizeof(rec.u)
- + klen + dlen, 1, false);
- if (TDB_PTR_IS_ERR(p))
- return TDB_PTR_ERR(p);
- c = *p;
- tdb_access_release(tdb, p);
- if (c != '\0') {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check:"
- " non-zero extra"
- " at %llu",
- (long long)off);
- }
- }
- } else {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "tdb_check: Bad magic 0x%llx"
- " at offset %zu",
- (long long)rec_magic(&rec.u),
- (size_t)off);
- }
- }
- /* We must have found recovery area if there was one. */
- if (recovery != 0 && !found_recovery) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: expected a recovery area at %zu",
- (size_t)recovery);
- }
- return TDB_SUCCESS;
-_PUBLIC_ enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
- enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
- void *data)
- tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
- size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
- num_capabilities = 0;
- uint64_t features;
- enum TDB_ERROR ecode;
- if (tdb->flags & TDB_CANT_CHECK) {
- return tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
- "tdb_check: database has unknown capability,"
- " cannot check.");
- }
- ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
- if (ecode != TDB_SUCCESS) {
- return tdb->last_error = ecode;
- }
- ecode = tdb_lock_expand(tdb, F_RDLCK);
- if (ecode != TDB_SUCCESS) {
- tdb_allrecord_unlock(tdb, F_RDLCK);
- return tdb->last_error = ecode;
- }
- ecode = check_header(tdb, &recovery, &features, &num_capabilities);
- if (ecode != TDB_SUCCESS)
- goto out;
- /* First we do a linear scan, checking all records. */
- ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features,
- recovery);
- if (ecode != TDB_SUCCESS)
- goto out;
- for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
- if (TDB_OFF_IS_ERR(ft)) {
- ecode = TDB_OFF_TO_ERR(ft);
- goto out;
- }
- ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
- &num_found);
- if (ecode != TDB_SUCCESS)
- goto out;
- num_ftables++;
- }
- /* FIXME: Check key uniqueness? */
- ecode = check_hash(tdb, used, num_used, num_ftables + num_capabilities,
- check, data);
- if (ecode != TDB_SUCCESS)
- goto out;
- if (num_found != num_free) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "tdb_check: Not all entries are in"
- " free table");
- }
- tdb_allrecord_unlock(tdb, F_RDLCK);
- tdb_unlock_expand(tdb, F_RDLCK);
- free(fr);
- free(used);
- return tdb->last_error = ecode;
diff --git a/lib/tdb2/configure b/lib/tdb2/configure
diff --git a/lib/tdb2/doc/TDB1_porting.txt b/lib/tdb2/doc/TDB1_porting.txt
deleted file mode 100644
index e59295c22f..0000000000
--- a/lib/tdb2/doc/TDB1_porting.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-Interface differences between TDB1 and TDB2.
-- tdb2 uses 'struct tdb_data', tdb1 uses 'struct TDB_DATA'. Use the
- TDB_DATA typedef if you want portability between the two.
-- tdb2 functions return 0 on success, and a negative error on failure,
- whereas tdb1 functions returned 0 on success, and -1 on failure.
- tdb1 then used tdb_error() to determine the error; this is also
- supported in tdb2 to ease backwards compatibility, though the other
- form is preferred.
-- tdb2's tdb_fetch() returns an error, tdb1's returned the data directly
- (or tdb_null, and you were supposed to check tdb_error() to find out why).
-- tdb2's tdb_nextkey() frees the old key's dptr, in tdb1 you needed to do
- this manually.
-- tdb1's tdb_open/tdb_open_ex took an explicit hash size. tdb2's hash table
- resizes as required.
-- tdb2 uses a linked list of attribute structures to implement logging and
- alternate hashes. tdb1 used tdb_open_ex, which was not extensible.
-- tdb2 does locking on read-only databases (ie. O_RDONLY passed to tdb_open).
- tdb1 did not: use the TDB_NOLOCK flag if you want to suppress locking.
-- tdb2's log function is simpler than tdb1's log function. The string is
- already formatted, and it takes an enum tdb_log_level not a tdb_debug_level,
- which has only three values: TDB_LOG_ERROR, TDB_LOG_USE_ERROR and
- TDB_LOG_WARNING.
-- tdb2 provides tdb_deq() for comparing two struct tdb_data.
-- tdb2's tdb_name() returns a copy of the name even for TDB_INTERNAL dbs.
-- tdb2 does not need tdb_reopen() or tdb_reopen_all(). If you call
- fork() during certain operations the child should close the
- tdb, or complete the operations before continuing to use the tdb:
- tdb_transaction_start(): child must tdb_transaction_cancel()
- tdb_lockall(): child must call tdb_unlockall()
- tdb_lockall_read(): child must call tdb_unlockall_read()
- tdb_chainlock(): child must call tdb_chainunlock()
- tdb_parse() callback: child must return from tdb_parse()
-- tdb2 will not open a non-tdb file, even if O_CREAT is specified.
-- There is no tdb_traverse_read. For operating on TDB1 files, you can
- simulate it by tdb_add_flag(tdb, TDB_RDONLY); tdb_traverse();
- tdb_remove_flag(tdb, TDB_RDONLY). This may be desirable because
- traverse on TDB1 files use a write lock on the entire database
- unless it's read-only.
-- Failure inside a transaction (such as a lock function failing) does
- not implicitly cancel the transaction; you still need to call
- tdb_transaction_cancel().
-TDB1 Compatibility:
-- tdb2 offers a tdb1_incompatible_hash function, which is the same
- as the default hash with the TDB_INCOMPATIBLE_HASH flag. There is
- no way of marking an old TDB incompatible with versions < 1.2.6
- while using any other hash.
-- The TDB_ATTRIBUTE_TDB1_HASHSIZE attribute can be used to control the
- hash size, but only when creating (ie. O_CREAT) a TDB1
- (ie. TDB_VERSION1).
-- There is no TDB_CLEAR_IF_FIRST flag; it has severe scalability and
- API problems. If necessary, you can emulate this by using the open
- hook and placing a 1-byte lock at offset 4. If your program forks,
- you will need to place this lock again in the child.
diff --git a/lib/tdb2/doc/design-1.3.txt b/lib/tdb2/doc/design-1.3.txt
deleted file mode 100644
index f81ecf7885..0000000000
--- a/lib/tdb2/doc/design-1.3.txt
+++ /dev/null
@@ -1,1049 +0,0 @@
-TDB2: Redesigning The Trivial DataBase
-Rusty Russell, IBM Corporation
-The Trivial DataBase on-disk format is 32 bits; with usage cases
-heading towards the 4G limit, that must change. This required
-breakage provides an opportunity to revisit TDB's other design
-decisions and reassess them.
-1 Introduction
-The Trivial DataBase was originally written by Andrew Tridgell as
-a simple key/data pair storage system with the same API as dbm,
-but allowing multiple readers and writers while being small
-enough (< 1000 lines of C) to include in SAMBA. The simple design
-created in 1999 has proven surprisingly robust and performant,
-used in Samba versions 3 and 4 as well as numerous other
-projects. Its useful life was greatly increased by the
-(backwards-compatible!) addition of transaction support in 2005.
-The wider variety and greater demands of TDB-using code has led
-to some organic growth of the API, as well as some compromises on
-the implementation. None of these, by themselves, are seen as
-show-stoppers, but the cumulative effect is a loss of elegance
-over the initial, simple TDB implementation. Here is a table of
-the approximate number of lines of implementation code and number
-of API functions at the end of each year:
-| Year End | API Functions | Lines of C Code Implementation |
-| 1999 | 13 | 1195 |
-| 2000 | 24 | 1725 |
-| 2001 | 32 | 2228 |
-| 2002 | 35 | 2481 |
-| 2003 | 35 | 2552 |
-| 2004 | 40 | 2584 |
-| 2005 | 38 | 2647 |
-| 2006 | 52 | 3754 |
-| 2007 | 66 | 4398 |
-| 2008 | 71 | 4768 |
-| 2009 | 73 | 5715 |
-This review is an attempt to catalog and address all the known
-issues with TDB and create solutions which address the problems
-without significantly increasing complexity; all involved are far
-too aware of the dangers of second system syndrome in rewriting a
-successful project like this.
-2 API Issues
-2.1 tdb_open_ex Is Not Expandable
-The tdb_open() call was expanded to tdb_open_ex(), which added an
-optional hashing function and an optional logging function
-argument. Additional arguments to open would require the
-introduction of a tdb_open_ex2 call etc.
-2.1.1 Proposed Solution
-tdb_open() will take a linked-list of attributes:
-enum tdb_attribute {
-struct tdb_attribute_base {
- enum tdb_attribute attr;
- union tdb_attribute *next;
-struct tdb_attribute_log {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
- tdb_log_func log_fn;
- void *log_private;
-struct tdb_attribute_hash {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
- tdb_hash_func hash_fn;
- void *hash_private;
-union tdb_attribute {
- struct tdb_attribute_base base;
- struct tdb_attribute_log log;
- struct tdb_attribute_hash hash;
-This allows future attributes to be added, even if this expands
-the size of the union.
-2.2 tdb_traverse Makes Impossible Guarantees
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
-and it was thought that it was important to guarantee that all
-records which exist at the start and end of the traversal would
-be included, and no record would be included twice.
-This adds complexity (see[Reliable-Traversal-Adds]) and does not
-work anyway for records which are altered (in particular, those
-which are expanded may be effectively deleted and re-added behind
-the traversal).
-2.2.1 <traverse-Proposed-Solution>Proposed Solution
-Abandon the guarantee. You will see every record if no changes
-occur during your traversal, otherwise you will see some subset.
-You can prevent changes by using a transaction or the locking API.
-2.3 Nesting of Transactions Is Fraught
-TDB has alternated between allowing nested transactions and not
-allowing them. Various paths in the Samba codebase assume that
-transactions will nest, and in a sense they can: the operation is
-only committed to disk when the outer transaction is committed.
-There are two problems, however:
-1. Canceling the inner transaction will cause the outer
- transaction commit to fail, and will not undo any operations
- since the inner transaction began. This problem is soluble with
- some additional internal code.
-2. An inner transaction commit can be cancelled by the outer
- transaction. This is desirable in the way which Samba's
- database initialization code uses transactions, but could be a
- surprise to any users expecting a successful transaction commit
- to expose changes to others.
-The current solution is to specify the behavior at tdb_open(),
-with the default currently that nested transactions are allowed.
-This flag can also be changed at runtime.
-2.3.1 Proposed Solution
-Given the usage patterns, it seems that the “least-surprise”
-behavior of disallowing nested transactions should become the
-default. Additionally, it seems the outer transaction is the only
-code which knows whether inner transactions should be allowed, so
-a flag to indicate this could be added to tdb_transaction_start.
-However, this behavior can be simulated with a wrapper which uses
-tdb_add_flags() and tdb_remove_flags(), so the API should not be
-expanded for this relatively-obscure case.
-2.4 Incorrect Hash Function is Not Detected
-tdb_open_ex() allows the calling code to specify a different hash
-function to use, but does not check that all other processes
-accessing this tdb are using the same hash function. The result
-is that records are missing from tdb_fetch().
-2.4.1 Proposed Solution
-The header should contain an example hash result (eg. the hash of
-0xdeadbeef), and tdb_open_ex() should check that the given hash
-function produces the same answer, or fail the tdb_open call.
-2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-In response to scalability issues with the free list ([TDB-Freelist-Is]
-) two API workarounds have been incorporated in TDB:
-tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
-latter actually calls the former with an argument of “5”.
-This code allows deleted records to accumulate without putting
-them in the free list. On delete we iterate through each chain
-and free them in a batch if there are more than max_dead entries.
-These are never otherwise recycled except as a side-effect of a
-tdb_repack.
-2.5.1 Proposed Solution
-With the scalability problems of the freelist solved, this API
-can be removed. The TDB_VOLATILE flag may still be useful as a
-hint that store and delete of records will be at least as common
-as fetch in order to allow some internal tuning, but initially
-will become a no-op.
-2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
- In The Same Process
-No process can open the same TDB twice; we check and disallow it.
-This is an unfortunate side-effect of fcntl locks, which operate
-on a per-file rather than per-file-descriptor basis, and do not
-nest. Thus, closing any file descriptor on a file clears all the
-locks obtained by this process, even if they were placed using a
-different file descriptor!
-Note that even if this were solved, deadlock could occur if
-operations were nested: this is a more manageable programming
-error in most cases.
-2.6.1 Proposed Solution
-We could lobby POSIX to fix the perverse rules, or at least lobby
-Linux to violate them so that the most common implementation does
-not have this restriction. This would be a generally good idea
-for other fcntl lock users.
-Samba uses a wrapper which hands out the same tdb_context to
-multiple callers if this happens, and does simple reference
-counting. We should do this inside the tdb library, which already
-emulates lock nesting internally; it would need to recognize when
-deadlock occurs within a single process. This would create a new
-failure mode for tdb operations (while we currently handle
-locking failures, they are impossible in normal use and a process
-encountering them can do little but give up).
-I do not see benefit in an additional tdb_open flag to indicate
-whether re-opening is allowed, though there may be some
-benefit to adding a call to detect when a tdb_context is shared,
-to allow others to create such an API.
-2.7 TDB API Is Not POSIX Thread-safe
-The TDB API uses an error code which can be queried after an
-operation to determine what went wrong. This programming model
-does not work with threads, unless specific additional guarantees
-are given by the implementation. In addition, even
-otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]).
-2.7.1 Proposed Solution
-Rearchitecting the API to include a tdb_errcode pointer would be a
-great deal of churn; we are better to guarantee that the
-tdb_errcode is per-thread so the current programming model can be
-maintained.
-This requires dynamic per-thread allocations, which is awkward
-with POSIX threads (pthread_key_create space is limited and we
-cannot simply allocate a key for every TDB).
-Internal locking is required to make sure that fcntl locks do not
-overlap between threads, and also that the global list of tdbs is
-maintained.
-The aim is that building tdb with -DTDB_PTHREAD will result in a
-pthread-safe version of the library, and otherwise no overhead
-will exist.
-2.8 *_nonblock Functions And *_mark Functions Expose
- Implementation
-CTDB (Clustered TDB) wishes to operate on TDB in a non-blocking
-manner. This is
-currently done as follows:
-1. Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock). If this fails:
-2. Fork a child process, and wait for it to call the normal
- variant (eg. tdb_lockall).
-3. If the child succeeds, call the _mark variant to indicate we
- already have the locks (eg. tdb_lockall_mark).
-4. Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-5. Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-There are several issues with this approach. Firstly, adding two
-new variants of each function clutters the API for an obscure
-use, and so not all functions have three variants. Secondly, it
-assumes that all paths of the functions ask for the same locks,
-otherwise the parent process will have to get a lock which the
-child doesn't have under some circumstances. I don't believe this
-is currently the case, but it constrains the implementation.
-2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
-Implement a hook for locking methods, so that the caller can
-control the calls to create and remove fcntl locks. In this
-scenario, ctdbd would operate as follows:
-1. Call the normal API function, eg tdb_lockall().
-2. When the lock callback comes in, check if the child has the
- lock. Initially, this is always false. If so, return 0.
- Otherwise, try to obtain it in non-blocking mode. If that
- fails, return EWOULDBLOCK.
-3. Release locks in the unlock callback as normal.
-4. If tdb_lockall() fails, see if we recorded a lock failure; if
- so, call the child to repeat the operation.
-5. The child records what locks it obtains, and returns that
- information to the parent.
-6. When the child has succeeded, goto 1.
-This is flexible enough to handle any potential locking scenario,
-even when lock requirements change. It can be optimized so that
-the parent does not release locks, just tells the child which
-locks it doesn't need to obtain.
-It also keeps the complexity out of the API, and in ctdbd where
-it is needed.
-2.9 tdb_chainlock Functions Expose Implementation
-tdb_chainlock locks some number of records, including the record
-indicated by the given key. This gave atomicity guarantees;
-no-one can start a transaction, alter, read or delete that key
-while the lock is held.
-It also makes the same guarantee for any other key in the chain,
-which is an internal implementation detail and potentially a
-cause for deadlock.
-2.9.1 Proposed Solution
-None. It would be nice to have an explicit single entry lock
-which affected no other keys. Unfortunately, this won't work for
-an entry which doesn't exist. Thus while chainlock may be
-implemented more efficiently for the existing case, it will still
-have overlap issues with the non-existing case. So it is best to
-keep the current (lack of) guarantee about which records will be
-affected to avoid constraining our implementation.
-2.10 Signal Handling is Not Race-Free
-The tdb_setalarm_sigptr() call allows the caller's signal handler
-to indicate that the tdb locking code should return with a
-failure, rather than trying again when a signal is received (and
-errno == EAGAIN). This is usually used to implement timeouts.
-Unfortunately, this does not work in the case where the signal is
-received before the tdb code enters the fcntl() call to place the
-lock: the code will sleep within the fcntl() code, unaware that
-the signal wants it to exit. In the case of long timeouts, this
-does not happen in practice.
-2.10.1 Proposed Solution
-The locking hooks proposed in[Proposed-Solution-locking-hook]
-would allow the user to decide on whether to fail the lock
-acquisition on a signal. This allows the caller to choose their
-own compromise: they could narrow the race by checking
-immediately before the fcntl call.[footnote:
-It may be possible to make this race-free in some implementations
-by having the signal handler alter the struct flock to make it
-invalid. This will cause the fcntl() lock call to fail with
-EINVAL if the signal occurs before the kernel is entered,
-otherwise EAGAIN.
-]
-2.11 The API Uses Gratuitous Typedefs, Capitals
-typedefs are useful for providing source compatibility when types
-can differ across implementations, or arguably in the case of
-function pointer definitions which are hard for humans to parse.
-Otherwise it is simply obfuscation and pollutes the namespace.
-Capitalization is usually reserved for compile-time constants and
-macros.
- TDB_CONTEXT There is no reason to use this over 'struct
- tdb_context'; the definition isn't visible to the API user
- anyway.
- TDB_DATA There is no reason to use this over struct TDB_DATA;
- the struct needs to be understood by the API user.
- struct TDB_DATA This would normally be called 'struct
- tdb_data'.
- enum TDB_ERROR Similarly, this would normally be enum
- tdb_error.
-2.11.1 Proposed Solution
-None. Introducing lower case variants would please pedants like
-myself, but if it were done the existing ones should be kept.
-There is little point forcing a purely cosmetic change upon tdb
-users.
-2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
- Private Pointer
-For API compatibility reasons, the logging function needs to call
-tdb_get_logging_private() to retrieve the pointer registered by
-the tdb_open_ex for logging.
-2.12.1 Proposed Solution
-It should simply take an extra argument, since we are prepared to
-break the API/ABI.
-2.13 Various Callback Functions Are Not Typesafe
-The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
-and tdb_check all take void * and must internally convert it to
-the argument type they were expecting.
-If this type changes, the compiler will not produce warnings on
-the callers, since it only sees void *.
-2.13.1 Proposed Solution
-With careful use of macros, we can create callback functions
-which give a warning when used on gcc and the types of the
-callback and its private argument differ. Unsupported compilers
-will not give a warning, which is no worse than now. In addition,
-the callbacks become clearer, as they need not use void * for
-their parameter.
-See CCAN's typesafe_cb module at
-http://ccan.ozlabs.org/info/typesafe_cb.html
-2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
- tdb_reopen_all Problematic
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
-file should be cleared if the caller discovers it is the only
-process with the TDB open. However, if any caller does not
-specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
-the TDB erased underneath them (usually resulting in a crash).
-There is a similar issue on fork(); if the parent exits (or
-otherwise closes the tdb) before the child calls tdb_reopen_all()
-to establish the lock used to indicate the TDB is opened by
-someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
-it alone has opened the TDB and will erase it.
-2.14.1 Proposed Solution
-Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
-see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
-3 Performance And Scalability Issues
-3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
- Imposes Performance Penalty
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
-placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
-never conflict in normal tdb usage, they do add substantial
-overhead for most fcntl lock implementations when the kernel
-scans to detect if a lock conflict exists. This is often a single
-linked list, making the time to acquire and release a fcntl lock
-O(N) where N is the number of processes with the TDB open, not
-the number actually doing work.
-In a Samba server it is common to have huge numbers of clients
-sitting idle, and thus they have weaned themselves off the
-TDB_CLEAR_IF_FIRST flag.[footnote:
-There is a flag to tdb_reopen_all() which is used for this
-optimization: if the parent process will outlive the child, the
-child does not need the ACTIVE_LOCK. This is a workaround for
-this very performance issue.
-]
-3.1.1 Proposed Solution
-Remove the flag. It was a neat idea, but even trivial servers
-tend to know when they are initializing for the first time and
-can simply unlink the old tdb at that point.
-3.2 TDB Files Have a 4G Limit
-This seems to be becoming an issue (so much for “trivial”!),
-particularly for ldb.
-3.2.1 Proposed Solution
-A new, incompatible TDB format which uses 64 bit offsets
-internally rather than 32 bit as now. For simplicity of endian
-conversion (which TDB does on the fly if required), all values
-will be 64 bit on disk. In practice, some upper bits may be used
-for other purposes, but at least 56 bits will be available for
-file offsets.
-tdb_open() will automatically detect the old version, and even
-create them if TDB_VERSION6 is specified to tdb_open.
-32 bit processes will still be able to access TDBs larger than 4G
-(assuming that their off_t allows them to seek to 64 bits), they
-will gracefully fall back as they fail to mmap. This can happen
-already with large TDBs.
-Old versions of tdb will fail to open the new TDB files (since 28
-August 2009, commit 398d0c29290: prior to that any unrecognized
-file format would be erased and initialized as a fresh tdb!)
-3.3 TDB Records Have a 4G Limit
-This has not been a reported problem, and the API uses size_t
-which can be 64 bit on 64 bit platforms. However, other limits
-may have made such an issue moot.
-3.3.1 Proposed Solution
-Record sizes will be 64 bit, with an error returned on 32 bit
-platforms which try to access such records (the current
-implementation would return TDB_ERR_OOM in a similar case). It
-seems unlikely that 32 bit keys will be a limitation, so the
-implementation may not support this (see [sub:Records-Incur-A]).
-3.4 Hash Size Is Determined At TDB Creation Time
-TDB contains a number of hash chains in the header; the number is
-specified at creation time, and defaults to 131. This is such a
-bottleneck on large databases (as each hash chain gets quite
-long), that LDB uses 10,000 for this hash. In general it is
-impossible to know what the 'right' answer is at database
-creation time.
-3.4.1 Proposed Solution
-After comprehensive performance testing on various scalable hash
-variants[footnote:
-This was annoying because I was previously convinced that an
-expanding tree of hashes would be very close to optimal.
-], it became clear that it is hard to beat a straight linear hash
-table which doubles in size when it reaches saturation. There are
-three details which become important:
-1. On encountering a full bucket, we use the next bucket.
-2. Extra hash bits are stored with the offset, to reduce
- comparisons.
-3. A marker entry is used on deleting an entry.
-The doubling of the table must be done under a transaction; we
-will not reduce it on deletion, so it will be an unusual case. It
-will either be placed at the head (other entries will be moved
-out the way so we can expand). We could have a pointer in the
-header to the current hashtable location, but that pointer would
-have to be read frequently to check for hashtable moves.
-The locking for this is slightly more complex than the chained
-case; we currently have one lock per bucket, and that means we
-would need to expand the lock if we overflow to the next bucket.
-The frequency of such collisions will affect our locking
-heuristics: we can always lock more buckets than we need.
-One possible optimization is to only re-check the hash size on an
-insert or a lookup miss.
-3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
-TDB uses a single linked list for the free list. Allocation
-occurs as follows, using heuristics which have evolved over time:
-1. Get the free list lock for this whole operation.
-2. Multiply length by 1.25, so we always over-allocate by 25%.
-3. Set the slack multiplier to 1.
-4. Examine the current freelist entry: if it is > length but <
- the current best case, remember it as the best case.
-5. Multiply the slack multiplier by 1.05.
-6. If our best fit so far is less than length * slack multiplier,
- return it. The slack will be turned into a new free record if
- it's large enough.
-7. Otherwise, go onto the next freelist entry.
-Deleting a record occurs as follows:
-1. Lock the hash chain for this whole operation.
-2. Walk the chain to find the record, keeping the prev pointer
- offset.
-3. If max_dead is non-zero:
- (a) Walk the hash chain again and count the dead records.
- (b) If it's more than max_dead, bulk free all the dead ones
- (similar to steps 4 and below, but the lock is only obtained
- once).
- (c) Simply mark this record as dead and return.
-4. Get the free list lock for the remainder of this operation.
-5. <right-merging>Examine the following block to see if it is
- free; if so, enlarge the current block and remove that block
- from the free list. This was disabled, as removal from the free
- list was O(entries-in-free-list).
-6. Examine the preceding block to see if it is free: for this
- reason, each block has a 32-bit tailer which indicates its
- length. If it is free, expand it to cover our new block and
- return.
-7. Otherwise, prepend ourselves to the free list.
-Disabling right-merging (step [right-merging]) causes
-fragmentation; the other heuristics proved insufficient to
-address this, so the final answer to this was that when we expand
-the TDB file inside a transaction commit, we repack the entire
-db.
-The single list lock limits our allocation rate; due to the other
-issues this is not currently seen as a bottleneck.
-3.5.1 Proposed Solution
-The first step is to remove all the current heuristics, as they
-obviously interact, then examine them once the lock contention is
-addressed.
-The free list must be split to reduce contention. Assuming
-perfect free merging, we can at most have 1 free list entry for
-each entry. This implies that the number of free lists is related
-to the size of the hash table, but as it is rare to walk a large
-number of free list entries we can use far fewer, say 1/32 of the
-number of hash buckets.
-There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
-) but it's not clear this would reduce contention in the common
-case where all processes are allocating/freeing the same size.
-Thus we almost certainly need to divide in other ways: the most
-obvious is to divide the file into zones, and using a free list
-(or set of free lists) for each. This approximates address
-ordering.
-Note that this means we need to split the free lists when we
-expand the file; this is probably acceptable when we double the
-hash table size, since that is such an expensive operation
-already. In the case of increasing the file size, there is an
-optimization we can use: if we use M in the formula above as the
-file size rounded up to the next power of 2, we only need
-reshuffle free lists when the file size crosses a power of 2
-boundary, and reshuffling the free lists is trivial: we simply
-merge every consecutive pair of free lists.
-The basic algorithm is as follows. Freeing is simple:
-1. Identify the correct zone.
-2. Lock the corresponding list.
-3. Re-check the zone (we didn't have a lock, sizes could have
- changed): relock if necessary.
-4. Place the freed entry in the list for that zone.
-Allocation is a little more complicated, as we perform delayed
-coalescing at this point:
-1. Pick a zone either the zone we last freed into, or based on a “
- random” number.
-2. Lock the corresponding list.
-3. Re-check the zone: relock if necessary.
-4. If the top entry is large enough, remove it from the list and
- return it.
-5. Otherwise, coalesce entries in the list.
- (a)
- (b)
- (c)
- (d)
-6. If there was no entry large enough, unlock the list and try
- the next zone.
-9. If no zone satisfies, expand the file.
-This optimizes rapid insert/delete of free list entries by not
-coalescing them all the time. First-fit address
-ordering seems to be fairly good for keeping fragmentation low
-(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
-does not need a tailer to coalesce, though if we needed one we
-could have one cheaply: see [sub:Records-Incur-A].
-I anticipate that the number of entries in each free zone would
-be small, but it might be worth using one free entry to hold
-pointers to the others for cache efficiency.
-3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
-Much of this is a result of allocation strategy[footnote:
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
-] and deliberate hobbling of coalescing; internal fragmentation
-(aka overallocation) is deliberately set at 25%, and external
-fragmentation is only cured by the decision to repack the entire
-db when a transaction commit needs to enlarge the file.
-3.6.1 Proposed Solution
-The 25% overhead on allocation works in practice for ldb because
-indexes tend to expand by one record at a time. This internal
-fragmentation can be resolved by having an “expanded” bit in the
-header to note entries that have previously expanded, and
-allocating more space for them.
-There is a spectrum of possible solutions for external
-fragmentation: one is to use a fragmentation-avoiding allocation
-strategy such as best-fit address-order allocator. The other end
-of the spectrum would be to use a bump allocator (very fast and
-simple) and simply repack the file when we reach the end.
-There are three problems with efficient fragmentation-avoiding
-allocators: they are non-trivial, they tend to use a single free
-list for each size, and there's no evidence that tdb allocation
-patterns will match those recorded for general allocators (though
-it seems likely).
-Thus we don't spend too much effort on external fragmentation; we
-will be no worse than the current code if we need to repack on
-occasion. More effort is spent on reducing freelist contention,
-and reducing overhead.
-3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
-Each TDB record has a header as follows:
-struct tdb_record {
- tdb_off_t next; /* offset of the next record in the list */
- tdb_len_t rec_len; /* total byte length of record */
- tdb_len_t key_len; /* byte length of key */
- tdb_len_t data_len; /* byte length of data */
- uint32_t full_hash; /* the full 32 bit hash of the key */
- uint32_t magic; /* try to catch errors */
- /* the following union is implied:
- union {
- char record[rec_len];
- struct {
- char key[key_len];
- char data[data_len];
- }
- uint32_t totalsize; (tailer)
- }
- */
-Naively, this would double to a 56-byte overhead on a 64 bit
-implementation.
-3.7.1 Proposed Solution
-We can use various techniques to reduce this for an allocated
-block:
-1. The 'next' pointer is not required, as we are using a flat
- hash table.
-2. 'rec_len' can instead be expressed as an addition to key_len
- and data_len (it accounts for wasted or overallocated length in
- the record). Since the record length is always a multiple of 8,
- we can conveniently fit it in 32 bits (representing up to 35
- bits).
-3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
- restrict 'data_len' to 32 bits, but instead we can combine the
- two into one 64-bit field and using a 5 bit value which
- indicates at what bit to divide the two. Keys are unlikely to
- scale as fast as data, so I'm assuming a maximum key size of 32
- bits.
-4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
- this is diminishing returns after a handful of bits (at 10
- bits, it reduces 99.9% of false memcmp). As an aside, as the
- lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here.
-5. 'magic' does not need to be enlarged: it currently reflects
- one of 5 values (used, free, dead, recovery, and
- unused_recovery). It is useful for quick sanity checking
- however, and should not be eliminated.
-6. 'tailer' is only used to coalesce free blocks (so a block to
- the right can find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of
- the following block (and the tailer only exists in free
- blocks).[footnote:
-This technique from Thomas Standish. Data Structure Techniques.
-Addison-Wesley, Reading, Massachusetts, 1980.
-] The current proposed coalescing algorithm doesn't need this,
- however.
-This produces a 16 byte used header like this:
-struct tdb_used_record {
- uint32_t magic : 16,
- prev_is_free: 1,
- key_data_divide: 5,
- top_hash: 10;
- uint32_t extra_octets;
- uint64_t key_and_data_len;
-And a free record like this:
-struct tdb_free_record {
- uint32_t free_magic;
- uint64_t total_length;
- ...
- uint64_t tailer;
-3.8 Transaction Commit Requires 4 fdatasync
-The current transaction algorithm is:
-1. write_recovery_data();
-2. sync();
-3. write_recovery_header();
-4. sync();
-5. overwrite_with_new_data();
-6. sync();
-7. remove_recovery_header();
-8. sync();
-On current ext3, each sync flushes all data to disk, so the next
-3 syncs are relatively inexpensive. But this could become a
-performance bottleneck on other filesystems such as ext4.
-3.8.1 Proposed Solution
-Neil Brown points out that this is overzealous, and only one sync
-is needed:
-1. Bundle the recovery data, a transaction counter and a strong
- checksum of the new data.
-2. Strong checksum that whole bundle.
-3. Store the bundle in the database.
-4. Overwrite the oldest of the two recovery pointers in the
- header (identified using the transaction counter) with the
- offset of this bundle.
-5. sync.
-6. Write the new data to the file.
-Checking for recovery means identifying the latest bundle with a
-valid checksum and using the new data checksum to ensure that it
-has been applied. This is more expensive than the current check,
-but need only be done at open. For running databases, a separate
-header field can be used to indicate a transaction in progress;
-we need only check for recovery if this is set.
-3.9 TDB Does Not Have Snapshot Support
-3.9.1 Proposed Solution
-None. At some point you say “use a real database”.
-But as a thought experiment, if we implemented transactions to
-only overwrite free entries (this is tricky: there must not be a
-header in each entry which indicates whether it is free, but use
-of presence in metadata elsewhere), and a pointer to the hash
-table, we could create an entirely new commit without destroying
-existing data. Then it would be easy to implement snapshots in a
-similar way.
-This would not allow arbitrary changes to the database, such as
-tdb_repack does, and would require more space (since we have to
-preserve the current and future entries at once). If we used hash
-trees rather than one big hash table, we might only have to
-rewrite some sections of the hash, too.
-We could then implement snapshots using a similar method, using
-multiple different hash tables/free tables.
-3.10 Transactions Cannot Operate in Parallel
-This would be useless for ldb, as it hits the index records with
-just about every update. It would add significant complexity in
-resolving clashes, and cause all transaction callers to write
-their code to loop in the case where the transactions spuriously failed.
-3.10.1 Proposed Solution
-We could solve a small part of the problem by providing read-only
-transactions. These would allow one write transaction to begin,
-but it could not commit until all r/o transactions are done. This
-would require a new RO_TRANSACTION_LOCK, which would be upgraded
-on commit.
-3.11 Default Hash Function Is Suboptimal
-The Knuth-inspired multiplicative hash used by tdb is fairly slow
-(especially if we expand it to 64 bits), and works best when the
-hash bucket size is a prime number (which also means a slow
-modulus). In addition, it is highly predictable which could
-potentially lead to a Denial of Service attack in some TDB uses.
-3.11.1 Proposed Solution
-The Jenkins lookup3 hash[footnote:
-] is a fast and superbly-mixing hash. It's used by the Linux
-kernel and almost everything else. This has the particular
-properties that it takes an initial seed, and produces two 32 bit
-hash numbers, which we can combine into a 64-bit hash.
-The seed should be created at tdb-creation time from some random
-source, and placed in the header. This is far from foolproof, but
-adds a little bit of protection against hash bombing.
-3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
-We lock a record during traversal iteration, and try to grab that
-lock in the delete code. If that grab on delete fails, we simply
-mark it deleted and continue onwards; traversal checks for this
-condition and does the delete when it moves off the record.
-If traversal terminates, the dead record may be left indefinitely.
-3.12.1 Proposed Solution
-Remove reliability guarantees; see [traverse-Proposed-Solution].
-3.13 Fcntl Locking Adds Overhead
-Placing a fcntl lock means a system call, as does removing one.
-This is actually one reason why transactions can be faster
-(everything is locked once at transaction start). In the
-uncontended case, this overhead can theoretically be eliminated.
-3.13.1 Proposed Solution
-We tried this before with spinlock support, in the early days of
-TDB, and it didn't make much difference except in manufactured benchmarks.
-We could use spinlocks (with futex kernel support under Linux),
-but it means that we lose automatic cleanup when a process dies
-with a lock. There is a method of auto-cleanup under Linux, but
-it's not supported by other operating systems. We could
-reintroduce a clear-if-first-style lock and sweep for dead
-futexes on open, but that wouldn't help the normal case of one
-concurrent opener dying. Increasingly elaborate repair schemes
-could be considered, but they require an ABI change (everyone
-must use them) anyway, so there's no need to do this at the same
-time as everything else.
diff --git a/lib/tdb2/doc/design.lyx b/lib/tdb2/doc/design.lyx
deleted file mode 100644
index 0a1d6a14bc..0000000000
--- a/lib/tdb2/doc/design.lyx
+++ /dev/null
@@ -1,2689 +0,0 @@
-#LyX 1.6.7 created this file. For more info see
-\lyxformat 345
-\textclass article
-\use_default_options true
-\language english
-\inputencoding auto
-\font_roman default
-\font_sans default
-\font_typewriter default
-\font_default_family default
-\font_sc false
-\font_osf false
-\font_sf_scale 100
-\font_tt_scale 100
-\graphics default
-\paperfontsize default
-\use_hyperref false
-\papersize default
-\use_geometry false
-\use_amsmath 1
-\use_esint 1
-\cite_engine basic
-\use_bibtopic false
-\paperorientation portrait
-\secnumdepth 3
-\tocdepth 3
-\paragraph_separation indent
-\defskip medskip
-\quotes_language english
-\papercolumns 1
-\papersides 1
-\paperpagestyle default
-\tracking_changes true
-\output_changes true
-\author ""
-\author ""
-\begin_layout Title
-TDB2: Redesigning The Trivial DataBase
-\begin_layout Author
-Rusty Russell, IBM Corporation
-\begin_layout Date
-\begin_layout Abstract
-The Trivial DataBase on-disk format is 32 bits; with usage cases heading
- towards the 4G limit, that must change.
- This required breakage provides an opportunity to revisit TDB's other design
- decisions and reassess them.
-\begin_layout Section
-\begin_layout Standard
-The Trivial DataBase was originally written by Andrew Tridgell as a simple
- key/data pair storage system with the same API as dbm, but allowing multiple
- readers and writers while being small enough (< 1000 lines of C) to include
- in SAMBA.
- The simple design created in 1999 has proven surprisingly robust and performant
-, used in Samba versions 3 and 4 as well as numerous other projects.
- Its useful life was greatly increased by the (backwards-compatible!) addition
- of transaction support in 2005.
-\begin_layout Standard
-The wider variety and greater demands of TDB-using code has led to some
- organic growth of the API, as well as some compromises on the implementation.
- None of these, by themselves, are seen as show-stoppers, but the cumulative
- effect is a loss of elegance over the initial, simple TDB implementation.
- Here is a table of the approximate number of lines of implementation code
- and number of API functions at the end of each year:
-\begin_layout Standard
-\begin_inset Tabular
-<lyxtabular version="3" rows="12" columns="3">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-Year End
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-API Functions
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-Lines of C Code Implementation
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-\begin_layout Plain Layout
-\begin_layout Standard
-This review is an attempt to catalog and address all the known issues with
- TDB and create solutions which address the problems without significantly
- increasing complexity; all involved are far too aware of the dangers of
- second system syndrome in rewriting a successful project like this.
-\begin_layout Section
-API Issues
-\begin_layout Subsection
-tdb_open_ex Is Not Expandable
-\begin_layout Standard
-The tdb_open() call was expanded to tdb_open_ex(), which added an optional
- hashing function and an optional logging function argument.
- Additional arguments to open would require the introduction of a tdb_open_ex2
- call etc.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "attributes"
-\begin_layout Standard
-tdb_open() will take a linked-list of attributes:
-\begin_layout LyX-Code
-enum tdb_attribute {
-\begin_layout LyX-Code
-\begin_layout LyX-Code
-\begin_layout LyX-Code
-\begin_layout LyX-Code
-struct tdb_attribute_base {
-\begin_layout LyX-Code
- enum tdb_attribute attr;
-\begin_layout LyX-Code
- union tdb_attribute *next;
-\begin_layout LyX-Code
-\begin_layout LyX-Code
-struct tdb_attribute_log {
-\begin_layout LyX-Code
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
-\begin_layout LyX-Code
- tdb_log_func log_fn;
-\begin_layout LyX-Code
- void *log_private;
-\begin_layout LyX-Code
-\begin_layout LyX-Code
-struct tdb_attribute_hash {
-\begin_layout LyX-Code
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
-\begin_layout LyX-Code
- tdb_hash_func hash_fn;
-\begin_layout LyX-Code
- void *hash_private;
-\begin_layout LyX-Code
-\begin_layout LyX-Code
-union tdb_attribute {
-\begin_layout LyX-Code
- struct tdb_attribute_base base;
-\begin_layout LyX-Code
- struct tdb_attribute_log log;
-\begin_layout LyX-Code
- struct tdb_attribute_hash hash;
-\begin_layout LyX-Code
-\begin_layout Standard
-This allows future attributes to be added, even if this expands the size
- of the union.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-tdb_traverse Makes Impossible Guarantees
-\begin_layout Standard
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
- was thought that it was important to guarantee that all records which exist
- at the start and end of the traversal would be included, and no record
- would be included twice.
-\begin_layout Standard
-This adds complexity (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Reliable-Traversal-Adds"
-) and does not work anyway for records which are altered (in particular,
- those which are expanded may be effectively deleted and re-added behind
- the traversal).
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "traverse-Proposed-Solution"
-Proposed Solution
-\begin_layout Standard
-Abandon the guarantee.
- You will see every record if no changes occur during your traversal, otherwise
- you will see some subset.
- You can prevent changes by using a transaction or the locking API.
-\begin_layout Subsubsection
-\begin_layout Standard
- Delete-during-traverse will still delete every record, too (assuming no
- other changes).
-\begin_layout Subsection
-Nesting of Transactions Is Fraught
-\begin_layout Standard
-TDB has alternated between allowing nested transactions and not allowing
- them.
- Various paths in the Samba codebase assume that transactions will nest,
- and in a sense they can: the operation is only committed to disk when the
- outer transaction is committed.
- There are two problems, however:
-\begin_layout Enumerate
-Canceling the inner transaction will cause the outer transaction commit
- to fail, and will not undo any operations since the inner transaction began.
- This problem is soluble with some additional internal code.
-\begin_layout Enumerate
-An inner transaction commit can be cancelled by the outer transaction.
- This is desirable in the way which Samba's database initialization code
- uses transactions, but could be a surprise to any users expecting a successful
- transaction commit to expose changes to others.
-\begin_layout Standard
-The current solution is to specify the behavior at tdb_open(), with the
- default currently that nested transactions are allowed.
- This flag can also be changed at runtime.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-Given the usage patterns, it seems that the
-\begin_inset Quotes eld
-\begin_inset Quotes erd
- behavior of disallowing nested transactions should become the default.
- Additionally, it seems the outer transaction is the only code which knows
- whether inner transactions should be allowed, so a flag to indicate this
- could be added to tdb_transaction_start.
- However, this behavior can be simulated with a wrapper which uses tdb_add_flags
-() and tdb_remove_flags(), so the API should not be expanded for this relatively
--obscure case.
-\begin_layout Subsubsection
-\begin_layout Standard
-Complete; the nesting flag has been removed.
-\begin_layout Subsection
-Incorrect Hash Function is Not Detected
-\begin_layout Standard
-tdb_open_ex() allows the calling code to specify a different hash function
- to use, but does not check that all other processes accessing this tdb
- are using the same hash function.
- The result is that records are missing from tdb_fetch().
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-The header should contain an example hash result (eg.
- the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
- hash function produces the same answer, or fail the tdb_open call.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-\begin_layout Standard
-In response to scalability issues with the free list (
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Freelist-Is"
-) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
- and the TDB_VOLATILE flag to tdb_open.
- The latter actually calls the former with an argument of
-\begin_inset Quotes eld
-\begin_inset Quotes erd
-\begin_layout Standard
-This code allows deleted records to accumulate without putting them in the
- free list.
- On delete we iterate through each chain and free them in a batch if there
- are more than max_dead entries.
- These are never otherwise recycled except as a side-effect of a tdb_repack.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-With the scalability problems of the freelist solved, this API can be removed.
- The TDB_VOLATILE flag may still be useful as a hint that store and delete
- of records will be at least as common as fetch in order to allow some internal
- tuning, but initially will become a no-op.
-\begin_layout Subsubsection
-\begin_layout Standard
- Unknown flags cause tdb_open() to fail as well, so they can be detected
- at runtime.
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Files-Cannot"
-TDB Files Cannot Be Opened Multiple Times In The Same Process
-\begin_layout Standard
-No process can open the same TDB twice; we check and disallow it.
- This is an unfortunate side-effect of fcntl locks, which operate on a per-file
- rather than per-file-descriptor basis, and do not nest.
- Thus, closing any file descriptor on a file clears all the locks obtained
- by this process, even if they were placed using a different file descriptor!
-\begin_layout Standard
-Note that even if this were solved, deadlock could occur if operations were
- nested: this is a more manageable programming error in most cases.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-We could lobby POSIX to fix the perverse rules, or at least lobby Linux
- to violate them so that the most common implementation does not have this
- restriction.
- This would be a generally good idea for other fcntl lock users.
-\begin_layout Standard
-Samba uses a wrapper which hands out the same tdb_context to multiple callers
- if this happens, and does simple reference counting.
- We should do this inside the tdb library, which already emulates lock nesting
- internally; it would need to recognize when deadlock occurs within a single
- process.
- This would create a new failure mode for tdb operations (while we currently
- handle locking failures, they are impossible in normal use and a process
- encountering them can do little but give up).
-\begin_layout Standard
-I do not see benefit in an additional tdb_open flag to indicate whether
- re-opening is allowed, though there may be some benefit to adding a
- call to detect when a tdb_context is shared, to allow others to create such
- an API.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-TDB API Is Not POSIX Thread-safe
-\begin_layout Standard
-The TDB API uses an error code which can be queried after an operation to
- determine what went wrong.
- This programming model does not work with threads, unless specific additional
- guarantees are given by the implementation.
- In addition, even otherwise-independent threads cannot open the same TDB
- (as in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Files-Cannot"
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-Rearchitecting the API to include a tdb_errcode pointer would be a great
- deal of churn, but fortunately most functions return 0 on success and -1
- on error: we can change these to return 0 on success and a negative error
- code on error, and the API remains similar to previous.
- The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
- pointer and return an error code.
- It is also simpler to have tdb_nextkey replace its key argument in place,
- freeing up any old .dptr.
-\begin_layout Standard
-Internal locking is required to make sure that fcntl locks do not overlap
- between threads, and also that the global list of tdbs is maintained.
-\begin_layout Standard
-The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
- version of the library, and otherwise no overhead will exist.
- Alternatively, a hooking mechanism similar to that proposed for
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
- could be used to enable pthread locking at runtime.
-\begin_layout Subsubsection
-\begin_layout Standard
-Incomplete; API has been changed but thread safety has not been implemented.
-\begin_layout Subsection
-*_nonblock Functions And *_mark Functions Expose Implementation
-\begin_layout Standard
-\begin_inset Foot
-status collapsed
-\begin_layout Plain Layout
-Clustered TDB, see
- wishes to operate on TDB in a non-blocking manner.
- This is currently done as follows:
-\begin_layout Enumerate
-Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock).
- If this fails:
-\begin_layout Enumerate
-Fork a child process, and wait for it to call the normal variant (eg.
- tdb_lockall).
-\begin_layout Enumerate
-If the child succeeds, call the _mark variant to indicate we already have
- the locks (eg.
- tdb_lockall_mark).
-\begin_layout Enumerate
-Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-\begin_layout Enumerate
-Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-\begin_layout Standard
-There are several issues with this approach.
- Firstly, adding two new variants of each function clutters the API for
- an obscure use, and so not all functions have three variants.
- Secondly, it assumes that all paths of the functions ask for the same locks,
- otherwise the parent process will have to get a lock which the child doesn't
- have under some circumstances.
- I don't believe this is currently the case, but it constrains the implementation.
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Proposed-Solution-locking-hook"
-Proposed Solution
-\begin_layout Standard
-Implement a hook for locking methods, so that the caller can control the
- calls to create and remove fcntl locks.
- In this scenario, ctdbd would operate as follows:
-\begin_layout Enumerate
-Call the normal API function, eg tdb_lockall().
-\begin_layout Enumerate
-When the lock callback comes in, check if the child has the lock.
- Initially, this is always false.
- If so, return 0.
- Otherwise, try to obtain it in non-blocking mode.
- If that fails, return EWOULDBLOCK.
-\begin_layout Enumerate
-Release locks in the unlock callback as normal.
-\begin_layout Enumerate
-If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
- child to repeat the operation.
-\begin_layout Enumerate
-The child records what locks it obtains, and returns that information to
- the parent.
-\begin_layout Enumerate
-When the child has succeeded, goto 1.
-\begin_layout Standard
-This is flexible enough to handle any potential locking scenario, even when
- lock requirements change.
- It can be optimized so that the parent does not release locks, just tells
- the child which locks it doesn't need to obtain.
-\begin_layout Standard
-It also keeps the complexity out of the API, and in ctdbd where it is needed.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-tdb_chainlock Functions Expose Implementation
-\begin_layout Standard
-tdb_chainlock locks some number of records, including the record indicated
- by the given key.
- This gave atomicity guarantees; no-one can start a transaction, alter,
- read or delete that key while the lock is held.
-\begin_layout Standard
-It also makes the same guarantee for any other key in the chain, which is
- an internal implementation detail and potentially a cause for deadlock.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
- It would be nice to have an explicit single entry lock which affected no
- other keys.
- Unfortunately, this won't work for an entry which doesn't exist.
- Thus while chainlock may be implemented more efficiently for the existing
- case, it will still have overlap issues with the non-existing case.
- So it is best to keep the current (lack of) guarantee about which records
- will be affected to avoid constraining our implementation.
-\begin_layout Subsection
-Signal Handling is Not Race-Free
-\begin_layout Standard
-The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
- that the tdb locking code should return with a failure, rather than trying
- again when a signal is received (and errno == EAGAIN).
- This is usually used to implement timeouts.
-\begin_layout Standard
-Unfortunately, this does not work in the case where the signal is received
- before the tdb code enters the fcntl() call to place the lock: the code
- will sleep within the fcntl() code, unaware that the signal wants it to
- exit.
- In the case of long timeouts, this does not happen in practice.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-The locking hooks proposed in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
- would allow the user to decide on whether to fail the lock acquisition
- on a signal.
- This allows the caller to choose their own compromise: they could narrow
- the race by checking immediately before the fcntl call.
-\begin_inset Foot
-status collapsed
-\begin_layout Plain Layout
-It may be possible to make this race-free in some implementations by having
- the signal handler alter the struct flock to make it invalid.
- This will cause the fcntl() lock call to fail with EINVAL if the signal
- occurs before the kernel is entered, otherwise EAGAIN.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-The API Uses Gratuitous Typedefs, Capitals
-\begin_layout Standard
-typedefs are useful for providing source compatibility when types can differ
- across implementations, or arguably in the case of function pointer definitions
- which are hard for humans to parse.
- Otherwise it is simply obfuscation and pollutes the namespace.
-\begin_layout Standard
-Capitalization is usually reserved for compile-time constants and macros.
-\begin_layout Description
-TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
- definition isn't visible to the API user anyway.
-\begin_layout Description
-TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
- needs to be understood by the API user.
-\begin_layout Description
-\begin_inset space ~
-TDB_DATA This would normally be called 'struct tdb_data'.
-\begin_layout Description
-\begin_inset space ~
-TDB_ERROR Similarly, this would normally be enum tdb_error.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
- Introducing lower case variants would please pedants like myself, but if
- it were done the existing ones should be kept.
- There is little point forcing a purely cosmetic change upon tdb users.
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "tdb_log_func-Doesnt-Take"
-tdb_log_func Doesn't Take The Private Pointer
-\begin_layout Standard
-For API compatibility reasons, the logging function needs to call tdb_get_loggin
-g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-It should simply take an extra argument, since we are prepared to break
- the API/ABI.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Various Callback Functions Are Not Typesafe
-\begin_layout Standard
-The callback functions in tdb_set_logging_function (after
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "tdb_log_func-Doesnt-Take"
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
- all take void * and must internally convert it to the argument type they
- were expecting.
-\begin_layout Standard
-If this type changes, the compiler will not produce warnings on the callers,
- since it only sees void *.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-With careful use of macros, we can create callback functions which give
- a warning when used on gcc and the types of the callback and its private
- argument differ.
- Unsupported compilers will not give a warning, which is no worse than now.
- In addition, the callbacks become clearer, as they need not use void *
- for their parameter.
-\begin_layout Standard
-See CCAN's typesafe_cb module at
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
-\begin_layout Standard
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
- be cleared if the caller discovers it is the only process with the TDB
- open.
- However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
- be detected, so will have the TDB erased underneath them (usually resulting
- in a crash).
-\begin_layout Standard
-There is a similar issue on fork(); if the parent exits (or otherwise closes
- the tdb) before the child calls tdb_reopen_all() to establish the lock
- used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
- at that moment will believe it alone has opened the TDB and will erase
- it.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
- Other workarounds are possible, but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Extending The Header Is Difficult
-\begin_layout Standard
-We have reserved (zeroed) words in the TDB header, which can be used for
- future features.
- If the future features are compulsory, the version number must be updated
- to prevent old code from accessing the database.
- But if the future feature is optional, we have no way of telling if older
- code is accessing the database or not.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-The header should contain a
-\begin_inset Quotes eld
-format variant
-\begin_inset Quotes erd
- value (64-bit).
- This is divided into two 32-bit parts:
-\begin_layout Enumerate
-The lower part reflects the format variant understood by code accessing
- the database.
-\begin_layout Enumerate
-The upper part reflects the format variant you must understand to write
- to the database (otherwise you can only open for reading).
-\begin_layout Standard
-The latter field can only be written at creation time, the former should
- be written under the OPEN_LOCK when opening the database for writing, if
- the variant of the code is lower than the current lowest variant.
-\begin_layout Standard
-This should allow backwards-compatible features to be added, and detection
- if older code (which doesn't understand the feature) writes to the database.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Record Headers Are Not Expandable
-\begin_layout Standard
-If we later want to add (say) checksums on keys and data, it would require
- another format change, which we'd like to avoid.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-We often have extra padding at the tail of a record.
- If we ensure that the first byte (if any) of this padding is zero, we will
- have a way for future changes to detect code which doesn't understand a
- new format: the new code would write (say) a 1 at the tail, and thus if
- there is no tail or the first byte is 0, we would know the extension is
- not present on that record.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-TDB Does Not Use Talloc
-\begin_layout Standard
-Many users of TDB (particularly Samba) use the talloc allocator, and thus
- have to wrap TDB in a talloc context to use it conveniently.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-The allocation within TDB is not complicated enough to justify the use of
- talloc, and I am reluctant to force another (excellent) library on TDB
- users.
- Nonetheless a compromise is possible.
- An attribute (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-) can be added later to tdb_open() to provide an alternate allocation mechanism,
- specifically for talloc but usable by any other allocator (which would
- ignore the
-\begin_inset Quotes eld
-\begin_inset Quotes erd
- argument).
-\begin_layout Standard
-This would form a talloc hierarchy as expected, but the caller would still
- have to attach a destructor to the tdb context returned from tdb_open to
- close it.
- All TDB_DATA fields would be children of the tdb_context, and the caller
- would still have to manage them (using talloc_free() or talloc_steal()).
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Section
-Performance And Scalability Issues
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-TDB_CLEAR_IF_FIRST Imposes Performance Penalty
-\begin_layout Standard
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
- 4 (aka.
- the ACTIVE_LOCK).
- While these locks never conflict in normal tdb usage, they do add substantial
- overhead for most fcntl lock implementations when the kernel scans to detect
- if a lock conflict exists.
- This is often a single linked list, making the time to acquire and release
- a fcntl lock O(N) where N is the number of processes with the TDB open,
- not the number actually doing work.
-\begin_layout Standard
-In a Samba server it is common to have huge numbers of clients sitting idle,
- and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
-\begin_inset Foot
-status collapsed
-\begin_layout Plain Layout
-There is a flag to tdb_reopen_all() which is used for this optimization:
- if the parent process will outlive the child, the child does not need the
- ACTIVE_LOCK.
- This is a workaround for this very performance issue.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-Remove the flag.
- It was a neat idea, but even trivial servers tend to know when they are
- initializing for the first time and can simply unlink the old tdb at that
- point.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-TDB Files Have a 4G Limit
-\begin_layout Standard
-This seems to be becoming an issue (so much for
-\begin_inset Quotes eld
-\begin_inset Quotes erd
-!), particularly for ldb.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-A new, incompatible TDB format which uses 64 bit offsets internally rather
- than 32 bit as now.
- For simplicity of endian conversion (which TDB does on the fly if required),
- all values will be 64 bit on disk.
- In practice, some upper bits may be used for other purposes, but at least
- 56 bits will be available for file offsets.
-\begin_layout Standard
-tdb_open() will automatically detect the old version, and even create them
- if TDB_VERSION6 is specified to tdb_open.
-\begin_layout Standard
-32 bit processes will still be able to access TDBs larger than 4G (assuming
- that their off_t allows them to seek to 64 bits), they will gracefully
- fall back as they fail to mmap.
- This can happen already with large TDBs.
-\begin_layout Standard
-Old versions of tdb will fail to open the new TDB files (since 28 August
- 2009, commit 398d0c29290: prior to that any unrecognized file format would
- be erased and initialized as a fresh tdb!)
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-TDB Records Have a 4G Limit
-\begin_layout Standard
-This has not been a reported problem, and the API uses size_t which can
- be 64 bit on 64 bit platforms.
- However, other limits may have made such an issue moot.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-Record sizes will be 64 bit, with an error returned on 32 bit platforms
- which try to access such records (the current implementation would return
- TDB_ERR_OOM in a similar case).
- It seems unlikely that 32 bit keys will be a limitation, so the implementation
- may not support this (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Hash Size Is Determined At TDB Creation Time
-\begin_layout Standard
-TDB contains a number of hash chains in the header; the number is specified
- at creation time, and defaults to 131.
- This is such a bottleneck on large databases (as each hash chain gets quite
- long), that LDB uses 10,000 for this hash.
- In general it is impossible to know what the 'right' answer is at database
- creation time.
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Hash-Size-Solution"
-Proposed Solution
-\begin_layout Standard
-After comprehensive performance testing on various scalable hash variants
-\begin_inset Foot
-status collapsed
-\begin_layout Plain Layout
- and This was annoying
- because I was previously convinced that an expanding tree of hashes would
- be very close to optimal.
-, it became clear that it is hard to beat a straight linear hash table which
- doubles in size when it reaches saturation.
- Unfortunately, altering the hash table introduces serious locking complications
-: the entire hash table needs to be locked to enlarge the hash table, and
- others might be holding locks.
- Particularly insidious are insertions done under tdb_chainlock.
-\begin_layout Standard
-Thus an expanding layered hash will be used: an array of hash groups, with
- each hash group exploding into pointers to lower hash groups once it fills,
- turning into a hash tree.
- This has implications for locking: we must lock the entire group in case
- we need to expand it, yet we don't know how deep the tree is at that point.
-\begin_layout Standard
-Note that bits from the hash table entries should be stolen to hold more
- hash bits to reduce the penalty of collisions.
- We can use the otherwise-unused lower 3 bits.
- If we limit the size of the database to 64 exabytes, we can use the top
- 8 bits of the hash entry as well.
- These 11 bits would reduce false positives down to 1 in 2000 which is more
- than we need: we can use one of the bits to indicate that the extra hash
- bits are valid.
- This means we can choose not to re-hash all entries when we expand a hash
- group; simply use the next bits we need and mark them invalid.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Freelist-Is"
-TDB Freelist Is Highly Contended
-\begin_layout Standard
-TDB uses a single linked list for the free list.
- Allocation occurs as follows, using heuristics which have evolved over
- time:
-\begin_layout Enumerate
-Get the free list lock for this whole operation.
-\begin_layout Enumerate
-Multiply length by 1.25, so we always over-allocate by 25%.
-\begin_layout Enumerate
-Set the slack multiplier to 1.
-\begin_layout Enumerate
-Examine the current freelist entry: if it is > length but < the current
- best case, remember it as the best case.
-\begin_layout Enumerate
-Multiply the slack multiplier by 1.05.
-\begin_layout Enumerate
-If our best fit so far is less than length * slack multiplier, return it.
- The slack will be turned into a new free record if it's large enough.
-\begin_layout Enumerate
-Otherwise, go onto the next freelist entry.
-\begin_layout Standard
-Deleting a record occurs as follows:
-\begin_layout Enumerate
-Lock the hash chain for this whole operation.
-\begin_layout Enumerate
-Walk the chain to find the record, keeping the prev pointer offset.
-\begin_layout Enumerate
-If max_dead is non-zero:
-\begin_layout Enumerate
-Walk the hash chain again and count the dead records.
-\begin_layout Enumerate
-If it's more than max_dead, bulk free all the dead ones (similar to steps
- 4 and below, but the lock is only obtained once).
-\begin_layout Enumerate
-Simply mark this record as dead and return.
-\begin_layout Enumerate
-Get the free list lock for the remainder of this operation.
-\begin_layout Enumerate
-\begin_inset CommandInset label
-LatexCommand label
-name "right-merging"
-Examine the following block to see if it is free; if so, enlarge the current
- block and remove that block from the free list.
- This was disabled, as removal from the free list was O(entries-in-free-list).
-\begin_layout Enumerate
-Examine the preceding block to see if it is free: for this reason, each
- block has a 32-bit tailer which indicates its length.
- If it is free, expand it to cover our new block and return.
-\begin_layout Enumerate
-Otherwise, prepend ourselves to the free list.
-\begin_layout Standard
-Disabling right-merging (step
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "right-merging"
-) causes fragmentation; the other heuristics proved insufficient to address
- this, so the final answer to this was that when we expand the TDB file
- inside a transaction commit, we repack the entire tdb.
-\begin_layout Standard
-The single list lock limits our allocation rate; due to the other issues
- this is not currently seen as a bottleneck.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-The first step is to remove all the current heuristics, as they obviously
- interact, then examine them once the lock contention is addressed.
-\begin_layout Standard
-The free list must be split to reduce contention.
- Assuming perfect free merging, we can at most have 1 free list entry for
- each entry.
- This implies that the number of free lists is related to the size of the
- hash table, but as it is rare to walk a large number of free list entries
- we can use far fewer, say 1/32 of the number of hash buckets.
-\begin_layout Standard
-It seems tempting to try to reuse the hash implementation which we use for
- records here, but we have two ways of searching for free entries: for allocatio
-n we search by size (and possibly zone) which produces too many clashes
- for our hash table to handle well, and for coalescing we search by address.
- Thus an array of doubly-linked free lists seems preferable.
-\begin_layout Standard
-There are various benefits in using per-size free lists (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-) but it's not clear this would reduce contention in the common case where
- all processes are allocating/freeing the same size.
- Thus we almost certainly need to divide in other ways: the most obvious
- is to divide the file into zones, and use a free list (or table of free
- lists) for each.
- This approximates address ordering.
-\begin_layout Standard
-Unfortunately it is difficult to know what heuristics should be used to
- determine zone sizes, and our transaction code relies on being able to
- create a
-\begin_inset Quotes eld
-recovery area
-\begin_inset Quotes erd
- by simply appending to the file (difficult if it would need to create a
- new zone header).
- Thus we use a linked-list of free tables; currently we only ever create
- one, but if there is more than one we choose one at random to use.
- In future we may use heuristics to add new free tables on contention.
- We only expand the file when all free tables are exhausted.
-\begin_layout Standard
-The basic algorithm is as follows.
- Freeing is simple:
-\begin_layout Enumerate
-Identify the correct free list.
-\begin_layout Enumerate
-Lock the corresponding list.
-\begin_layout Enumerate
-Re-check the list (we didn't have a lock, sizes could have changed): relock
- if necessary.
-\begin_layout Enumerate
-Place the freed entry in the list.
-\begin_layout Standard
-Allocation is a little more complicated, as we perform delayed coalescing
- at this point:
-\begin_layout Enumerate
-Pick a free table; usually the previous one.
-\begin_layout Enumerate
-Lock the corresponding list.
-\begin_layout Enumerate
-If the top entry is large enough, remove it from the list and return it.
-\begin_layout Enumerate
-Otherwise, coalesce entries in the list. If there was no entry large enough,
- unlock the list and try the next largest list.
-\begin_layout Enumerate
-If no list has an entry which meets our needs, try the next free table.
-\begin_layout Enumerate
-If no zone satisfies, expand the file.
-\begin_layout Standard
-This optimizes rapid insert/delete of free list entries by not coalescing
- them all the time.
- First-fit address ordering seems to be fairly good for keeping
- fragmentation low (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
- Note that address ordering does not need a tailer to coalesce, though if
- we needed one we could have one cheaply: see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-\begin_layout Standard
-Each free entry has the free table number in the header: less than 255.
- It also contains a doubly-linked list for easy deletion.
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Becomes-Fragmented"
-TDB Becomes Fragmented
-\begin_layout Standard
-Much of this is a result of allocation strategy
-\begin_inset Foot
-status collapsed
-\begin_layout Plain Layout
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
- and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
-on) is deliberately set at 25%, and external fragmentation is only cured
- by the decision to repack the entire db when a transaction commit needs
- to enlarge the file.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-The 25% overhead on allocation works in practice for ldb because indexes
- tend to expand by one record at a time.
- This internal fragmentation can be resolved by having an
-\begin_inset Quotes eld
-\begin_inset Quotes erd
- bit in the header to note entries that have previously expanded, and allocating
- more space for them.
-\begin_layout Standard
-There is a spectrum of possible solutions for external fragmentation:
- one is to use a fragmentation-avoiding allocation strategy such as best-fit
- address-order allocator.
- The other end of the spectrum would be to use a bump allocator (very fast
- and simple) and simply repack the file when we reach the end.
-\begin_layout Standard
-There are three problems with efficient fragmentation-avoiding allocators:
- they are non-trivial, they tend to use a single free list for each size,
- and there's no evidence that tdb allocation patterns will match those recorded
- for general allocators (though it seems likely).
-\begin_layout Standard
-Thus we don't spend too much effort on external fragmentation; we will be
- no worse than the current code if we need to repack on occasion.
- More effort is spent on reducing freelist contention, and reducing overhead.
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Records-Incur-A"
-Records Incur A 28-Byte Overhead
-\begin_layout Standard
-Each TDB record has a header as follows:
-\begin_layout LyX-Code
-struct tdb_record {
-\begin_layout LyX-Code
- tdb_off_t next; /* offset of the next record in the list */
-\begin_layout LyX-Code
- tdb_len_t rec_len; /* total byte length of record */
-\begin_layout LyX-Code
- tdb_len_t key_len; /* byte length of key */
-\begin_layout LyX-Code
- tdb_len_t data_len; /* byte length of data */
-\begin_layout LyX-Code
- uint32_t full_hash; /* the full 32 bit hash of the key */
-\begin_layout LyX-Code
- uint32_t magic; /* try to catch errors */
-\begin_layout LyX-Code
- /* the following union is implied:
-\begin_layout LyX-Code
- union {
-\begin_layout LyX-Code
- char record[rec_len];
-\begin_layout LyX-Code
- struct {
-\begin_layout LyX-Code
- char key[key_len];
-\begin_layout LyX-Code
- char data[data_len];
-\begin_layout LyX-Code
- }
-\begin_layout LyX-Code
- uint32_t totalsize; (tailer)
-\begin_layout LyX-Code
- }
-\begin_layout LyX-Code
- */
-\begin_layout LyX-Code
-\begin_layout Standard
-Naively, this would double to a 56-byte overhead on a 64 bit implementation.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-We can use various techniques to reduce this for an allocated block:
-\begin_layout Enumerate
-The 'next' pointer is not required, as we are using a flat hash table.
-\begin_layout Enumerate
-'rec_len' can instead be expressed as an addition to key_len and data_len
- (it accounts for wasted or overallocated length in the record).
- Since the record length is always a multiple of 8, we can conveniently
- fit it in 32 bits (representing up to 35 bits).
-\begin_layout Enumerate
-'key_len' and 'data_len' can be reduced.
- I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
- the two into one 64-bit field and use a 5 bit value which indicates at
- what bit to divide the two.
- Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
- size of 32 bits.
-\begin_layout Enumerate
-'full_hash' is used to avoid a memcmp on the
-\begin_inset Quotes eld
-\begin_inset Quotes erd
- case, but this is diminishing returns after a handful of bits (at 10 bits,
- it reduces 99.9% of false memcmp).
- As an aside, as the lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here.
- Note that it's not clear that these bits will be a win, given the extra
- bits in the hash table itself (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Hash-Size-Solution"
-\begin_layout Enumerate
-'magic' does not need to be enlarged: it currently reflects one of 5 values
- (used, free, dead, recovery, and unused_recovery).
- It is useful for quick sanity checking however, and should not be eliminated.
-\begin_layout Enumerate
-'tailer' is only used to coalesce free blocks (so a block to the right can
- find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of the following
- block (and the tailer only exists in free blocks).
-\begin_inset Foot
-status collapsed
-\begin_layout Plain Layout
-This technique from Thomas Standish.
- Data Structure Techniques.
- Addison-Wesley, Reading, Massachusetts, 1980.
- The current proposed coalescing algorithm doesn't need this, however.
-\begin_layout Standard
-This produces a 16 byte used header like this:
-\begin_layout LyX-Code
-struct tdb_used_record {
-\begin_layout LyX-Code
- uint32_t used_magic : 16,
-\begin_layout LyX-Code
-\begin_layout LyX-Code
- key_data_divide: 5,
-\begin_layout LyX-Code
- top_hash: 11;
-\begin_layout LyX-Code
- uint32_t extra_octets;
-\begin_layout LyX-Code
- uint64_t key_and_data_len;
-\begin_layout LyX-Code
-\begin_layout Standard
-And a free record like this:
-\begin_layout LyX-Code
-struct tdb_free_record {
-\begin_layout LyX-Code
- uint64_t free_magic: 8,
-\begin_layout LyX-Code
- prev : 56;
-\begin_layout LyX-Code
-\begin_layout LyX-Code
- uint64_t free_table: 8,
-\begin_layout LyX-Code
- total_length : 56;
-\begin_layout LyX-Code
- uint64_t next;
-\begin_layout LyX-Code
-\begin_layout Standard
-Note that by limiting valid offsets to 56 bits, we can pack everything we
- need into 3 64-bit words, meaning our minimum record size is 8 bytes.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Transaction Commit Requires 4 fdatasync
-\begin_layout Standard
-The current transaction algorithm is:
-\begin_layout Enumerate
-\begin_layout Enumerate
-\begin_layout Enumerate
-\begin_layout Enumerate
-\begin_layout Enumerate
-\begin_layout Enumerate
-\begin_layout Enumerate
-\begin_layout Enumerate
-\begin_layout Standard
-On current ext3, each sync flushes all data to disk, so the next 3 syncs
- are relatively inexpensive.
- But this could become a performance bottleneck on other filesystems such
- as ext4.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-Neil Brown points out that this is overzealous, and only one sync is needed:
-\begin_layout Enumerate
-Bundle the recovery data, a transaction counter and a strong checksum of
- the new data.
-\begin_layout Enumerate
-Strong checksum that whole bundle.
-\begin_layout Enumerate
-Store the bundle in the database.
-\begin_layout Enumerate
-Overwrite the oldest of the two recovery pointers in the header (identified
- using the transaction counter) with the offset of this bundle.
-\begin_layout Enumerate
-\begin_layout Enumerate
-Write the new data to the file.
-\begin_layout Standard
-Checking for recovery means identifying the latest bundle with a valid checksum
- and using the new data checksum to ensure that it has been applied.
- This is more expensive than the current check, but need only be done at
- open.
- For running databases, a separate header field can be used to indicate
- a transaction in progress; we need only check for recovery if this is set.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Does-Not"
-TDB Does Not Have Snapshot Support
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-None.
- At some point you say
-\begin_inset Quotes eld
-use a real database
-\begin_inset Quotes erd
- (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-\begin_layout Standard
-But as a thought experiment, if we implemented transactions to only overwrite
- free entries (this is tricky: there must not be a header in each entry
- which indicates whether it is free, but use of presence in metadata elsewhere),
- and a pointer to the hash table, we could create an entirely new commit
- without destroying existing data.
- Then it would be easy to implement snapshots in a similar way.
-\begin_layout Standard
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\begin_layout Standard
-We could then implement snapshots using a similar method, using multiple
- different hash tables/free tables.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Transactions Cannot Operate in Parallel
-\begin_layout Standard
-This would be useless for ldb, as it hits the index records with just about
- every update.
- It would add significant complexity in resolving clashes, and cause
- all transaction callers to write their code to loop in the case where the
- transactions spuriously failed.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-None (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
- We could solve a small part of the problem by providing read-only transactions.
- These would allow one write transaction to begin, but it could not commit
- until all r/o transactions are done.
- This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
- commit.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Default Hash Function Is Suboptimal
-\begin_layout Standard
-The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
- if we expand it to 64 bits), and works best when the hash bucket size is
- a prime number (which also means a slow modulus).
- In addition, it is highly predictable which could potentially lead to a
- Denial of Service attack in some TDB uses.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-The Jenkins lookup3 hash
-\begin_inset Foot
-status open
-\begin_layout Plain Layout
- is a fast and superbly-mixing hash.
- It's used by the Linux kernel and almost everything else.
- This has the particular properties that it takes an initial seed, and produces
- two 32 bit hash numbers, which we can combine into a 64-bit hash.
-\begin_layout Standard
-The seed should be created at tdb-creation time from some random source,
- and placed in the header.
- This is far from foolproof, but adds a little bit of protection against
- hash bombing.
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Reliable-Traversal-Adds"
-Reliable Traversal Adds Complexity
-\begin_layout Standard
-We lock a record during traversal iteration, and try to grab that lock in
- the delete code.
- If that grab on delete fails, we simply mark it deleted and continue onwards;
- traversal checks for this condition and does the delete when it moves off
- the record.
-\begin_layout Standard
-If traversal terminates, the dead record may be left indefinitely.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-Remove reliability guarantees; see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "traverse-Proposed-Solution"
-\begin_layout Subsubsection
-\begin_layout Standard
-\begin_layout Subsection
-Fcntl Locking Adds Overhead
-\begin_layout Standard
-Placing a fcntl lock means a system call, as does removing one.
- This is actually one reason why transactions can be faster (everything
- is locked once at transaction start).
- In the uncontended case, this overhead can theoretically be eliminated.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-\begin_layout Standard
-We tried this before with spinlock support, in the early days of TDB, and
- it didn't make much difference except in manufactured benchmarks.
-\begin_layout Standard
-We could use spinlocks (with futex kernel support under Linux), but it means
- that we lose automatic cleanup when a process dies with a lock.
- There is a method of auto-cleanup under Linux, but it's not supported by
- other operating systems.
- We could reintroduce a clear-if-first-style lock and sweep for dead futexes
- on open, but that wouldn't help the normal case of one concurrent opener
- dying.
- Increasingly elaborate repair schemes could be considered, but they require
- an ABI change (everyone must use them) anyway, so there's no need to do
- this at the same time as everything else.
-\begin_layout Subsection
-Some Transactions Don't Require Durability
-\begin_layout Standard
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
- usage, and occasionally empties the results into a transactional TDB.
- This kind of usage prioritizes performance over durability: as long as
- we are consistent, data can be lost.
-\begin_layout Standard
-This would be more neatly implemented inside tdb: a
-\begin_inset Quotes eld
-\begin_inset Quotes erd
- transaction commit (ie.
- syncless) which meant that data may be reverted on a crash.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_layout Standard
-\begin_layout Standard
-Unfortunately any transaction scheme which overwrites old data requires
- a sync before that overwrite to avoid the possibility of corruption.
-\begin_layout Standard
-It seems possible to use a scheme similar to that described in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Does-Not"
-, where transactions are committed without overwriting existing data, and
- an array of top-level pointers were available in the header.
- If the transaction is
-\begin_inset Quotes eld
-\begin_inset Quotes erd
- then we would not need a sync at all: existing processes would pick up
- the new hash table and free list and work with that.
-\begin_layout Standard
-At some later point, a sync would allow recovery of the old data into the
- free lists (perhaps when the array of top-level pointers filled).
- On crash, tdb_open() would examine the array of top levels, and apply the
- transactions until it encountered an invalid checksum.
-\begin_layout Subsection
-Tracing Is Fragile, Replay Is External
-\begin_layout Standard
-The current TDB has compile-time-enabled tracing code, but it often breaks
- as it is not enabled by default.
- In a similar way, the ctdb code has an external wrapper which does replay
- tracing so it can coordinate cluster-wide transactions.
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "replay-attribute"
-\begin_layout Standard
-Tridge points out that an attribute can be later added to tdb_open (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-) to provide replay/trace hooks, which could become the basis for this and
- future parallel transactions and snapshot support.
-\begin_layout Subsubsection
-\begin_layout Standard
-a1814 2
-\change_inserted 0 1273479960
-a1821 2
-\change_inserted 0 1273480265
-a1830 2
-\change_inserted 0 1273480354
-a1845 2
-\change_inserted 0 1273478968
-a1851 2
-\change_inserted 0 1273492604
-a1859 2
-\change_inserted 0 1273479572
-a1862 2
-a1870 2
-\change_inserted 0 1273480282
-a1874 2
-\change_inserted 0 1273478931
-a1878 2
-\change_inserted 0 1273481549
-a1882 2
-\change_inserted 0 1273481557
-a1886 2
-\change_inserted 0 1273480307
-a1890 2
-\change_inserted 0 1273480335
-a1894 2
-\change_inserted 0 1273479897
-a1898 2
-\change_inserted 0 1273479653
-a1902 2
-\change_inserted 0 1273480371
-a1906 2
-\change_inserted 0 1273480464
-a1910 2
-\change_inserted 0 1273480399
-a1914 2
-\change_inserted 0 1273480425
-a1918 2
-\change_inserted 0 1273480453
-a1922 2
-\change_inserted 0 1273480455
-a1926 2
-\change_inserted 0 1273480450
-a1930 2
-\change_inserted 0 1273480452
-a1935 2
-\change_inserted 0 1273478830
-a1942 5
-\change_deleted 0 1273481604
-In theory, we could get away with 2: one after we write the new data, and
- one to somehow atomically change over to it.
-\change_inserted 0 1273481632
-a1946 2
-\change_inserted 0 1273481724
-a1950 2
-\change_inserted 0 1273481713
-a1954 2
-\change_inserted 0 1273481717
-a1958 2
-\change_inserted 0 1273481730
-a1962 2
-\change_inserted 0 1273481736
-a1966 2
-\change_inserted 0 1273481744
-a1970 2
-\change_inserted 0 1273481748
-a1974 2
-\change_inserted 0 1273482185
-a1978 2
-\change_inserted 0 1273482259
-a1989 50
-\change_deleted 0 1273481848
- Trying to rewrite the transaction code is a separate experiment, which
- I encourage someone else to do.
- At some point you say
-\begin_inset Quotes eld
-use a real database
-\begin_inset Quotes erd
-\begin_layout Standard
-\change_deleted 0 1273481848
-But as a thought experiment:
-\begin_layout Standard
-\change_deleted 0 1273481788
-Say there was a pointer in the header which said where the hash table and
- free list tables were, and that no blocks were labeled with whether they
- were free or not (it had to be derived from what list they were in).
- We could create new hash table and free list in some free space, and populate
- it as we want the post-committed state to look.
- Then we sync, then we switch the offset in the header, then we sync again.
-\begin_layout Standard
-\change_deleted 0 1273481788
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\change_inserted 0 1273481854
-\begin_layout Standard
-\change_inserted 0 1273482102
-a1993 2
-\change_inserted 0 1273482061
-a1998 2
-\change_inserted 0 1273482063
-a2002 2
-\change_inserted 0 1273482072
-a2006 2
-\change_inserted 0 1273482139
-a2011 2
-\change_inserted 0 1273482364
-a2015 2
-\change_inserted 0 1273482163
-a2019 2
-\change_inserted 0 1273482493
-a2037 2
-\change_inserted 0 1273482536
-a2046 2
-a2049 2
-\change_inserted 0 1273482641
-a2058 2
-\change_inserted 0 1273481827
-d2067 2
-a2068 11
-We could
-\change_inserted 0 1273481829
-implement snapshots using a similar method
-\change_deleted 0 1273481838
- to the above, only
-\change_inserted 0 1273481840
- using multiple different hash tables/free tables.
-@After first feedback (Ronnie & Volker)
-@d1314 13
-d1531 11
-a1541 1
-The free list should be split into multiple lists to reduce contention.
-d1547 39
-d1596 7
-d1604 1
-a1604 1
-The algorithm for freeing is simple:
-d1608 7
-a1614 1
-Identify the correct free list.
-d1618 30
-a1647 1
-Lock the list, and place the freed entry at the head.
-d1651 7
-a1657 2
-Allocation is a little more complicated, as we merge entries as we walk
- the list:
-d1661 19
-a1679 1
-Pick a free list; either the list we last freed onto, or based on a
-d1691 17
-a1707 1
-Lock that list.
-d1711 7
-a1717 1
-If the top entry is well-sized, remove it from the list and return it.
-d1721 5
-a1725 1
-Otherwise, examine the entry to the right of it in the file.
-d1731 2
-d1737 2
-d1743 2
-d1749 2
-d1756 8
-d1765 2
-d1770 2
-d1773 2
-d1778 7
-a1784 1
-If no list satisfies, expand the file.
-d1788 28
-a1815 2
-This optimizes rapid insert/delete of free list entries, and allows us to
- get rid of the tailer altogether.
-d1819 2
-d1851 1
-a1851 1
-\change_inserted 0 1272941474
-d1857 303
-a2159 18
-\change_inserted 0 1272942759
-There are various ways to organize these lists, but because we want to be
- able to quickly identify which free list an entry is in, and reduce the
- number of locks required for merging, we will use zoning (eg.
- each of the N free lists in a tdb file of size M covers a fixed fraction
- M/N).
- Note that this means we need to reshuffle the free lists when we expand
- the file; this is probably acceptable when we double the hash table size,
- since that is such an expensive operation already.
- In the case of increasing the file size, there is an optimization we can
- use: if we use M in the formula above as the file size rounded up to the
- next power of 2, we only need reshuffle free lists when the file size crosses
- a power of 2 boundary,
-\emph on
-\emph default
-reshuffling the free lists is trivial: we simply merge every consecutive
- pair of free lists.
-d2164 107
-d2276 2
-d2280 59
-d2346 2
-d2363 2
-d2366 2
-d2371 2
-d2382 2
-d2389 57
-d2458 13
-d2474 32
-a2505 2
-We could implement snapshots using a similar method to the above, only using
- multiple different hash tables/free tables.
-@Initial revision
-@d1 1
-a1 1
-#LyX 1.6.4 created this file. For more info see
-d36 3
-a38 3
-\tracking_changes false
-\output_changes false
-\author ""
-d662 5
-a666 1
- behavior of disallowing transactions should become the default.
-d1215 21
-d1527 2
-d1533 3
-a1535 1
- The algorithm for freeing is simple:
-d1642 26
diff --git a/lib/tdb2/doc/design.pdf b/lib/tdb2/doc/design.pdf
deleted file mode 100644
index 558dc1f8c2..0000000000
--- a/lib/tdb2/doc/design.pdf
+++ /dev/null
Binary files differ
diff --git a/lib/tdb2/doc/design.txt b/lib/tdb2/doc/design.txt
deleted file mode 100644
index bd2ffde4db..0000000000
--- a/lib/tdb2/doc/design.txt
+++ /dev/null
@@ -1,1258 +0,0 @@
-TDB2: Redesigning The Trivial DataBase
-Rusty Russell, IBM Corporation
-The Trivial DataBase on-disk format is 32 bits; with usage cases
-heading towards the 4G limit, that must change. This required
-breakage provides an opportunity to revisit TDB's other design
-decisions and reassess them.
-1 Introduction
-The Trivial DataBase was originally written by Andrew Tridgell as
-a simple key/data pair storage system with the same API as dbm,
-but allowing multiple readers and writers while being small
-enough (< 1000 lines of C) to include in SAMBA. The simple design
-created in 1999 has proven surprisingly robust and performant,
-used in Samba versions 3 and 4 as well as numerous other
-projects. Its useful life was greatly increased by the
-(backwards-compatible!) addition of transaction support in 2005.
-The wider variety and greater demands of TDB-using code have led
-to some organic growth of the API, as well as some compromises on
-the implementation. None of these, by themselves, are seen as
-show-stoppers, but the cumulative effect is a loss of elegance
-over the initial, simple TDB implementation. Here is a table of
-the approximate number of lines of implementation code and number
-of API functions at the end of each year:
-| Year End | API Functions | Lines of C Code Implementation |
-| 1999 | 13 | 1195 |
-| 2000 | 24 | 1725 |
-| 2001 | 32 | 2228 |
-| 2002 | 35 | 2481 |
-| 2003 | 35 | 2552 |
-| 2004 | 40 | 2584 |
-| 2005 | 38 | 2647 |
-| 2006 | 52 | 3754 |
-| 2007 | 66 | 4398 |
-| 2008 | 71 | 4768 |
-| 2009 | 73 | 5715 |
-This review is an attempt to catalog and address all the known
-issues with TDB and create solutions which address the problems
-without significantly increasing complexity; all involved are far
-too aware of the dangers of second system syndrome in rewriting a
-successful project like this.
-2 API Issues
-2.1 tdb_open_ex Is Not Expandable
-The tdb_open() call was expanded to tdb_open_ex(), which added an
-optional hashing function and an optional logging function
-argument. Additional arguments to open would require the
-introduction of a tdb_open_ex2 call etc.
-2.1.1 Proposed Solution<attributes>
-tdb_open() will take a linked-list of attributes:
-enum tdb_attribute {
-struct tdb_attribute_base {
- enum tdb_attribute attr;
- union tdb_attribute *next;
-struct tdb_attribute_log {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
- tdb_log_func log_fn;
- void *log_private;
-struct tdb_attribute_hash {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
- tdb_hash_func hash_fn;
- void *hash_private;
-union tdb_attribute {
- struct tdb_attribute_base base;
- struct tdb_attribute_log log;
- struct tdb_attribute_hash hash;
-This allows future attributes to be added, even if this expands
-the size of the union.
-2.1.2 Status
-2.2 tdb_traverse Makes Impossible Guarantees
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
-and it was thought that it was important to guarantee that all
-records which exist at the start and end of the traversal would
-be included, and no record would be included twice.
-This adds complexity (see[Reliable-Traversal-Adds]) and does not
-work anyway for records which are altered (in particular, those
-which are expanded may be effectively deleted and re-added behind
-the traversal).
-2.2.1 <traverse-Proposed-Solution>Proposed Solution
-Abandon the guarantee. You will see every record if no changes
-occur during your traversal, otherwise you will see some subset.
-You can prevent changes by using a transaction or the locking
-2.2.2 Status
-Complete. Delete-during-traverse will still delete every record,
-too (assuming no other changes).
-2.3 Nesting of Transactions Is Fraught
-TDB has alternated between allowing nested transactions and not
-allowing them. Various paths in the Samba codebase assume that
-transactions will nest, and in a sense they can: the operation is
-only committed to disk when the outer transaction is committed.
-There are two problems, however:
-1. Canceling the inner transaction will cause the outer
- transaction commit to fail, and will not undo any operations
- since the inner transaction began. This problem is soluble with
- some additional internal code.
-2. An inner transaction commit can be cancelled by the outer
- transaction. This is desirable in the way which Samba's
- database initialization code uses transactions, but could be a
- surprise to any users expecting a successful transaction commit
- to expose changes to others.
-The current solution is to specify the behavior at tdb_open(),
-with the default currently that nested transactions are allowed.
-This flag can also be changed at runtime.
-2.3.1 Proposed Solution
-Given the usage patterns, it seems that the “least-surprise”
-behavior of disallowing nested transactions should become the
-default. Additionally, it seems the outer transaction is the only
-code which knows whether inner transactions should be allowed, so
-a flag to indicate this could be added to tdb_transaction_start.
-However, this behavior can be simulated with a wrapper which uses
-tdb_add_flags() and tdb_remove_flags(), so the API should not be
-expanded for this relatively-obscure case.
-2.3.2 Status
-Incomplete; nesting flag is still defined as per tdb1.
-2.4 Incorrect Hash Function is Not Detected
-tdb_open_ex() allows the calling code to specify a different hash
-function to use, but does not check that all other processes
-accessing this tdb are using the same hash function. The result
-is that records are missing from tdb_fetch().
-2.4.1 Proposed Solution
-The header should contain an example hash result (eg. the hash of
-0xdeadbeef), and tdb_open_ex() should check that the given hash
-function produces the same answer, or fail the tdb_open call.
-2.4.2 Status
-2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-In response to scalability issues with the free list ([TDB-Freelist-Is]
-) two API workarounds have been incorporated in TDB:
-tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
-latter actually calls the former with an argument of “5”.
-This code allows deleted records to accumulate without putting
-them in the free list. On delete we iterate through each chain
-and free them in a batch if there are more than max_dead entries.
-These are never otherwise recycled except as a side-effect of a
-2.5.1 Proposed Solution
-With the scalability problems of the freelist solved, this API
-can be removed. The TDB_VOLATILE flag may still be useful as a
-hint that store and delete of records will be at least as common
-as fetch in order to allow some internal tuning, but initially
-will become a no-op.
-2.5.2 Status
-Incomplete. TDB_VOLATILE still defined, but implementation should
-fail on unknown flags to be future-proof.
-2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
- In The Same Process
-No process can open the same TDB twice; we check and disallow it.
-This is an unfortunate side-effect of fcntl locks, which operate
-on a per-file rather than per-file-descriptor basis, and do not
-nest. Thus, closing any file descriptor on a file clears all the
-locks obtained by this process, even if they were placed using a
-different file descriptor!
-Note that even if this were solved, deadlock could occur if
-operations were nested: this is a more manageable programming
-error in most cases.
-2.6.1 Proposed Solution
-We could lobby POSIX to fix the perverse rules, or at least lobby
-Linux to violate them so that the most common implementation does
-not have this restriction. This would be a generally good idea
-for other fcntl lock users.
-Samba uses a wrapper which hands out the same tdb_context to
-multiple callers if this happens, and does simple reference
-counting. We should do this inside the tdb library, which already
-emulates lock nesting internally; it would need to recognize when
-deadlock occurs within a single process. This would create a new
-failure mode for tdb operations (while we currently handle
-locking failures, they are impossible in normal use and a process
-encountering them can do little but give up).
-I do not see benefit in an additional tdb_open flag to indicate
-whether re-opening is allowed, although there may be some
-benefit to adding a call to detect when a tdb_context is shared,
-to allow others to create such an API.
-2.6.2 Status
-2.7 TDB API Is Not POSIX Thread-safe
-The TDB API uses an error code which can be queried after an
-operation to determine what went wrong. This programming model
-does not work with threads, unless specific additional guarantees
-are given by the implementation. In addition, even
-otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
-2.7.1 Proposed Solution
-Rearchitecting the API to include a tdb_errcode pointer would be a
-great deal of churn; we are better to guarantee that the
-tdb_errcode is per-thread so the current programming model can be
-This requires dynamic per-thread allocations, which is awkward
-with POSIX threads (pthread_key_create space is limited and we
-cannot simply allocate a key for every TDB).
-Internal locking is required to make sure that fcntl locks do not
-overlap between threads, and also that the global list of tdbs is
-The aim is that building tdb with -DTDB_PTHREAD will result in a
-pthread-safe version of the library, and otherwise no overhead
-will exist. Alternatively, a hooking mechanism similar to that
-proposed for [Proposed-Solution-locking-hook] could be used to
-enable pthread locking at runtime.
-2.7.2 Status
-2.8 *_nonblock Functions And *_mark Functions Expose
- Implementation
-Clustered TDB, see
-] wishes to operate on TDB in a non-blocking manner. This is
-currently done as follows:
-1. Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock). If this fails:
-2. Fork a child process, and wait for it to call the normal
- variant (eg. tdb_lockall).
-3. If the child succeeds, call the _mark variant to indicate we
- already have the locks (eg. tdb_lockall_mark).
-4. Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-5. Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-There are several issues with this approach. Firstly, adding two
-new variants of each function clutters the API for an obscure
-use, and so not all functions have three variants. Secondly, it
-assumes that all paths of the functions ask for the same locks,
-otherwise the parent process will have to get a lock which the
-child doesn't have under some circumstances. I don't believe this
-is currently the case, but it constrains the implementation.
-2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
-Implement a hook for locking methods, so that the caller can
-control the calls to create and remove fcntl locks. In this
-scenario, ctdbd would operate as follows:
-1. Call the normal API function, eg tdb_lockall().
-2. When the lock callback comes in, check if the child has the
- lock. Initially, this is always false. If so, return 0.
- Otherwise, try to obtain it in non-blocking mode. If that
- fails, return EWOULDBLOCK.
-3. Release locks in the unlock callback as normal.
-4. If tdb_lockall() fails, see if we recorded a lock failure; if
- so, call the child to repeat the operation.
-5. The child records what locks it obtains, and returns that
- information to the parent.
-6. When the child has succeeded, goto 1.
-This is flexible enough to handle any potential locking scenario,
-even when lock requirements change. It can be optimized so that
-the parent does not release locks, just tells the child which
-locks it doesn't need to obtain.
-It also keeps the complexity out of the API, and in ctdbd where
-it is needed.
-2.8.2 Status
-2.9 tdb_chainlock Functions Expose Implementation
-tdb_chainlock locks some number of records, including the record
-indicated by the given key. This gave atomicity guarantees;
-no-one can start a transaction, alter, read or delete that key
-while the lock is held.
-It also makes the same guarantee for any other key in the chain,
-which is an internal implementation detail and potentially a
-cause for deadlock.
-2.9.1 Proposed Solution
-None. It would be nice to have an explicit single entry lock
-which affected no other keys. Unfortunately, this won't work for
-an entry which doesn't exist. Thus while chainlock may be
-implemented more efficiently for the existing case, it will still
-have overlap issues with the non-existing case. So it is best to
-keep the current (lack of) guarantee about which records will be
-affected to avoid constraining our implementation.
-2.10 Signal Handling is Not Race-Free
-The tdb_setalarm_sigptr() call allows the caller's signal handler
-to indicate that the tdb locking code should return with a
-failure, rather than trying again when a signal is received (and
-errno == EAGAIN). This is usually used to implement timeouts.
-Unfortunately, this does not work in the case where the signal is
-received before the tdb code enters the fcntl() call to place the
-lock: the code will sleep within the fcntl() code, unaware that
-the signal wants it to exit. In the case of long timeouts, this
-does not happen in practice.
-2.10.1 Proposed Solution
-The locking hooks proposed in[Proposed-Solution-locking-hook]
-would allow the user to decide on whether to fail the lock
-acquisition on a signal. This allows the caller to choose their
-own compromise: they could narrow the race by checking
-immediately before the fcntl call.[footnote:
-It may be possible to make this race-free in some implementations
-by having the signal handler alter the struct flock to make it
-invalid. This will cause the fcntl() lock call to fail with
-EINVAL if the signal occurs before the kernel is entered,
-otherwise EAGAIN.
-2.10.2 Status
-2.11 The API Uses Gratuitous Typedefs, Capitals
-typedefs are useful for providing source compatibility when types
-can differ across implementations, or arguably in the case of
-function pointer definitions which are hard for humans to parse.
-Otherwise it is simply obfuscation and pollutes the namespace.
-Capitalization is usually reserved for compile-time constants and
- TDB_CONTEXT There is no reason to use this over 'struct
- tdb_context'; the definition isn't visible to the API user
- anyway.
- TDB_DATA There is no reason to use this over struct TDB_DATA;
- the struct needs to be understood by the API user.
- struct TDB_DATA This would normally be called 'struct
- tdb_data'.
- enum TDB_ERROR Similarly, this would normally be enum
- tdb_error.
-2.11.1 Proposed Solution
-None. Introducing lower case variants would please pedants like
-myself, but if it were done the existing ones should be kept.
-There is little point forcing a purely cosmetic change upon tdb
-2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
- Private Pointer
-For API compatibility reasons, the logging function needs to call
-tdb_get_logging_private() to retrieve the pointer registered by
-the tdb_open_ex for logging.
-2.12.1 Proposed Solution
-It should simply take an extra argument, since we are prepared to
-break the API/ABI.
-2.12.2 Status
-2.13 Various Callback Functions Are Not Typesafe
-The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
-and tdb_check all take void * and must internally convert it to
-the argument type they were expecting.
-If this type changes, the compiler will not produce warnings on
-the callers, since it only sees void *.
-2.13.1 Proposed Solution
-With careful use of macros, we can create callback functions
-which give a warning when used on gcc and the types of the
-callback and its private argument differ. Unsupported compilers
-will not give a warning, which is no worse than now. In addition,
-the callbacks become clearer, as they need not use void * for
-their parameter.
-See CCAN's typesafe_cb module at
-2.13.2 Status
-2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
- tdb_reopen_all Problematic
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
-file should be cleared if the caller discovers it is the only
-process with the TDB open. However, if any caller does not
-specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
-the TDB erased underneath them (usually resulting in a crash).
-There is a similar issue on fork(); if the parent exits (or
-otherwise closes the tdb) before the child calls tdb_reopen_all()
-to establish the lock used to indicate the TDB is opened by
-someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
-it alone has opened the TDB and will erase it.
-2.14.1 Proposed Solution
-Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
-see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
-2.14.2 Status
-Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
-2.15 Extending The Header Is Difficult
-We have reserved (zeroed) words in the TDB header, which can be
-used for future features. If the future features are compulsory,
-the version number must be updated to prevent old code from
-accessing the database. But if the future feature is optional, we
-have no way of telling if older code is accessing the database or
-2.15.1 Proposed Solution
-The header should contain a “format variant” value (64-bit). This
-is divided into two 32-bit parts:
-1. The lower part reflects the format variant understood by code
- accessing the database.
-2. The upper part reflects the format variant you must understand
- to write to the database (otherwise you can only open for
- reading).
-The latter field can only be written at creation time, the former
-should be written under the OPEN_LOCK when opening the database
-for writing, if the variant of the code is lower than the current
-lowest variant.
-This should allow backwards-compatible features to be added, and
-detection if older code (which doesn't understand the feature)
-writes to the database.
-2.15.2 Status
-2.16 Record Headers Are Not Expandable
-If we later want to add (say) checksums on keys and data, it
-would require another format change, which we'd like to avoid.
-2.16.1 Proposed Solution
-We often have extra padding at the tail of a record. If we ensure
-that the first byte (if any) of this padding is zero, we will
-have a way for future changes to detect code which doesn't
-understand a new format: the new code would write (say) a 1 at
-the tail, and thus if there is no tail or the first byte is 0, we
-would know the extension is not present on that record.
-2.16.2 Status
-2.17 TDB Does Not Use Talloc
-Many users of TDB (particularly Samba) use the talloc allocator,
-and thus have to wrap TDB in a talloc context to use it
-2.17.1 Proposed Solution
-The allocation within TDB is not complicated enough to justify
-the use of talloc, and I am reluctant to force another
-(excellent) library on TDB users. Nonetheless a compromise is
-possible. An attribute (see [attributes]) can be added later to
-tdb_open() to provide an alternate allocation mechanism,
-specifically for talloc but usable by any other allocator (which
-would ignore the “context” argument).
-This would form a talloc hierarchy as expected, but the caller
-would still have to attach a destructor to the tdb context
-returned from tdb_open to close it. All TDB_DATA fields would be
-children of the tdb_context, and the caller would still have to
-manage them (using talloc_free() or talloc_steal()).
-2.17.2 Status
-3 Performance And Scalability Issues
- Imposes Performance Penalty
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
-placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
-never conflict in normal tdb usage, they do add substantial
-overhead for most fcntl lock implementations when the kernel
-scans to detect if a lock conflict exists. This is often a single
-linked list, making the time to acquire and release a fcntl lock
-O(N) where N is the number of processes with the TDB open, not
-the number actually doing work.
-In a Samba server it is common to have huge numbers of clients
-sitting idle, and thus they have weaned themselves off the
-TDB_CLEAR_IF_FIRST flag.[footnote:
-There is a flag to tdb_reopen_all() which is used for this
-optimization: if the parent process will outlive the child, the
-child does not need the ACTIVE_LOCK. This is a workaround for
-this very performance issue.
-3.1.1 Proposed Solution
-Remove the flag. It was a neat idea, but even trivial servers
-tend to know when they are initializing for the first time and
-can simply unlink the old tdb at that point.
-3.1.2 Status
-Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
-3.2 TDB Files Have a 4G Limit
-This seems to be becoming an issue (so much for “trivial”!),
-particularly for ldb.
-3.2.1 Proposed Solution
-A new, incompatible TDB format which uses 64 bit offsets
-internally rather than 32 bit as now. For simplicity of endian
-conversion (which TDB does on the fly if required), all values
-will be 64 bit on disk. In practice, some upper bits may be used
-for other purposes, but at least 56 bits will be available for
-file offsets.
-tdb_open() will automatically detect the old version, and even
-create them if TDB_VERSION6 is specified to tdb_open.
-32 bit processes will still be able to access TDBs larger than 4G
-(assuming that their off_t allows them to seek to 64 bits), they
-will gracefully fall back as they fail to mmap. This can happen
-already with large TDBs.
-Old versions of tdb will fail to open the new TDB files (since 28
-August 2009, commit 398d0c29290: prior to that any unrecognized
-file format would be erased and initialized as a fresh tdb!)
-3.2.2 Status
-3.3 TDB Records Have a 4G Limit
-This has not been a reported problem, and the API uses size_t
-which can be 64 bit on 64 bit platforms. However, other limits
-may have made such an issue moot.
-3.3.1 Proposed Solution
-Record sizes will be 64 bit, with an error returned on 32 bit
-platforms which try to access such records (the current
-implementation would return TDB_ERR_OOM in a similar case). It
-seems unlikely that 32 bit keys will be a limitation, so the
-implementation may not support this (see [sub:Records-Incur-A]).
-3.3.2 Status
-3.4 Hash Size Is Determined At TDB Creation Time
-TDB contains a number of hash chains in the header; the number is
-specified at creation time, and defaults to 131. This is such a
-bottleneck on large databases (as each hash chain gets quite
-long), that LDB uses 10,000 for this hash. In general it is
-impossible to know what the 'right' answer is at database
-creation time.
-3.4.1 <sub:Hash-Size-Solution>Proposed Solution
-After comprehensive performance testing on various scalable hash
- and
-This was annoying because I was previously convinced that an
-expanding tree of hashes would be very close to optimal.
-], it became clear that it is hard to beat a straight linear hash
-table which doubles in size when it reaches saturation.
-Unfortunately, altering the hash table introduces serious locking
-complications: the entire hash table needs to be locked to
-enlarge the hash table, and others might be holding locks.
-Particularly insidious are insertions done under tdb_chainlock.
-Thus an expanding layered hash will be used: an array of hash
-groups, with each hash group exploding into pointers to lower
-hash groups once it fills, turning into a hash tree. This has
-implications for locking: we must lock the entire group in case
-we need to expand it, yet we don't know how deep the tree is at
-that point.
-Note that bits from the hash table entries should be stolen to
-hold more hash bits to reduce the penalty of collisions. We can
-use the otherwise-unused lower 3 bits. If we limit the size of
-the database to 64 exabytes, we can use the top 8 bits of the
-hash entry as well. These 11 bits would reduce false positives
-down to 1 in 2000 which is more than we need: we can use one of
-the bits to indicate that the extra hash bits are valid. This
-means we can choose not to re-hash all entries when we expand a
-hash group; simply use the next bits we need and mark them
-3.4.2 Status
-3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
-TDB uses a single linked list for the free list. Allocation
-occurs as follows, using heuristics which have evolved over time:
-1. Get the free list lock for this whole operation.
-2. Multiply length by 1.25, so we always over-allocate by 25%.
-3. Set the slack multiplier to 1.
-4. Examine the current freelist entry: if it is > length but <
- the current best case, remember it as the best case.
-5. Multiply the slack multiplier by 1.05.
-6. If our best fit so far is less than length * slack multiplier,
- return it. The slack will be turned into a new free record if
- it's large enough.
-7. Otherwise, go onto the next freelist entry.
-Deleting a record occurs as follows:
-1. Lock the hash chain for this whole operation.
-2. Walk the chain to find the record, keeping the prev pointer
- offset.
-3. If max_dead is non-zero:
- (a) Walk the hash chain again and count the dead records.
- (b) If it's more than max_dead, bulk free all the dead ones
- (similar to steps 4 and below, but the lock is only obtained
- once).
- (c) Simply mark this record as dead and return.
-4. Get the free list lock for the remainder of this operation.
-5. <right-merging>Examine the following block to see if it is
- free; if so, enlarge the current block and remove that block
- from the free list. This was disabled, as removal from the free
- list was O(entries-in-free-list).
-6. Examine the preceding block to see if it is free: for this
- reason, each block has a 32-bit tailer which indicates its
- length. If it is free, expand it to cover our new block and
- return.
-7. Otherwise, prepend ourselves to the free list.
-Disabling right-merging (step [right-merging]) causes
-fragmentation; the other heuristics proved insufficient to
-address this, so the final answer to this was that when we expand
-the TDB file inside a transaction commit, we repack the entire
-database.
-The single list lock limits our allocation rate; due to the other
-issues this is not currently seen as a bottleneck.
-3.5.1 Proposed Solution
-The first step is to remove all the current heuristics, as they
-obviously interact, then examine them once the lock contention is
-The free list must be split to reduce contention. Assuming
-perfect free merging, we can at most have 1 free list entry for
-each entry. This implies that the number of free lists is related
-to the size of the hash table, but as it is rare to walk a large
-number of free list entries we can use far fewer, say 1/32 of the
-number of hash buckets.
-It seems tempting to try to reuse the hash implementation which
-we use for records here, but we have two ways of searching for
-free entries: for allocation we search by size (and possibly
-zone) which produces too many clashes for our hash table to
-handle well, and for coalescing we search by address. Thus an
-array of doubly-linked free lists seems preferable.
-There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
-) but it's not clear this would reduce contention in the common
-case where all processes are allocating/freeing the same size.
-Thus we almost certainly need to divide in other ways: the most
-obvious is to divide the file into zones, and using a free list
-(or table of free lists) for each. This approximates address
-ordering.
-Unfortunately it is difficult to know what heuristics should be
-used to determine zone sizes, and our transaction code relies on
-being able to create a “recovery area” by simply appending to the
-file (difficult if it would need to create a new zone header).
-Thus we use a linked-list of free tables; currently we only ever
-create one, but if there is more than one we choose one at random
-to use. In future we may use heuristics to add new free tables on
-contention. We only expand the file when all free tables are
-exhausted.
-The basic algorithm is as follows. Freeing is simple:
-1. Identify the correct free list.
-2. Lock the corresponding list.
-3. Re-check the list (we didn't have a lock, sizes could have
- changed): relock if necessary.
-4. Place the freed entry in the list.
-Allocation is a little more complicated, as we perform delayed
-coalescing at this point:
-1. Pick a free table; usually the previous one.
-2. Lock the corresponding list.
-3. If the top entry is large enough, remove it from the list and
- return it.
-4. Otherwise, coalesce entries in the list. If there was no entry
- large enough, unlock the list and try the next largest list.
-5. If no list has an entry which meets our needs, try the next
- free table.
-6. If no zone satisfies, expand the file.
-This optimizes rapid insert/delete of free list entries by not
-coalescing them all the time. First-fit address
-ordering seems to be fairly good for keeping fragmentation low
-(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
-does not need a tailer to coalesce, though if we needed one we
-could have one cheaply: see [sub:Records-Incur-A].
-Each free entry has the free table number in the header: less
-than 255. It also contains a doubly-linked list for easy
-3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
-Much of this is a result of allocation strategy[footnote:
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
-] and deliberate hobbling of coalescing; internal fragmentation
-(aka overallocation) is deliberately set at 25%, and external
-fragmentation is only cured by the decision to repack the entire
-db when a transaction commit needs to enlarge the file.
-3.6.1 Proposed Solution
-The 25% overhead on allocation works in practice for ldb because
-indexes tend to expand by one record at a time. This internal
-fragmentation can be resolved by having an “expanded” bit in the
-header to note entries that have previously expanded, and
-allocating more space for them.
-There is a spectrum of possible solutions for external
-fragmentation: one is to use a fragmentation-avoiding allocation
-strategy such as best-fit address-order allocator. The other end
-of the spectrum would be to use a bump allocator (very fast and
-simple) and simply repack the file when we reach the end.
-There are three problems with efficient fragmentation-avoiding
-allocators: they are non-trivial, they tend to use a single free
-list for each size, and there's no evidence that tdb allocation
-patterns will match those recorded for general allocators (though
-it seems likely).
-Thus we don't spend too much effort on external fragmentation; we
-will be no worse than the current code if we need to repack on
-occasion. More effort is spent on reducing freelist contention,
-and reducing overhead.
-3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
-Each TDB record has a header as follows:
-struct tdb_record {
- tdb_off_t next; /* offset of the next record in the list */
- tdb_len_t rec_len; /* total byte length of record */
- tdb_len_t key_len; /* byte length of key */
- tdb_len_t data_len; /* byte length of data */
- uint32_t full_hash; /* the full 32 bit hash of the key */
- uint32_t magic; /* try to catch errors */
- /* the following union is implied:
- union {
- char record[rec_len];
- struct {
- char key[key_len];
- char data[data_len];
- }
- uint32_t totalsize; (tailer)
- }
- */
-Naively, this would double to a 56-byte overhead on a 64 bit
-3.7.1 Proposed Solution
-We can use various techniques to reduce this for an allocated
-1. The 'next' pointer is not required, as we are using a flat
- hash table.
-2. 'rec_len' can instead be expressed as an addition to key_len
- and data_len (it accounts for wasted or overallocated length in
- the record). Since the record length is always a multiple of 8,
- we can conveniently fit it in 32 bits (representing up to 35
- bits).
-3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
- restrict 'data_len' to 32 bits, but instead we can combine the
- two into one 64-bit field and use a 5 bit value which
- indicates at what bit to divide the two. Keys are unlikely to
- scale as fast as data, so I'm assuming a maximum key size of 32
- bits.
-4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
- this is diminishing returns after a handful of bits (at 10
- bits, it reduces 99.9% of false memcmp). As an aside, as the
- lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here. Note that it's
- not clear that these bits will be a win, given the extra bits
- in the hash table itself (see [sub:Hash-Size-Solution]).
-5. 'magic' does not need to be enlarged: it currently reflects
- one of 5 values (used, free, dead, recovery, and
- unused_recovery). It is useful for quick sanity checking
- however, and should not be eliminated.
-6. 'tailer' is only used to coalesce free blocks (so a block to
- the right can find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of
- the following block (and the tailer only exists in free
- blocks).[footnote:
-This technique from Thomas Standish. Data Structure Techniques.
-Addison-Wesley, Reading, Massachusetts, 1980.
-] The current proposed coalescing algorithm doesn't need this,
- however.
-This produces a 16 byte used header like this:
-struct tdb_used_record {
- uint32_t used_magic : 16,
- key_data_divide: 5,
- top_hash: 11;
- uint32_t extra_octets;
- uint64_t key_and_data_len;
-And a free record like this:
-struct tdb_free_record {
- uint64_t free_magic: 8,
- prev : 56;
- uint64_t free_table: 8,
- total_length : 56;
- uint64_t next;
-Note that by limiting valid offsets to 56 bits, we can pack
-everything we need into 3 64-bit words, meaning our minimum
-record size is 8 bytes.
-3.7.2 Status
-3.8 Transaction Commit Requires 4 fdatasync
-The current transaction algorithm is:
-1. write_recovery_data();
-2. sync();
-3. write_recovery_header();
-4. sync();
-5. overwrite_with_new_data();
-6. sync();
-7. remove_recovery_header();
-8. sync();
-On current ext3, each sync flushes all data to disk, so the next
-3 syncs are relatively inexpensive. But this could become a
-performance bottleneck on other filesystems such as ext4.
-3.8.1 Proposed Solution
-Neil Brown points out that this is overzealous, and only one sync
-is needed:
-1. Bundle the recovery data, a transaction counter and a strong
- checksum of the new data.
-2. Strong checksum that whole bundle.
-3. Store the bundle in the database.
-4. Overwrite the oldest of the two recovery pointers in the
- header (identified using the transaction counter) with the
- offset of this bundle.
-5. sync.
-6. Write the new data to the file.
-Checking for recovery means identifying the latest bundle with a
-valid checksum and using the new data checksum to ensure that it
-has been applied. This is more expensive than the current check,
-but need only be done at open. For running databases, a separate
-header field can be used to indicate a transaction in progress;
-we need only check for recovery if this is set.
-3.8.2 Status
-3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
-3.9.1 Proposed Solution
-None. At some point you say “use a real
-database” (but see [replay-attribute]).
-But as a thought experiment, if we implemented transactions to
-only overwrite free entries (this is tricky: there must not be a
-header in each entry which indicates whether it is free, but use
-of presence in metadata elsewhere), and a pointer to the hash
-table, we could create an entirely new commit without destroying
-existing data. Then it would be easy to implement snapshots in a
-similar way.
-This would not allow arbitrary changes to the database, such as
-tdb_repack does, and would require more space (since we have to
-preserve the current and future entries at once). If we used hash
-trees rather than one big hash table, we might only have to
-rewrite some sections of the hash, too.
-We could then implement snapshots using a similar method, using
-multiple different hash tables/free tables.
-3.9.2 Status
-3.10 Transactions Cannot Operate in Parallel
-This would be useless for ldb, as it hits the index records with
-just about every update. It would add significant complexity in
-resolving clashes, and cause all transaction callers to write
-their code to loop in the case where the transactions spuriously
-3.10.1 Proposed Solution
-None (but see [replay-attribute]). We could solve a small part of
-the problem by providing read-only transactions. These would
-allow one write transaction to begin, but it could not commit
-until all r/o transactions are done. This would require a new
-RO_TRANSACTION_LOCK, which would be upgraded on commit.
-3.10.2 Status
-3.11 Default Hash Function Is Suboptimal
-The Knuth-inspired multiplicative hash used by tdb is fairly slow
-(especially if we expand it to 64 bits), and works best when the
-hash bucket size is a prime number (which also means a slow
-modulus). In addition, it is highly predictable which could
-potentially lead to a Denial of Service attack in some TDB uses.
-3.11.1 Proposed Solution
-The Jenkins lookup3 hash[footnote:
-http://burtleburtle.net/bob/c/lookup3.c
-] is a fast and superbly-mixing hash. It's used by the Linux
-kernel and almost everything else. This has the particular
-properties that it takes an initial seed, and produces two 32 bit
-hash numbers, which we can combine into a 64-bit hash.
-The seed should be created at tdb-creation time from some random
-source, and placed in the header. This is far from foolproof, but
-adds a little bit of protection against hash bombing.
-3.11.2 Status
-3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
-We lock a record during traversal iteration, and try to grab that
-lock in the delete code. If that grab on delete fails, we simply
-mark it deleted and continue onwards; traversal checks for this
-condition and does the delete when it moves off the record.
-If traversal terminates, the dead record may be left
-3.12.1 Proposed Solution
-Remove reliability guarantees; see [traverse-Proposed-Solution].
-3.12.2 Status
-3.13 Fcntl Locking Adds Overhead
-Placing a fcntl lock means a system call, as does removing one.
-This is actually one reason why transactions can be faster
-(everything is locked once at transaction start). In the
-uncontended case, this overhead can theoretically be eliminated.
-3.13.1 Proposed Solution
-We tried this before with spinlock support, in the early days of
-TDB, and it didn't make much difference except in manufactured
-We could use spinlocks (with futex kernel support under Linux),
-but it means that we lose automatic cleanup when a process dies
-with a lock. There is a method of auto-cleanup under Linux, but
-it's not supported by other operating systems. We could
-reintroduce a clear-if-first-style lock and sweep for dead
-futexes on open, but that wouldn't help the normal case of one
-concurrent opener dying. Increasingly elaborate repair schemes
-could be considered, but they require an ABI change (everyone
-must use them) anyway, so there's no need to do this at the same
-time as everything else.
-3.14 Some Transactions Don't Require Durability
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
-normal (fast) usage, and occasionally empties the results into a
-transactional TDB. This kind of usage prioritizes performance
-over durability: as long as we are consistent, data can be lost.
-This would be more neatly implemented inside tdb: a “soft”
-transaction commit (ie. syncless) which meant that data may be
-reverted on a crash.
-3.14.1 Proposed Solution
-Unfortunately any transaction scheme which overwrites old data
-requires a sync before that overwrite to avoid the possibility of
-It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
-, where transactions are committed without overwriting existing
-data, and an array of top-level pointers were available in the
-header. If the transaction is “soft” then we would not need a
-sync at all: existing processes would pick up the new hash table
-and free list and work with that.
-At some later point, a sync would allow recovery of the old data
-into the free lists (perhaps when the array of top-level pointers
-filled). On crash, tdb_open() would examine the array of top
-levels, and apply the transactions until it encountered an
-invalid checksum.
-3.15 Tracing Is Fragile, Replay Is External
-The current TDB has compile-time-enabled tracing code, but it
-often breaks as it is not enabled by default. In a similar way,
-the ctdb code has an external wrapper which does replay tracing
-so it can coordinate cluster-wide transactions.
-3.15.1 Proposed Solution<replay-attribute>
-Tridge points out that an attribute can be later added to
-tdb_open (see [attributes]) to provide replay/trace hooks, which
-could become the basis for this and future parallel transactions
-and snapshot support.
-3.15.2 Status
diff --git a/lib/tdb2/free.c b/lib/tdb2/free.c
deleted file mode 100644
index c4015a0f2a..0000000000
--- a/lib/tdb2/free.c
+++ /dev/null
@@ -1,976 +0,0 @@
- /*
- Trivial Database 2: free list/block handling
- Copyright (C) Rusty Russell 2010
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/ilog/ilog.h>
-#include <time.h>
-#include <assert.h>
-#include <limits.h>
-static unsigned fls64(uint64_t val) /* Local alias: simply forwards to ilog64(val). */
- return ilog64(val);
-/* In which bucket would we find a particular record size? (ignoring header)
- *
- * data_len must be at least TDB_MIN_DATA_LEN (asserted).  Let excess be
- * data_len - TDB_MIN_DATA_LEN: an excess of up to 64 maps linearly to
- * bucket excess / 8; anything larger maps to fls64(excess) + 2.  The
- * result is clamped to TDB_FREE_BUCKETS - 1. */
-unsigned int size_to_bucket(tdb_len_t data_len)
- unsigned int bucket;
- /* We can't have records smaller than this. */
- assert(data_len >= TDB_MIN_DATA_LEN);
- /* Ignoring the header... */
- if (data_len - TDB_MIN_DATA_LEN <= 64) {
- /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
- bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
- } else {
- /* After that we go power of 2. */
- bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
- }
- if (unlikely(bucket >= TDB_FREE_BUCKETS))
- bucket = TDB_FREE_BUCKETS - 1;
- return bucket;
-tdb_off_t first_ftable(struct tdb_context *tdb) /* Reads the free_table field of struct tdb_header; the tdb_read_off() result is returned unchanged. */
- return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
-tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable) /* Reads the next field of the struct tdb_freetable located at offset ftable. */
- return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next));
-enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb) /* Chooses which free table this context uses; sets tdb->ftable_off and tdb->ftable. */
- /* Use reservoir sampling algorithm to select a free list at random.
- * Each table in the chain gets one random() draw; the table with the
- * largest draw seen so far (ties go to the later table) is remembered
- * as tdb->ftable_off (its offset) and tdb->ftable (its position in the
- * chain, counting from 0). */
- unsigned int rnd, max = 0, count = 0;
- tdb_off_t off;
- tdb->ftable_off = off = first_ftable(tdb);
- tdb->ftable = 0;
- while (off) {
- if (TDB_OFF_IS_ERR(off)) { /* a failed read of the chain aborts the walk */
- return TDB_OFF_TO_ERR(off);
- }
- rnd = random();
- if (rnd >= max) {
- tdb->ftable_off = off;
- tdb->ftable = count;
- max = rnd;
- }
- off = next_ftable(tdb, off);
- count++;
- }
- return TDB_SUCCESS;
-/* Offset of a given bucket.
- *
- * Pure arithmetic: the start of the free table at ftable_off, plus the
- * offset of its buckets member, plus bucket entries of sizeof(tdb_off_t)
- * each.  No bounds check is made on bucket. */
-tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
- return ftable_off + offsetof(struct tdb_freetable, buckets)
- + bucket * sizeof(tdb_off_t);
-/* Returns free_buckets + 1, or list number to search, or -ve error.
- *
- * Delegates entirely to tdb_find_nonzero_off(), passing the offset of
- * bucket 0 of the table at ftable_off, the starting index bucket and the
- * limit TDB_FREE_BUCKETS.  NOTE(review): the exact return convention is
- * that helper's; it is not visible here -- confirm against its definition. */
-static tdb_off_t find_free_head(struct tdb_context *tdb,
- tdb_off_t ftable_off,
- tdb_off_t bucket)
- /* Speculatively search for a non-zero bucket. */
- return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
- bucket, TDB_FREE_BUCKETS);
-static void check_list(struct tdb_context *tdb, tdb_off_t b_off) /* Debug consistency walk of the free list whose head is stored at b_off; calls abort() on any mismatch. */
- tdb_off_t off, prev = 0, first;
- struct tdb_free_record r;
- first = off = (tdb_read_off(tdb, b_off) & TDB_OFF_MASK); /* mask off the upper bits kept in the head word */
- while (off != 0) {
- tdb_read_convert(tdb, off, &r, sizeof(r)); /* return value is not checked */
- if (frec_magic(&r) != TDB_FREE_MAGIC)
- abort();
- if (prev && frec_prev(&r) != prev) /* each entry's prev must be the entry visited before it */
- abort();
- prev = off;
- off =; /* NOTE(review): right-hand side is missing in this copy -- presumably r.next; confirm */
- }
- if (first) {
- tdb_read_convert(tdb, first, &r, sizeof(r));
- if (frec_prev(&r) != prev) /* the head's prev must be the last entry visited (the tail) */
- abort();
- }
-/* Remove from free bucket.
- *
- * b_off is the offset of the bucket head word, r_off the offset of the
- * record being unlinked and r its already-read contents.  Judging by the
- * checks below, the list is linked so that the head's prev refers to the
- * tail while the tail's next is 0.
- *
- * Returns the result of the final tdb_write_off(), an earlier read/write
- * error, or whatever tdb_logerr() returns for TDB_ERR_CORRUPT when the
- * bucket head or the neighbouring prev link does not point at r_off. */
-static enum TDB_ERROR remove_from_list(struct tdb_context *tdb,
- tdb_off_t b_off, tdb_off_t r_off,
- const struct tdb_free_record *r)
- tdb_off_t off, prev_next, head;
- enum TDB_ERROR ecode;
- /* Is this only element in list? Zero out bucket, and we're done. */
- if (frec_prev(r) == r_off)
- return tdb_write_off(tdb, b_off, 0);
- /* off = &r->prev->next */
- off = frec_prev(r) + offsetof(struct tdb_free_record, next);
- /* Get prev->next */
- prev_next = tdb_read_off(tdb, off);
- if (TDB_OFF_IS_ERR(prev_next))
- return TDB_OFF_TO_ERR(prev_next);
- /* If prev->next == 0, we were head: update bucket to point to next. */
- if (prev_next == 0) {
- /* We must preserve upper bits. */
- head = tdb_read_off(tdb, b_off);
- if (TDB_OFF_IS_ERR(head))
- return TDB_OFF_TO_ERR(head);
- if ((head & TDB_OFF_MASK) != r_off) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "remove_from_list:"
- " %llu head %llu on list %llu",
- (long long)r_off,
- (long long)head,
- (long long)b_off);
- }
- head = ((head & ~TDB_OFF_MASK) | r->next);
- ecode = tdb_write_off(tdb, b_off, head);
- if (ecode != TDB_SUCCESS)
- return ecode;
- } else {
- /* r->prev->next = r->next */
- ecode = tdb_write_off(tdb, off, r->next);
- if (ecode != TDB_SUCCESS)
- return ecode;
- }
- /* If we were the tail, off = &head->prev. */
- if (r->next == 0) {
- head = tdb_read_off(tdb, b_off);
- if (TDB_OFF_IS_ERR(head))
- return TDB_OFF_TO_ERR(head);
- head &= TDB_OFF_MASK;
- off = head + offsetof(struct tdb_free_record, magic_and_prev);
- } else {
- /* off = &r->next->prev */
- off = r->next + offsetof(struct tdb_free_record,
- magic_and_prev);
- }
- /* *off == r (the read result is masked and compared directly; a read
- * error is not distinguished from a mismatch here) */
- if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "remove_from_list:"
- " %llu bad prev in list %llu",
- (long long)r_off, (long long)b_off);
- }
- /* r->next->prev = r->prev */
- return tdb_write_off(tdb, off, r->magic_and_prev);
-/* Enqueue in this free bucket: sets coalesce if we've added 128
- * entries to it.
- *
- * Builds a free record for the len-byte region at off, tagged with
- * tdb->ftable, and makes it the new head of the list whose head word is
- * stored at b_off.  On entry *coalesce says whether the enqueue counter
- * kept in the upper bits of the head word should be advanced; on return
- * it is true only if that counter wrapped round to zero.
- *
- * Returns the result of the final tdb_write_convert(), an earlier
- * read/write error, or whatever tdb_logerr() returns for TDB_ERR_CORRUPT
- * when the existing head/tail links look wrong. */
-static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb,
- tdb_off_t b_off,
- tdb_off_t off,
- tdb_len_t len,
- bool *coalesce)
- struct tdb_free_record new;
- enum TDB_ERROR ecode;
- tdb_off_t prev, head;
- uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
- head = tdb_read_off(tdb, b_off);
- if (TDB_OFF_IS_ERR(head))
- return TDB_OFF_TO_ERR(head);
- /* We only need to set ftable_and_len; rest is set in enqueue_in_free */
- new.ftable_and_len = ((uint64_t)tdb->ftable
- << (64 - TDB_OFF_UPPER_STEAL))
- | len;
- /* new->next = head. */
- = (head & TDB_OFF_MASK); /* NOTE(review): assignment target is missing in this copy -- presumably new.next; confirm */
- /* First element? Prev points to ourselves. */
- if (! { /* NOTE(review): condition operand is missing in this copy -- presumably new.next; confirm */
- new.magic_and_prev = (magic | off);
- } else {
- /* new->prev = next->prev */
- prev = tdb_read_off(tdb,
- + offsetof(struct tdb_free_record,
- magic_and_prev));
- new.magic_and_prev = prev;
- if (frec_magic(&new) != TDB_FREE_MAGIC) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "enqueue_in_free: %llu bad head"
- " prev %llu",
- (long long),
- (long long)prev);
- }
- /* next->prev = new. */
- ecode = tdb_write_off(tdb,
- + offsetof(struct tdb_free_record,
- magic_and_prev),
- off | magic);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- prev = tdb_read_off(tdb, frec_prev(&new)
- + offsetof(struct tdb_free_record, next));
- if (prev != 0) {
- return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "enqueue_in_free:"
- " %llu bad tail next ptr %llu",
- (long long)frec_prev(&new)
- + offsetof(struct tdb_free_record,
- next),
- (long long)prev);
- }
- }
- /* Update enqueue count, but don't set high bit: see TDB_OFF_IS_ERR */
- if (*coalesce)
- head += (1ULL << (64 - TDB_OFF_UPPER_STEAL));
- head &= ~(TDB_OFF_MASK | (1ULL << 63));
- head |= off;
- ecode = tdb_write_off(tdb, b_off, head);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* It's time to coalesce if counter wrapped. */
- if (*coalesce)
- *coalesce = ((head & ~TDB_OFF_MASK) == 0);
- return tdb_write_convert(tdb, off, &new, sizeof(new));
-static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable) /* Offset of free table number ftable, or an error offset if walking the chain fails. */
- tdb_off_t off;
- unsigned int i;
- if (likely(tdb->ftable == ftable)) /* the context's current table has its offset cached */
- return tdb->ftable_off;
- off = first_ftable(tdb);
- for (i = 0; i < ftable; i++) { /* otherwise follow ftable next-links from the first table */
- if (TDB_OFF_IS_ERR(off)) {
- break;
- }
- off = next_ftable(tdb, off);
- }
- return off;
-/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
- * need to blatt the *protect record (which is set to an error).
- *
- * Tries to merge the free record at off (data_len bytes of data, on the
- * list whose head is at b_off) with the free records that directly
- * follow it in the file.  Each following record is taken off its own
- * list under a non-blocking bucket lock; the walk stops at the first
- * record that is not free, is marked TDB_FTABLE_NONE, cannot be locked,
- * or changed while unlocked.
- *
- * Returns 0 if nothing adjacent could be merged, the usable length of
- * the enlarged record on success, or an error encoded with
- * TDB_ERR_TO_OFF().
- *
- * NOTE(review): in this copy *protect is only compared, never assigned,
- * and the label targeted by "goto err" is absent -- confirm against the
- * complete file. */
-static tdb_len_t coalesce(struct tdb_context *tdb,
- tdb_off_t off, tdb_off_t b_off,
- tdb_len_t data_len,
- tdb_off_t *protect)
- tdb_off_t end;
- struct tdb_free_record rec;
- enum TDB_ERROR ecode;
- tdb->stats.alloc_coalesce_tried++;
- end = off + sizeof(struct tdb_used_record) + data_len;
- while (end < tdb->file->map_size) {
- const struct tdb_free_record *r;
- tdb_off_t nb_off;
- unsigned ftable, bucket;
- r = tdb_access_read(tdb, end, sizeof(*r), true);
- if (TDB_PTR_IS_ERR(r)) {
- ecode = TDB_PTR_ERR(r);
- goto err;
- }
- if (frec_magic(r) != TDB_FREE_MAGIC
- || frec_ftable(r) == TDB_FTABLE_NONE) {
- tdb_access_release(tdb, r);
- break;
- }
- ftable = frec_ftable(r);
- bucket = size_to_bucket(frec_len(r));
- nb_off = ftable_offset(tdb, ftable);
- if (TDB_OFF_IS_ERR(nb_off)) {
- tdb_access_release(tdb, r);
- ecode = TDB_OFF_TO_ERR(nb_off);
- goto err;
- }
- nb_off = bucket_off(nb_off, bucket);
- tdb_access_release(tdb, r);
- /* We may be violating lock order here, so best effort. */
- if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) /* NOTE(review): the remainder of this condition is missing in this copy */
- tdb->stats.alloc_coalesce_lockfail++;
- break;
- }
- /* Now we have lock, re-check. */
- ecode = tdb_read_convert(tdb, end, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- tdb_unlock_free_bucket(tdb, nb_off);
- goto err;
- }
- if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
- tdb->stats.alloc_coalesce_race++;
- tdb_unlock_free_bucket(tdb, nb_off);
- break;
- }
- if (unlikely(frec_ftable(&rec) != ftable)
- || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
- tdb->stats.alloc_coalesce_race++;
- tdb_unlock_free_bucket(tdb, nb_off);
- break;
- }
- /* Did we just mess up a record you were hoping to use? */
- if (end == *protect) {
- tdb->stats.alloc_coalesce_iterate_clash++;
- }
- ecode = remove_from_list(tdb, nb_off, end, &rec);
- check_list(tdb, nb_off);
- if (ecode != TDB_SUCCESS) {
- tdb_unlock_free_bucket(tdb, nb_off);
- goto err;
- }
- end += sizeof(struct tdb_used_record) + frec_len(&rec);
- tdb_unlock_free_bucket(tdb, nb_off);
- tdb->stats.alloc_coalesce_num_merged++;
- }
- /* Didn't find any adjacent free? */
- if (end == off + sizeof(struct tdb_used_record) + data_len)
- return 0;
- /* Before we expand, check this isn't one you wanted protected? */
- if (off == *protect) {
- tdb->stats.alloc_coalesce_iterate_clash++;
- }
- /* OK, expand initial record */
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- goto err;
- }
- if (frec_len(&rec) != data_len) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "coalesce: expected data len %zu not %zu",
- (size_t)data_len, (size_t)frec_len(&rec));
- goto err;
- }
- ecode = remove_from_list(tdb, b_off, off, &rec);
- check_list(tdb, b_off);
- if (ecode != TDB_SUCCESS) {
- goto err;
- }
- /* Try locking violation first. We don't allow coalesce recursion! */
- ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false);
- if (ecode != TDB_SUCCESS) {
- /* Need to drop lock. Can't rely on anything stable. */
- tdb->stats.alloc_coalesce_lockfail++;
- /* We have to drop this to avoid deadlocks, so make sure record
- * doesn't get coalesced by someone else! */
- rec.ftable_and_len = (TDB_FTABLE_NONE
- << (64 - TDB_OFF_UPPER_STEAL))
- | (end - off - sizeof(struct tdb_used_record));
- ecode = tdb_write_off(tdb,
- off + offsetof(struct tdb_free_record,
- ftable_and_len),
- rec.ftable_and_len);
- if (ecode != TDB_SUCCESS) {
- goto err;
- }
- tdb_unlock_free_bucket(tdb, b_off);
- ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT,
- false);
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- } else if (TDB_OFF_IS_ERR(*protect)) {
- /* For simplicity, we always drop lock if they can't continue */
- tdb_unlock_free_bucket(tdb, b_off);
- }
- tdb->stats.alloc_coalesce_succeeded++;
- /* Return usable length. */
- return end - off - sizeof(struct tdb_used_record);
- /* To unify error paths, we *always* unlock bucket on error. */
- tdb_unlock_free_bucket(tdb, b_off);
- return TDB_ERR_TO_OFF(ecode);
-/* List is locked: we unlock it.
- *
- * Walks the free list whose head is stored at b_off, calling coalesce()
- * on each entry until the list ends or the budget limit runs out; a
- * successful merge tops the budget up again.  If entries remain, the list
- * is then re-linked so that the entry the walk stopped at becomes the
- * head and the entries already visited follow the old tail.
- *
- * Returns TDB_SUCCESS or the first error met.  The bucket at b_off is
- * unlocked on the visible return paths, either here or inside coalesce().
- * ftable_off is not used in the code visible here.
- *
- * NOTE(review): several assignment targets and the labels used by
- * "goto out" / "goto unlock_err" are missing in this copy -- confirm
- * against the complete file. */
-static enum TDB_ERROR coalesce_list(struct tdb_context *tdb,
- tdb_off_t ftable_off,
- tdb_off_t b_off,
- unsigned int limit)
- enum TDB_ERROR ecode;
- tdb_off_t off;
- off = tdb_read_off(tdb, b_off);
- if (TDB_OFF_IS_ERR(off)) {
- ecode = TDB_OFF_TO_ERR(off);
- goto unlock_err;
- }
- /* A little bit of paranoia: counter should be 0. */
- off &= TDB_OFF_MASK;
- while (off && limit--) {
- struct tdb_free_record rec;
- tdb_len_t coal;
- tdb_off_t next;
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- next =; /* NOTE(review): right-hand side is missing in this copy -- presumably rec.next; confirm */
- coal = coalesce(tdb, off, b_off, frec_len(&rec), &next);
- if (TDB_OFF_IS_ERR(coal)) {
- /* This has already unlocked on error. */
- return TDB_OFF_TO_ERR(coal);
- }
- if (TDB_OFF_IS_ERR(next)) {
- /* Coalescing had to unlock, so stop. */
- return TDB_SUCCESS;
- }
- /* Keep going if we're doing well... */
- limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN);
- off = next;
- }
- /* Now, move those elements to the tail of the list so we get something
- * else next time. */
- if (off) {
- struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec;
- tdb_off_t oldhoff, oldtoff, newtoff;
- /* The record we were up to is the new head. */
- ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- /* Get the new tail. */
- newtoff = frec_prev(&newhrec);
- ecode = tdb_read_convert(tdb, newtoff, &newtrec,
- sizeof(newtrec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- /* Get the old head. */
- oldhoff = tdb_read_off(tdb, b_off);
- if (TDB_OFF_IS_ERR(oldhoff)) {
- ecode = TDB_OFF_TO_ERR(oldhoff);
- goto unlock_err;
- }
- /* This could happen if they all coalesced away. */
- if (oldhoff == off)
- goto out;
- ecode = tdb_read_convert(tdb, oldhoff, &oldhrec,
- sizeof(oldhrec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- /* Get the old tail. */
- oldtoff = frec_prev(&oldhrec);
- ecode = tdb_read_convert(tdb, oldtoff, &oldtrec,
- sizeof(oldtrec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- /* Old tail's next points to old head. */
- = oldhoff; /* NOTE(review): assignment target is missing in this copy -- presumably oldtrec.next; confirm */
- /* Old head's prev points to old tail. */
- oldhrec.magic_and_prev
- | oldtoff; /* NOTE(review): the start of this expression is missing in this copy */
- /* New tail's next is 0. */
- = 0; /* NOTE(review): assignment target is missing in this copy -- presumably newtrec.next; confirm */
- /* Write out the modified versions. */
- ecode = tdb_write_convert(tdb, oldtoff, &oldtrec,
- sizeof(oldtrec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- ecode = tdb_write_convert(tdb, oldhoff, &oldhrec,
- sizeof(oldhrec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- ecode = tdb_write_convert(tdb, newtoff, &newtrec,
- sizeof(newtrec));
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- /* And finally link in new head. */
- ecode = tdb_write_off(tdb, b_off, off);
- if (ecode != TDB_SUCCESS)
- goto unlock_err;
- }
- tdb_unlock_free_bucket(tdb, b_off);
- return TDB_SUCCESS;
- tdb_unlock_free_bucket(tdb, b_off); /* NOTE(review): presumably the unlock_err path; its label is not present in this copy */
- return ecode;
-/* List must not be locked if coalesce_ok is set.
- *
- * Puts the region of len_with_header bytes at off onto a free list of
- * the context's current free table (tdb->ftable_off).  The bucket is
- * chosen by size_to_bucket() from the length minus
- * sizeof(struct tdb_used_record), and is locked with waitflag for the
- * duration of the insert.
- *
- * coalesce_ok lets enqueue_in_free() decide whether this bucket is due
- * for a coalescing pass; if it says so, coalesce_list() runs and releases
- * the bucket lock itself.  Otherwise the lock is released here.
- *
- * Returns the locking error, or the result of the enqueue / coalesce
- * pass. */
-enum TDB_ERROR add_free_record(struct tdb_context *tdb,
- tdb_off_t off, tdb_len_t len_with_header,
- enum tdb_lock_flags waitflag,
- bool coalesce_ok)
- tdb_off_t b_off;
- tdb_len_t len;
- enum TDB_ERROR ecode;
- assert(len_with_header >= sizeof(struct tdb_free_record));
- len = len_with_header - sizeof(struct tdb_used_record);
- b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
- ecode = tdb_lock_free_bucket(tdb, b_off, waitflag);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- ecode = enqueue_in_free(tdb, b_off, off, len, &coalesce_ok);
- check_list(tdb, b_off);
- /* Coalescing unlocks free list. */
- if (!ecode && coalesce_ok)
- ecode = coalesce_list(tdb, tdb->ftable_off, b_off, 2);
- else
- tdb_unlock_free_bucket(tdb, b_off);
- return ecode;
-static size_t adjust_size(size_t keylen, size_t datalen)
- size_t size = keylen + datalen;
- if (size < TDB_MIN_DATA_LEN)
- size = TDB_MIN_DATA_LEN;
- /* Round to next uint64_t boundary. */
- return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
-/* If we have enough left over to be useful, split that off. */
-static size_t record_leftover(size_t keylen, size_t datalen,
- bool want_extra, size_t total_len)
- ssize_t leftover;
- if (want_extra)
- datalen += datalen / 2;
- leftover = total_len - adjust_size(keylen, datalen);
- if (leftover < (ssize_t)sizeof(struct tdb_free_record))
- return 0;
- return leftover;
-/* We need size bytes to put our key and data in. */
-static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
- tdb_off_t ftable_off,
- tdb_off_t bucket,
- size_t keylen, size_t datalen,
- bool want_extra,
- unsigned magic,
- unsigned hashlow)
- tdb_off_t off, b_off,best_off;
- struct tdb_free_record best = { 0 };
- double multiplier;
- size_t size = adjust_size(keylen, datalen);
- enum TDB_ERROR ecode;
- tdb->stats.allocs++;
- b_off = bucket_off(ftable_off, bucket);
- /* FIXME: Try non-blocking wait first, to measure contention. */
- /* Lock this bucket. */
- ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT);
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- best.ftable_and_len = -1ULL;
- best_off = 0;
- /* Get slack if we're after extra. */
- if (want_extra)
- multiplier = 1.5;
- else
- multiplier = 1.0;
- /* Walk the list to see if any are large enough, getting less fussy
- * as we go. */
- off = tdb_read_off(tdb, b_off);
- if (TDB_OFF_IS_ERR(off)) {
- ecode = TDB_OFF_TO_ERR(off);
- goto unlock_err;
- }
- off &= TDB_OFF_MASK;
- while (off) {
- const struct tdb_free_record *r;
- tdb_len_t len;
- tdb_off_t next;
- r = tdb_access_read(tdb, off, sizeof(*r), true);
- if (TDB_PTR_IS_ERR(r)) {
- ecode = TDB_PTR_ERR(r);
- goto unlock_err;
- }
- if (frec_magic(r) != TDB_FREE_MAGIC) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
- "lock_and_alloc:"
- " %llu non-free 0x%llx",
- (long long)off,
- (long long)r->magic_and_prev);
- tdb_access_release(tdb, r);
- goto unlock_err;
- }
- if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
- best_off = off;
- best = *r;
- }
- if (frec_len(&best) <= size * multiplier && best_off) {
- tdb_access_release(tdb, r);
- break;
- }
- multiplier *= 1.01;
- next = r->next;
- len = frec_len(r);
- tdb_access_release(tdb, r);
- off = next;
- }
- /* If we found anything at all, use it. */
- if (best_off) {
- struct tdb_used_record rec;
- size_t leftover;
- /* We're happy with this size: take it. */
- ecode = remove_from_list(tdb, b_off, best_off, &best);
- check_list(tdb, b_off);
- if (ecode != TDB_SUCCESS) {
- goto unlock_err;
- }
- leftover = record_leftover(keylen, datalen, want_extra,
- frec_len(&best));
- assert(keylen + datalen + leftover <= frec_len(&best));
- /* We need to mark non-free before we drop lock, otherwise
- * coalesce() could try to merge it! */
- ecode = set_header(tdb, &rec, magic, keylen, datalen,
- frec_len(&best) - leftover, hashlow);
- if (ecode != TDB_SUCCESS) {
- goto unlock_err;
- }
- ecode = tdb_write_convert(tdb, best_off, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- goto unlock_err;
- }
- /* For futureproofing, we put a 0 in any unused space. */
- if (rec_extra_padding(&rec)) {
- ecode = tdb->io->twrite(tdb, best_off + sizeof(rec)
- + keylen + datalen, "", 1);
- if (ecode != TDB_SUCCESS) {
- goto unlock_err;
- }
- }
- /* Bucket of leftover will be <= current bucket, so nested
- * locking is allowed. */
- if (leftover) {
- tdb->stats.alloc_leftover++;
- ecode = add_free_record(tdb,
- best_off + sizeof(rec)
- + frec_len(&best) - leftover,
- leftover, TDB_LOCK_WAIT, false);
- if (ecode != TDB_SUCCESS) {
- best_off = TDB_ERR_TO_OFF(ecode);
- }
- }
- tdb_unlock_free_bucket(tdb, b_off);
- return best_off;
- }
- tdb_unlock_free_bucket(tdb, b_off);
- return 0;
- tdb_unlock_free_bucket(tdb, b_off);
- return TDB_ERR_TO_OFF(ecode);
-/* Get a free block from current free list, or 0 if none, -ve on error. */
-static tdb_off_t get_free(struct tdb_context *tdb,
- size_t keylen, size_t datalen, bool want_extra,
- unsigned magic, unsigned hashlow)
- tdb_off_t off, ftable_off;
- tdb_off_t start_b, b, ftable;
- bool wrapped = false;
- /* If they are growing, add 50% to get to higher bucket. */
- if (want_extra)
- start_b = size_to_bucket(adjust_size(keylen,
- datalen + datalen / 2));
- else
- start_b = size_to_bucket(adjust_size(keylen, datalen));
- ftable_off = tdb->ftable_off;
- ftable = tdb->ftable;
- while (!wrapped || ftable_off != tdb->ftable_off) {
- /* Start at exact size bucket, and search up... */
- for (b = find_free_head(tdb, ftable_off, start_b);
- b = find_free_head(tdb, ftable_off, b + 1)) {
- /* Try getting one from list. */
- off = lock_and_alloc(tdb, ftable_off,
- b, keylen, datalen, want_extra,
- magic, hashlow);
- if (TDB_OFF_IS_ERR(off))
- return off;
- if (off != 0) {
- if (b == start_b)
- tdb->stats.alloc_bucket_exact++;
- if (b == TDB_FREE_BUCKETS - 1)
- tdb->stats.alloc_bucket_max++;
- /* Worked? Stay using this list. */
- tdb->ftable_off = ftable_off;
- tdb->ftable = ftable;
- return off;
- }
- /* Didn't work. Try next bucket. */
- }
- if (TDB_OFF_IS_ERR(b)) {
- return b;
- }
- /* Hmm, try next table. */
- ftable_off = next_ftable(tdb, ftable_off);
- if (TDB_OFF_IS_ERR(ftable_off)) {
- return ftable_off;
- }
- ftable++;
- if (ftable_off == 0) {
- wrapped = true;
- ftable_off = first_ftable(tdb);
- if (TDB_OFF_IS_ERR(ftable_off)) {
- return ftable_off;
- }
- ftable = 0;
- }
- }
- return 0;
-enum TDB_ERROR set_header(struct tdb_context *tdb,
- struct tdb_used_record *rec,
- unsigned magic, uint64_t keylen, uint64_t datalen,
- uint64_t actuallen, unsigned hashlow)
- uint64_t keybits = (fls64(keylen) + 1) / 2;
- /* Use bottom bits of hash, so it's independent of hash table size. */
- rec->magic_and_meta = (hashlow & ((1 << 11)-1))
- | ((actuallen - (keylen + datalen)) << 11)
- | (keybits << 43)
- | ((uint64_t)magic << 48);
- rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
- /* Encoding can fail on big values. */
- if (rec_key_length(rec) != keylen
- || rec_data_length(rec) != datalen
- || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "Could not encode k=%llu,d=%llu,a=%llu",
- (long long)keylen, (long long)datalen,
- (long long)actuallen);
- }
- return TDB_SUCCESS;
-/* You need 'size', this tells you how much you should expand by. */
-tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size)
- tdb_off_t new_size, top_size;
- /* limit size in order to avoid using up huge amounts of memory for
- * in memory tdbs if an oddball huge record creeps in */
- if (size > 100 * 1024) {
- top_size = map_size + size * 2;
- } else {
- top_size = map_size + size * 100;
- }
- /* always make room for at least top_size more records, and at
- least 25% more space. if the DB is smaller than 100MiB,
- otherwise grow it by 10% only. */
- if (map_size > 100 * 1024 * 1024) {
- new_size = map_size * 1.10;
- } else {
- new_size = map_size * 1.25;
- }
- /* Round the database up to a multiple of the page size */
- if (new_size < top_size)
- new_size = top_size;
- return new_size - map_size;
-/* Expand the database. */
-static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
- uint64_t old_size;
- tdb_len_t wanted;
- enum TDB_ERROR ecode;
- /* Need to hold a hash lock to expand DB: transactions rely on it. */
- if (!(tdb->flags & TDB_NOLOCK)
- && !tdb->file->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_expand: must hold lock during expand");
- }
- /* Only one person can expand file at a time. */
- ecode = tdb_lock_expand(tdb, F_WRLCK);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* Someone else may have expanded the file, so retry. */
- old_size = tdb->file->map_size;
- tdb->io->oob(tdb, tdb->file->map_size, 1, true);
- if (tdb->file->map_size != old_size) {
- tdb_unlock_expand(tdb, F_WRLCK);
- return TDB_SUCCESS;
- }
- /* Overallocate. */
- wanted = tdb_expand_adjust(old_size, size);
- /* We need room for the record header too. */
- wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted);
- ecode = tdb->io->expand_file(tdb, wanted);
- if (ecode != TDB_SUCCESS) {
- tdb_unlock_expand(tdb, F_WRLCK);
- return ecode;
- }
- /* We need to drop this lock before adding free record. */
- tdb_unlock_expand(tdb, F_WRLCK);
- tdb->stats.expands++;
- return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT, true);
-/* This won't fail: it will expand the database if it has to. */
-tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
- uint64_t hash, unsigned magic, bool growing)
- tdb_off_t off;
- /* We can't hold pointers during this: we could unmap! */
- assert(!tdb->direct_access);
- for (;;) {
- enum TDB_ERROR ecode;
- off = get_free(tdb, keylen, datalen, growing, magic, hash);
- if (likely(off != 0))
- break;
- ecode = tdb_expand(tdb, adjust_size(keylen, datalen));
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- }
- return off;
diff --git a/lib/tdb2/hash.c b/lib/tdb2/hash.c
deleted file mode 100644
index 067884a74e..0000000000
--- a/lib/tdb2/hash.c
+++ /dev/null
@@ -1,894 +0,0 @@
- /*
- Trivial Database 2: hash handling
- Copyright (C) Rusty Russell 2010
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include "private.h"
-#include <ccan/hash/hash.h>
-#include <assert.h>
-/* Default hash function. */
-uint64_t tdb_jenkins_hash(const void *key, size_t length, uint64_t seed,
- void *unused)
- uint64_t ret;
- /* hash64_stable assumes lower bits are more important; they are a
- * slightly better hash. We use the upper bits first, so swap them. */
- ret = hash64_stable((const unsigned char *)key, length, seed);
- return (ret >> 32) | (ret << 32);
-uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
- return tdb->hash_fn(ptr, len, tdb->hash_seed, tdb->hash_data);
-uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
- const struct tdb_used_record *r;
- const void *key;
- uint64_t klen, hash;
- r = tdb_access_read(tdb, off, sizeof(*r), true);
- if (TDB_PTR_IS_ERR(r)) {
- /* FIXME */
- return 0;
- }
- klen = rec_key_length(r);
- tdb_access_release(tdb, r);
- key = tdb_access_read(tdb, off + sizeof(*r), klen, false);
- if (TDB_PTR_IS_ERR(key)) {
- return 0;
- }
- hash = tdb_hash(tdb, key, klen);
- tdb_access_release(tdb, key);
- return hash;
-/* Get bits from a value. */
-static uint32_t bits_from(uint64_t val, unsigned start, unsigned num)
- assert(num <= 32);
- return (val >> start) & ((1U << num) - 1);
-/* We take bits from the top: that way we can lock whole sections of the hash
- * by using lock ranges. */
-static uint32_t use_bits(struct hash_info *h, unsigned num)
- h->hash_used += num;
- return bits_from(h->h, 64 - h->hash_used, num);
-static tdb_bool_err key_matches(struct tdb_context *tdb,
- const struct tdb_used_record *rec,
- tdb_off_t off,
- const struct tdb_data *key)
- tdb_bool_err ret = false;
- const char *rkey;
- if (rec_key_length(rec) != key->dsize) {
- tdb->stats.compare_wrong_keylen++;
- return ret;
- }
- rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
- if (TDB_PTR_IS_ERR(rkey)) {
- return (tdb_bool_err)TDB_PTR_ERR(rkey);
- }
- if (memcmp(rkey, key->dptr, key->dsize) == 0)
- ret = true;
- else
- tdb->stats.compare_wrong_keycmp++;
- tdb_access_release(tdb, rkey);
- return ret;
-/* Does entry match? */
-static tdb_bool_err match(struct tdb_context *tdb,
- struct hash_info *h,
- const struct tdb_data *key,
- tdb_off_t val,
- struct tdb_used_record *rec)
- tdb_off_t off;
- enum TDB_ERROR ecode;
- tdb->stats.compares++;
- /* Desired bucket must match. */
- if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
- tdb->stats.compare_wrong_bucket++;
- return false;
- }
- /* Top bits of offset == next bits of hash. */
- != bits_from(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
- tdb->stats.compare_wrong_offsetbits++;
- return false;
- }
- off = val & TDB_OFF_MASK;
- ecode = tdb_read_convert(tdb, off, rec, sizeof(*rec));
- if (ecode != TDB_SUCCESS) {
- return (tdb_bool_err)ecode;
- }
- if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
- tdb->stats.compare_wrong_rechash++;
- return false;
- }
- return key_matches(tdb, rec, off, key);
-static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
- return group_start
- + (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
-bool is_subhash(tdb_off_t val)
- return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
-/* FIXME: Guess the depth, don't over-lock! */
-static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
- return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
-static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
- struct tdb_data key,
- tdb_off_t chain,
- struct hash_info *h,
- struct tdb_used_record *rec,
- struct traverse_info *tinfo)
- tdb_off_t off, next;
- enum TDB_ERROR ecode;
- /* In case nothing is free, we set these to zero. */
- h->home_bucket = h->found_bucket = 0;
- for (off = chain; off; off = next) {
- unsigned int i;
- h->group_start = off;
- ecode = tdb_read_convert(tdb, off, h->group, sizeof(h->group));
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
- tdb_off_t recoff;
- if (!h->group[i]) {
- /* Remember this empty bucket. */
- h->home_bucket = h->found_bucket = i;
- continue;
- }
- /* We can insert extra bits via add_to_hash
- * empty bucket logic. */
- recoff = h->group[i] & TDB_OFF_MASK;
- ecode = tdb_read_convert(tdb, recoff, rec,
- sizeof(*rec));
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- ecode = TDB_OFF_TO_ERR(key_matches(tdb, rec, recoff,
- &key));
- if (ecode < 0) {
- return TDB_ERR_TO_OFF(ecode);
- }
- if (ecode == (enum TDB_ERROR)1) {
- h->home_bucket = h->found_bucket = i;
- if (tinfo) {
- tinfo->levels[tinfo->num_levels]
- .hashtable = off;
- tinfo->levels[tinfo->num_levels]
- .total_buckets
- tinfo->levels[tinfo->num_levels].entry
- = i;
- tinfo->num_levels++;
- }
- return recoff;
- }
- }
- next = tdb_read_off(tdb, off
- + offsetof(struct tdb_chain, next));
- if (TDB_OFF_IS_ERR(next)) {
- return next;
- }
- if (next)
- next += sizeof(struct tdb_used_record);
- }
- return 0;
-/* This is the core routine which searches the hashtable for an entry.
- * On error, no locks are held and -ve is returned.
- * Otherwise, hinfo is filled in (and the optional tinfo).
- * If not found, the return value is 0.
- * If found, the return value is the offset, and *rec is the record. */
-tdb_off_t find_and_lock(struct tdb_context *tdb,
- struct tdb_data key,
- int ltype,
- struct hash_info *h,
- struct tdb_used_record *rec,
- struct traverse_info *tinfo)
- uint32_t i, group;
- tdb_off_t hashtable;
- enum TDB_ERROR ecode;
- h->h = tdb_hash(tdb, key.dptr, key.dsize);
- h->hash_used = 0;
- h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
- h->hlock_start = hlock_range(group, &h->hlock_range);
- ecode = tdb_lock_hashes(tdb, h->hlock_start, h->hlock_range, ltype,
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- hashtable = offsetof(struct tdb_header, hashtable);
- if (tinfo) {
- tinfo->toplevel_group = group;
- tinfo->num_levels = 1;
- tinfo->levels[0].entry = 0;
- tinfo->levels[0].hashtable = hashtable
- + (group << TDB_HASH_GROUP_BITS) * sizeof(tdb_off_t);
- tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
- }
- while (h->hash_used <= 64) {
- /* Read in the hash group. */
- h->group_start = hashtable
- + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
- ecode = tdb_read_convert(tdb, h->group_start, &h->group,
- sizeof(h->group));
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- /* Pointer to another hash table? Go down... */
- if (is_subhash(h->group[h->home_bucket])) {
- hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
- + sizeof(struct tdb_used_record);
- if (tinfo) {
- /* When we come back, use *next* bucket */
- tinfo->levels[tinfo->num_levels-1].entry
- += h->home_bucket + 1;
- }
- group = use_bits(h, TDB_SUBLEVEL_HASH_BITS
- h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
- if (tinfo) {
- tinfo->levels[tinfo->num_levels].hashtable
- = hashtable;
- tinfo->levels[tinfo->num_levels].total_buckets
- tinfo->levels[tinfo->num_levels].entry
- = group << TDB_HASH_GROUP_BITS;
- tinfo->num_levels++;
- }
- continue;
- }
- /* It's in this group: search (until 0 or all searched) */
- for (i = 0, h->found_bucket = h->home_bucket;
- i < (1 << TDB_HASH_GROUP_BITS);
- i++, h->found_bucket = ((h->found_bucket+1)
- % (1 << TDB_HASH_GROUP_BITS))) {
- tdb_bool_err berr;
- if (is_subhash(h->group[h->found_bucket]))
- continue;
- if (!h->group[h->found_bucket])
- break;
- berr = match(tdb, h, &key, h->group[h->found_bucket],
- rec);
- if (berr < 0) {
- ecode = TDB_OFF_TO_ERR(berr);
- goto fail;
- }
- if (berr) {
- if (tinfo) {
- tinfo->levels[tinfo->num_levels-1].entry
- += h->found_bucket;
- }
- return h->group[h->found_bucket] & TDB_OFF_MASK;
- }
- }
- /* Didn't find it: h indicates where it would go. */
- return 0;
- }
- return find_in_chain(tdb, key, hashtable, h, rec, tinfo);
- tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
- return TDB_ERR_TO_OFF(ecode);
-/* I wrote a simple test, expanding a hash to 2GB, for the following
- * cases:
- * 1) Expanding all the buckets at once,
- * 2) Expanding the bucket we wanted to place the new entry into.
- * 3) Expanding the most-populated bucket,
- *
- * I measured the worst/average/best density during this process.
- * 1) 3%/16%/30%
- * 2) 4%/20%/38%
- * 3) 6%/22%/41%
- *
- * So we figure out the busiest bucket for the moment.
- */
-static unsigned fullest_bucket(struct tdb_context *tdb,
- const tdb_off_t *group,
- unsigned new_bucket)
- unsigned counts[1 << TDB_HASH_GROUP_BITS] = { 0 };
- unsigned int i, best_bucket;
- /* Count the new entry. */
- counts[new_bucket]++;
- best_bucket = new_bucket;
- for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
- unsigned this_bucket;
- if (is_subhash(group[i]))
- continue;
- this_bucket = group[i] & TDB_OFF_HASH_GROUP_MASK;
- if (++counts[this_bucket] > counts[best_bucket])
- best_bucket = this_bucket;
- }
- return best_bucket;
-static bool put_into_group(tdb_off_t *group,
- unsigned bucket, tdb_off_t encoded)
- unsigned int i;
- for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
- unsigned b = (bucket + i) % (1 << TDB_HASH_GROUP_BITS);
- if (group[b] == 0) {
- group[b] = encoded;
- return true;
- }
- }
- return false;
-static void force_into_group(tdb_off_t *group,
- unsigned bucket, tdb_off_t encoded)
- if (!put_into_group(group, bucket, encoded))
- abort();
-static tdb_off_t encode_offset(tdb_off_t new_off, struct hash_info *h)
- return h->home_bucket
- | new_off
- | ((uint64_t)bits_from(h->h,
- 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
-/* Simply overwrite the hash entry we found before. */
-enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
- struct hash_info *h,
- tdb_off_t new_off)
- return tdb_write_off(tdb, hbucket_off(h->group_start, h->found_bucket),
- encode_offset(new_off, h));
-/* We slot in anywhere that's empty in the chain. */
-static enum TDB_ERROR COLD add_to_chain(struct tdb_context *tdb,
- tdb_off_t subhash,
- tdb_off_t new_off)
- tdb_off_t entry;
- enum TDB_ERROR ecode;
- entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
- if (TDB_OFF_IS_ERR(entry)) {
- return TDB_OFF_TO_ERR(entry);
- }
- if (entry == 1 << TDB_HASH_GROUP_BITS) {
- tdb_off_t next;
- next = tdb_read_off(tdb, subhash
- + offsetof(struct tdb_chain, next));
- if (TDB_OFF_IS_ERR(next)) {
- return TDB_OFF_TO_ERR(next);
- }
- if (!next) {
- next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
- TDB_CHAIN_MAGIC, false);
- if (TDB_OFF_IS_ERR(next))
- return TDB_OFF_TO_ERR(next);
- ecode = zero_out(tdb,
- next+sizeof(struct tdb_used_record),
- sizeof(struct tdb_chain));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- ecode = tdb_write_off(tdb, subhash
- + offsetof(struct tdb_chain,
- next),
- next);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- }
- return add_to_chain(tdb, next, new_off);
- }
- return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
- new_off);
-/* Add into a newly created subhash. */
-static enum TDB_ERROR add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
- unsigned hash_used, tdb_off_t val)
- tdb_off_t off = (val & TDB_OFF_MASK), *group;
- struct hash_info h;
- unsigned int gnum;
- h.hash_used = hash_used;
- if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
- return add_to_chain(tdb, subhash, off);
- h.h = hash_record(tdb, off);
- h.group_start = subhash
- + gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
- h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
- group = tdb_access_write(tdb, h.group_start,
- sizeof(*group) << TDB_HASH_GROUP_BITS, true);
- if (TDB_PTR_IS_ERR(group)) {
- return TDB_PTR_ERR(group);
- }
- force_into_group(group, h.home_bucket, encode_offset(off, &h));
- return tdb_access_commit(tdb, group);
-static enum TDB_ERROR expand_group(struct tdb_context *tdb, struct hash_info *h)
- unsigned bucket, num_vals, i, magic;
- size_t subsize;
- tdb_off_t subhash;
- tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
- enum TDB_ERROR ecode;
- /* Attach new empty subhash under fullest bucket. */
- bucket = fullest_bucket(tdb, h->group, h->home_bucket);
- if (h->hash_used == 64) {
- tdb->stats.alloc_chain++;
- subsize = sizeof(struct tdb_chain);
- magic = TDB_CHAIN_MAGIC;
- } else {
- tdb->stats.alloc_subhash++;
- subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
- }
- subhash = alloc(tdb, 0, subsize, 0, magic, false);
- if (TDB_OFF_IS_ERR(subhash)) {
- return TDB_OFF_TO_ERR(subhash);
- }
- ecode = zero_out(tdb, subhash + sizeof(struct tdb_used_record),
- subsize);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* Remove any which are destined for bucket or are in wrong place. */
- num_vals = 0;
- for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
- unsigned home_bucket = h->group[i] & TDB_OFF_HASH_GROUP_MASK;
- if (!h->group[i] || is_subhash(h->group[i]))
- continue;
- if (home_bucket == bucket || home_bucket != i) {
- vals[num_vals++] = h->group[i];
- h->group[i] = 0;
- }
- }
- /* FIXME: This assert is valid, but we do this during unit test :( */
- /* assert(num_vals); */
- /* Overwrite expanded bucket with subhash pointer. */
- h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
- /* Point to actual contents of record. */
- subhash += sizeof(struct tdb_used_record);
- /* Put values back. */
- for (i = 0; i < num_vals; i++) {
- unsigned this_bucket = vals[i] & TDB_OFF_HASH_GROUP_MASK;
- if (this_bucket == bucket) {
- ecode = add_to_subhash(tdb, subhash, h->hash_used,
- vals[i]);
- if (ecode != TDB_SUCCESS)
- return ecode;
- } else {
- /* There should be room to put this back. */
- force_into_group(h->group, this_bucket, vals[i]);
- }
- }
- return TDB_SUCCESS;
-enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
- unsigned int i, num_movers = 0;
- tdb_off_t movers[1 << TDB_HASH_GROUP_BITS];
- h->group[h->found_bucket] = 0;
- for (i = 1; i < (1 << TDB_HASH_GROUP_BITS); i++) {
- unsigned this_bucket;
- this_bucket = (h->found_bucket+i) % (1 << TDB_HASH_GROUP_BITS);
- /* Empty bucket? We're done. */
- if (!h->group[this_bucket])
- break;
- /* Ignore subhashes. */
- if (is_subhash(h->group[this_bucket]))
- continue;
- /* If this one is not happy where it is, we'll move it. */
- if ((h->group[this_bucket] & TDB_OFF_HASH_GROUP_MASK)
- != this_bucket) {
- movers[num_movers++] = h->group[this_bucket];
- h->group[this_bucket] = 0;
- }
- }
- /* Put back the ones we erased. */
- for (i = 0; i < num_movers; i++) {
- force_into_group(h->group, movers[i] & TDB_OFF_HASH_GROUP_MASK,
- movers[i]);
- }
- /* Now we write back the hash group */
- return tdb_write_convert(tdb, h->group_start,
- h->group, sizeof(h->group));
-enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
- tdb_off_t new_off)
- enum TDB_ERROR ecode;
- /* We hit an empty bucket during search? That's where it goes. */
- if (!h->group[h->found_bucket]) {
- h->group[h->found_bucket] = encode_offset(new_off, h);
- /* Write back the modified group. */
- return tdb_write_convert(tdb, h->group_start,
- h->group, sizeof(h->group));
- }
- if (h->hash_used > 64)
- return add_to_chain(tdb, h->group_start, new_off);
- /* We're full. Expand. */
- ecode = expand_group(tdb, h);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (is_subhash(h->group[h->home_bucket])) {
- /* We were expanded! */
- tdb_off_t hashtable;
- unsigned int gnum;
- /* Write back the modified group. */
- ecode = tdb_write_convert(tdb, h->group_start, h->group,
- sizeof(h->group));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* Move hashinfo down a level. */
- hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
- + sizeof(struct tdb_used_record);
- h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
- h->group_start = hashtable
- + gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
- ecode = tdb_read_convert(tdb, h->group_start, &h->group,
- sizeof(h->group));
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- }
- /* Expanding the group must have made room if it didn't choose this
- * bucket. */
- if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){
- return tdb_write_convert(tdb, h->group_start,
- h->group, sizeof(h->group));
- }
- /* This can happen if all hashes in group (and us) dropped into same
- * group in subhash. */
- return add_to_hash(tdb, h, new_off);
-/* Traverse support: returns offset of record, or 0 or -ve error. */
-static tdb_off_t iterate_hash(struct tdb_context *tdb,
- struct traverse_info *tinfo)
- tdb_off_t off, val, i;
- struct traverse_level *tlevel;
- tlevel = &tinfo->levels[tinfo->num_levels-1];
- for (i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
- tlevel->entry, tlevel->total_buckets);
- i != tlevel->total_buckets;
- i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
- i+1, tlevel->total_buckets)) {
- if (TDB_OFF_IS_ERR(i)) {
- return i;
- }
- val = tdb_read_off(tdb, tlevel->hashtable+sizeof(tdb_off_t)*i);
- if (TDB_OFF_IS_ERR(val)) {
- return val;
- }
- off = val & TDB_OFF_MASK;
- /* This makes the delete-all-in-traverse case work
- * (and simplifies our logic a little). */
- if (off == tinfo->prev)
- continue;
- tlevel->entry = i;
- if (!is_subhash(val)) {
- /* Found one. */
- tinfo->prev = off;
- return off;
- }
- /* When we come back, we want the next one */
- tlevel->entry++;
- tinfo->num_levels++;
- tlevel++;
- tlevel->hashtable = off + sizeof(struct tdb_used_record);
- tlevel->entry = 0;
- /* Next level is a chain? */
- if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
- tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
- else
- tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
- goto again;
- }
- /* Nothing there? */
- if (tinfo->num_levels == 1)
- return 0;
- /* Handle chained entries. */
- if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
- tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
- + offsetof(struct tdb_chain,
- next));
- if (TDB_OFF_IS_ERR(tlevel->hashtable)) {
- return tlevel->hashtable;
- }
- if (tlevel->hashtable) {
- tlevel->hashtable += sizeof(struct tdb_used_record);
- tlevel->entry = 0;
- goto again;
- }
- }
- /* Go back up and keep searching. */
- tinfo->num_levels--;
- tlevel--;
- goto again;
-/* Return success if we find something, TDB_ERR_NOEXIST if none. */
-enum TDB_ERROR next_in_hash(struct tdb_context *tdb,
- struct traverse_info *tinfo,
- TDB_DATA *kbuf, size_t *dlen)
- const unsigned group_bits = TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS;
- tdb_off_t hl_start, hl_range, off;
- enum TDB_ERROR ecode;
- while (tinfo->toplevel_group < (1 << group_bits)) {
- hl_start = (tdb_off_t)tinfo->toplevel_group
- << (64 - group_bits);
- hl_range = 1ULL << group_bits;
- ecode = tdb_lock_hashes(tdb, hl_start, hl_range, F_RDLCK,
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- off = iterate_hash(tdb, tinfo);
- if (off) {
- struct tdb_used_record rec;
- if (TDB_OFF_IS_ERR(off)) {
- ecode = TDB_OFF_TO_ERR(off);
- goto fail;
- }
- ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- if (rec_magic(&rec) != TDB_USED_MAGIC) {
- ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
- "next_in_hash:"
- " corrupt record at %llu",
- (long long)off);
- goto fail;
- }
- kbuf->dsize = rec_key_length(&rec);
- /* They want data as well? */
- if (dlen) {
- *dlen = rec_data_length(&rec);
- kbuf->dptr = tdb_alloc_read(tdb,
- off + sizeof(rec),
- kbuf->dsize
- + *dlen);
- } else {
- kbuf->dptr = tdb_alloc_read(tdb,
- off + sizeof(rec),
- kbuf->dsize);
- }
- tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
- if (TDB_PTR_IS_ERR(kbuf->dptr)) {
- return TDB_PTR_ERR(kbuf->dptr);
- }
- return TDB_SUCCESS;
- }
- tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
- tinfo->toplevel_group++;
- tinfo->levels[0].hashtable
- += (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
- tinfo->levels[0].entry = 0;
- }
- tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
- return ecode;
-enum TDB_ERROR first_in_hash(struct tdb_context *tdb,
- struct traverse_info *tinfo,
- TDB_DATA *kbuf, size_t *dlen)
- tinfo->prev = 0;
- tinfo->toplevel_group = 0;
- tinfo->num_levels = 1;
- tinfo->levels[0].hashtable = offsetof(struct tdb_header, hashtable);
- tinfo->levels[0].entry = 0;
- tinfo->levels[0].total_buckets = (1 << TDB_HASH_GROUP_BITS);
- return next_in_hash(tdb, tinfo, kbuf, dlen);
-/* Even if the entry isn't in this hash bucket, you'd have to lock this
- * bucket to find it. */
-static enum TDB_ERROR chainlock(struct tdb_context *tdb, const TDB_DATA *key,
- int ltype, enum tdb_lock_flags waitflag,
- const char *func)
- enum TDB_ERROR ecode;
- uint64_t h = tdb_hash(tdb, key->dptr, key->dsize);
- tdb_off_t lockstart, locksize;
- unsigned int group, gbits;
- group = bits_from(h, 64 - gbits, gbits);
- lockstart = hlock_range(group, &locksize);
- ecode = tdb_lock_hashes(tdb, lockstart, locksize, ltype, waitflag);
- tdb_trace_1rec(tdb, func, *key);
- return ecode;
-/* lock/unlock one hash chain. This is meant to be used to reduce
- contention - it cannot guarantee how many records will be locked */
-_PUBLIC_ enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
- return tdb->last_error = chainlock(tdb, &key, F_WRLCK, TDB_LOCK_WAIT,
- "tdb_chainlock");
-_PUBLIC_ void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
- uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
- tdb_off_t lockstart, locksize;
- unsigned int group, gbits;
- group = bits_from(h, 64 - gbits, gbits);
- lockstart = hlock_range(group, &locksize);
- tdb_trace_1rec(tdb, "tdb_chainunlock", key);
- tdb_unlock_hashes(tdb, lockstart, locksize, F_WRLCK);
-_PUBLIC_ enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
- return tdb->last_error = chainlock(tdb, &key, F_RDLCK, TDB_LOCK_WAIT,
- "tdb_chainlock_read");
-_PUBLIC_ void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
- uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
- tdb_off_t lockstart, locksize;
- unsigned int group, gbits;
- group = bits_from(h, 64 - gbits, gbits);
- lockstart = hlock_range(group, &locksize);
- tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
- tdb_unlock_hashes(tdb, lockstart, locksize, F_RDLCK);
diff --git a/lib/tdb2/io.c b/lib/tdb2/io.c
deleted file mode 100644
index ca044ae361..0000000000
--- a/lib/tdb2/io.c
+++ /dev/null
@@ -1,650 +0,0 @@
- /*
- Unix SMB/CIFS implementation.
- trivial database library
- Copyright (C) Andrew Tridgell 1999-2005
- Copyright (C) Paul `Rusty' Russell 2000
- Copyright (C) Jeremy Allison 2000-2003
- Copyright (C) Rusty Russell 2010
- ** NOTE! The following LGPL license applies to the tdb
- ** library. This does NOT imply that all of Samba is released
- ** under the LGPL
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include "private.h"
-#include <assert.h>
-#include <ccan/likely/likely.h>
-void tdb_munmap(struct tdb_file *file)
- if (file->fd == -1)
- return;
- if (file->map_ptr) {
- munmap(file->map_ptr, file->map_size);
- file->map_ptr = NULL;
- }
-enum TDB_ERROR tdb_mmap(struct tdb_context *tdb)
- int mmap_flags;
- if (tdb->flags & TDB_INTERNAL)
- return TDB_SUCCESS;
- if (tdb->flags & TDB_NOMMAP)
- return TDB_SUCCESS;
- if ((tdb->open_flags & O_ACCMODE) == O_RDONLY)
- mmap_flags = PROT_READ;
- else
- mmap_flags = PROT_READ | PROT_WRITE;
- /* size_t can be smaller than off_t. */
- if ((size_t)tdb->file->map_size == tdb->file->map_size) {
- tdb->file->map_ptr = mmap(NULL, tdb->file->map_size,
- mmap_flags,
- MAP_SHARED, tdb->file->fd, 0);
- } else
- tdb->file->map_ptr = MAP_FAILED;
- /*
- * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
- */
- if (tdb->file->map_ptr == MAP_FAILED) {
- tdb->file->map_ptr = NULL;
- /* Incoherent mmap means everyone must mmap! */
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_mmap failed for size %lld (%s)",
- (long long)tdb->file->map_size,
- strerror(errno));
- tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
- "tdb_mmap failed for size %lld (%s)",
- (long long)tdb->file->map_size, strerror(errno));
- }
- return TDB_SUCCESS;
-/* check for an out of bounds access - if it is out of bounds then
- see if the database has been expanded by someone else and expand
- if necessary
- note that "len" is the minimum length needed for the db.
- If probe is true, len being too large isn't a failure.
-static enum TDB_ERROR tdb_oob(struct tdb_context *tdb,
- tdb_off_t off, tdb_len_t len, bool probe)
- struct stat st;
- enum TDB_ERROR ecode;
- /* We can't hold pointers during this: we could unmap! */
- assert(!tdb->direct_access
- || (tdb->flags & TDB_NOLOCK)
- || tdb_has_expansion_lock(tdb));
- if (len + off < len) {
- if (probe)
- return TDB_SUCCESS;
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_oob off %llu len %llu wrap\n",
- (long long)off, (long long)len);
- }
- if (len + off <= tdb->file->map_size)
- return TDB_SUCCESS;
- if (tdb->flags & TDB_INTERNAL) {
- if (probe)
- return TDB_SUCCESS;
- tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_oob len %lld beyond internal"
- " malloc size %lld",
- (long long)(off + len),
- (long long)tdb->file->map_size);
- return TDB_ERR_IO;
- }
- ecode = tdb_lock_expand(tdb, F_RDLCK);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (fstat(tdb->file->fd, &st) != 0) {
- tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "Failed to fstat file: %s", strerror(errno));
- tdb_unlock_expand(tdb, F_RDLCK);
- return TDB_ERR_IO;
- }
- tdb_unlock_expand(tdb, F_RDLCK);
- if (st.st_size < off + len) {
- if (probe)
- return TDB_SUCCESS;
- tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_oob len %llu beyond eof at %llu",
- (long long)(off + len), (long long)st.st_size);
- return TDB_ERR_IO;
- }
- /* Unmap, update size, remap */
- tdb_munmap(tdb->file);
- tdb->file->map_size = st.st_size;
- return tdb_mmap(tdb);
-/* Endian conversion: we only ever deal with 8 byte quantities */
-void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
- assert(size % 8 == 0);
- if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
- uint64_t i, *p = (uint64_t *)buf;
- for (i = 0; i < size / 8; i++)
- p[i] = bswap_64(p[i]);
- }
- return buf;
-/* Return first non-zero offset in offset array, or end, or -ve error. */
-/* FIXME: Return the off? */
-uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
- tdb_off_t base, uint64_t start, uint64_t end)
- uint64_t i;
- const uint64_t *val;
- /* Zero vs non-zero is the same unconverted: minor optimization. */
- val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
- (end - start) * sizeof(tdb_off_t), false);
- if (TDB_PTR_IS_ERR(val)) {
- return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
- }
- for (i = 0; i < (end - start); i++) {
- if (val[i])
- break;
- }
- tdb_access_release(tdb, val);
- return start + i;
-/* Return first zero offset in num offset array, or num, or -ve error. */
-uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
- uint64_t num)
- uint64_t i;
- const uint64_t *val;
- /* Zero vs non-zero is the same unconverted: minor optimization. */
- val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
- if (TDB_PTR_IS_ERR(val)) {
- return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
- }
- for (i = 0; i < num; i++) {
- if (!val[i])
- break;
- }
- tdb_access_release(tdb, val);
- return i;
-enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
- char buf[8192] = { 0 };
- void *p = tdb->io->direct(tdb, off, len, true);
- enum TDB_ERROR ecode = TDB_SUCCESS;
- assert(!(tdb->flags & TDB_RDONLY));
- if (TDB_PTR_IS_ERR(p)) {
- return TDB_PTR_ERR(p);
- }
- if (p) {
- memset(p, 0, len);
- return ecode;
- }
- while (len) {
- unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
- ecode = tdb->io->twrite(tdb, off, buf, todo);
- if (ecode != TDB_SUCCESS) {
- break;
- }
- len -= todo;
- off += todo;
- }
- return ecode;
-tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
- tdb_off_t ret;
- enum TDB_ERROR ecode;
- if (likely(!(tdb->flags & TDB_CONVERT))) {
- tdb_off_t *p = tdb->io->direct(tdb, off, sizeof(*p), false);
- if (TDB_PTR_IS_ERR(p)) {
- return TDB_ERR_TO_OFF(TDB_PTR_ERR(p));
- }
- if (p)
- return *p;
- }
- ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret));
- if (ecode != TDB_SUCCESS) {
- return TDB_ERR_TO_OFF(ecode);
- }
- return ret;
-/* write a lump of data at a specified offset */
-static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off,
- const void *buf, tdb_len_t len)
- enum TDB_ERROR ecode;
- if (tdb->flags & TDB_RDONLY) {
- return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
- "Write to read-only database");
- }
- ecode = tdb->io->oob(tdb, off, len, false);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (tdb->file->map_ptr) {
- memcpy(off + (char *)tdb->file->map_ptr, buf, len);
- } else {
- return TDB_ERR_IO;
- ssize_t ret;
- ret = pwrite(tdb->file->fd, buf, len, off);
- if (ret != len) {
- /* This shouldn't happen: we avoid sparse files. */
- if (ret >= 0)
- errno = ENOSPC;
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_write: %zi at %zu len=%zu (%s)",
- ret, (size_t)off, (size_t)len,
- strerror(errno));
- }
- }
- return TDB_SUCCESS;
-/* read a lump of data at a specified offset */
-static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off,
- void *buf, tdb_len_t len)
- enum TDB_ERROR ecode;
- ecode = tdb->io->oob(tdb, off, len, false);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (tdb->file->map_ptr) {
- memcpy(buf, off + (char *)tdb->file->map_ptr, len);
- } else {
- return TDB_ERR_IO;
- ssize_t r = pread(tdb->file->fd, buf, len, off);
- if (r != len) {
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_read failed with %zi at %zu "
- "len=%zu (%s) map_size=%zu",
- r, (size_t)off, (size_t)len,
- strerror(errno),
- (size_t)tdb->file->map_size);
- }
- }
- return TDB_SUCCESS;
-enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
- const void *rec, size_t len)
- enum TDB_ERROR ecode;
- if (unlikely((tdb->flags & TDB_CONVERT))) {
- void *conv = malloc(len);
- if (!conv) {
- return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_write: no memory converting"
- " %zu bytes", len);
- }
- memcpy(conv, rec, len);
- ecode = tdb->io->twrite(tdb, off,
- tdb_convert(tdb, conv, len), len);
- free(conv);
- } else {
- ecode = tdb->io->twrite(tdb, off, rec, len);
- }
- return ecode;
-enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
- void *rec, size_t len)
- enum TDB_ERROR ecode = tdb->io->tread(tdb, off, rec, len);
- tdb_convert(tdb, rec, len);
- return ecode;
-enum TDB_ERROR tdb_write_off(struct tdb_context *tdb,
- tdb_off_t off, tdb_off_t val)
- if (tdb->flags & TDB_RDONLY) {
- return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
- "Write to read-only database");
- }
- if (likely(!(tdb->flags & TDB_CONVERT))) {
- tdb_off_t *p = tdb->io->direct(tdb, off, sizeof(*p), true);
- if (TDB_PTR_IS_ERR(p)) {
- return TDB_PTR_ERR(p);
- }
- if (p) {
- *p = val;
- return TDB_SUCCESS;
- }
- }
- return tdb_write_convert(tdb, off, &val, sizeof(val));
-static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
- tdb_len_t len, unsigned int prefix)
- unsigned char *buf;
- enum TDB_ERROR ecode;
- /* some systems don't like zero length malloc */
- buf = malloc(prefix + len ? prefix + len : 1);
- if (!buf) {
- tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR,
- "tdb_alloc_read malloc failed len=%zu",
- (size_t)(prefix + len));
- } else {
- ecode = tdb->io->tread(tdb, offset, buf+prefix, len);
- if (unlikely(ecode != TDB_SUCCESS)) {
- free(buf);
- return TDB_ERR_PTR(ecode);
- }
- }
- return buf;
-/* read a lump of data, allocating the space for it */
-void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
- return _tdb_alloc_read(tdb, offset, len, 0);
-static enum TDB_ERROR fill(struct tdb_context *tdb,
- const void *buf, size_t size,
- tdb_off_t off, tdb_len_t len)
- while (len) {
- size_t n = len > size ? size : len;
- ssize_t ret = pwrite(tdb->file->fd, buf, n, off);
- if (ret != n) {
- if (ret >= 0)
- errno = ENOSPC;
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "fill failed:"
- " %zi at %zu len=%zu (%s)",
- ret, (size_t)off, (size_t)len,
- strerror(errno));
- }
- len -= n;
- off += n;
- }
- return TDB_SUCCESS;
-/* expand a file. we prefer to use ftruncate, as that is what posix
- says to use for mmap expansion */
-static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb,
- tdb_len_t addition)
- char buf[8192];
- enum TDB_ERROR ecode;
- if (tdb->flags & TDB_RDONLY) {
- return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
- "Expand on read-only database");
- }
- if (tdb->flags & TDB_INTERNAL) {
- char *new = realloc(tdb->file->map_ptr,
- tdb->file->map_size + addition);
- if (!new) {
- return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "No memory to expand database");
- }
- tdb->file->map_ptr = new;
- tdb->file->map_size += addition;
- return TDB_SUCCESS;
- } else {
- /* Unmap before trying to write; old TDB claimed OpenBSD had
- * problem with this otherwise. */
- tdb_munmap(tdb->file);
- /* If this fails, we try to fill anyway. */
- if (ftruncate(tdb->file->fd, tdb->file->map_size + addition))
- ;
- /* now fill the file with something. This ensures that the
- file isn't sparse, which would be very bad if we ran out of
- disk. This must be done with write, not via mmap */
- memset(buf, 0x43, sizeof(buf));
- ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size,
- addition);
- if (ecode != TDB_SUCCESS)
- return ecode;
- tdb->file->map_size += addition;
- return tdb_mmap(tdb);
- }
-const void *tdb_access_read(struct tdb_context *tdb,
- tdb_off_t off, tdb_len_t len, bool convert)
- void *ret = NULL;
- if (likely(!(tdb->flags & TDB_CONVERT))) {
- ret = tdb->io->direct(tdb, off, len, false);
- if (TDB_PTR_IS_ERR(ret)) {
- return ret;
- }
- }
- if (!ret) {
- struct tdb_access_hdr *hdr;
- hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
- if (TDB_PTR_IS_ERR(hdr)) {
- return hdr;
- }
- hdr->next = tdb->access;
- tdb->access = hdr;
- ret = hdr + 1;
- if (convert) {
- tdb_convert(tdb, (void *)ret, len);
- }
- } else
- tdb->direct_access++;
- return ret;
-void *tdb_access_write(struct tdb_context *tdb,
- tdb_off_t off, tdb_len_t len, bool convert)
- void *ret = NULL;
- if (tdb->flags & TDB_RDONLY) {
- tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
- "Write to read-only database");
- }
- if (likely(!(tdb->flags & TDB_CONVERT))) {
- ret = tdb->io->direct(tdb, off, len, true);
- if (TDB_PTR_IS_ERR(ret)) {
- return ret;
- }
- }
- if (!ret) {
- struct tdb_access_hdr *hdr;
- hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
- if (TDB_PTR_IS_ERR(hdr)) {
- return hdr;
- }
- hdr->next = tdb->access;
- tdb->access = hdr;
- hdr->off = off;
- hdr->len = len;
- hdr->convert = convert;
- ret = hdr + 1;
- if (convert)
- tdb_convert(tdb, (void *)ret, len);
- } else
- tdb->direct_access++;
- return ret;
-static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
- struct tdb_access_hdr **hp;
- for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
- if (*hp + 1 == p)
- return hp;
- }
- return NULL;
-void tdb_access_release(struct tdb_context *tdb, const void *p)
- struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
- if (hp) {
- hdr = *hp;
- *hp = hdr->next;
- free(hdr);
- } else
- tdb->direct_access--;
-enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p)
- struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
- enum TDB_ERROR ecode;
- if (hp) {
- hdr = *hp;
- if (hdr->convert)
- ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len);
- else
- ecode = tdb_write(tdb, hdr->off, p, hdr->len);
- *hp = hdr->next;
- free(hdr);
- } else {
- tdb->direct_access--;
- ecode = TDB_SUCCESS;
- }
- return ecode;
-static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
- bool write_mode)
- enum TDB_ERROR ecode;
- if (unlikely(!tdb->file->map_ptr))
- return NULL;
- ecode = tdb_oob(tdb, off, len, false);
- if (unlikely(ecode != TDB_SUCCESS))
- return TDB_ERR_PTR(ecode);
- return (char *)tdb->file->map_ptr + off;
-void tdb_inc_seqnum(struct tdb_context *tdb)
- tdb_off_t seq;
- if (likely(!(tdb->flags & TDB_CONVERT))) {
- int64_t *direct;
- direct = tdb->io->direct(tdb,
- offsetof(struct tdb_header, seqnum),
- sizeof(*direct), true);
- if (likely(direct)) {
- /* Don't let it go negative, even briefly */
- if (unlikely((*direct) + 1) < 0)
- *direct = 0;
- (*direct)++;
- return;
- }
- }
- seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
- if (!TDB_OFF_IS_ERR(seq)) {
- seq++;
- if (unlikely((int64_t)seq < 0))
- seq = 0;
- tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq);
- }
-static const struct tdb_methods io_methods = {
- tdb_read,
- tdb_write,
- tdb_oob,
- tdb_expand_file,
- tdb_direct,
- initialise the default methods table
-void tdb_io_init(struct tdb_context *tdb)
- tdb->io = &io_methods;
diff --git a/lib/tdb2/lock.c b/lib/tdb2/lock.c
deleted file mode 100644
index b0583546fb..0000000000
--- a/lib/tdb2/lock.c
+++ /dev/null
@@ -1,883 +0,0 @@
- /*
- Unix SMB/CIFS implementation.
- trivial database library
- Copyright (C) Andrew Tridgell 1999-2005
- Copyright (C) Paul `Rusty' Russell 2000
- Copyright (C) Jeremy Allison 2000-2003
- ** NOTE! The following LGPL license applies to the tdb
- ** library. This does NOT imply that all of Samba is released
- ** under the LGPL
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include "private.h"
-#include <assert.h>
-#include <ccan/build_assert/build_assert.h>
-/* If we were threaded, we could wait for unlock, but we're not, so fail. */
-enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call)
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "%s: lock owned by another tdb in this process.",
- call);
-/* If we fork, we no longer really own locks. */
-bool check_lock_pid(struct tdb_context *tdb, const char *call, bool log)
- /* No locks? No problem! */
- if (tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0) {
- return true;
- }
- /* No fork? No problem! */
- if (tdb->file->locker == getpid()) {
- return true;
- }
- if (log) {
- tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "%s: fork() detected after lock acquisition!"
- " (%u vs %u)", call, tdb->file->locker, getpid());
- }
- return false;
-int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
- void *unused)
- struct flock fl;
- int ret;
- do {
- fl.l_type = rw;
- fl.l_whence = SEEK_SET;
- fl.l_start = off;
- fl.l_len = len;
- if (waitflag)
- ret = fcntl(fd, F_SETLKW, &fl);
- else
- ret = fcntl(fd, F_SETLK, &fl);
- } while (ret != 0 && errno == EINTR);
- return ret;
-int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
- struct flock fl;
- int ret;
- do {
- fl.l_type = F_UNLCK;
- fl.l_whence = SEEK_SET;
- fl.l_start = off;
- fl.l_len = len;
- ret = fcntl(fd, F_SETLKW, &fl);
- } while (ret != 0 && errno == EINTR);
- return ret;
-static int lock(struct tdb_context *tdb,
- int rw, off_t off, off_t len, bool waitflag)
- int ret;
- if (tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0) {
- tdb->file->locker = getpid();
- }
- tdb->stats.lock_lowlevel++;
- ret = tdb->lock_fn(tdb->file->fd, rw, off, len, waitflag,
- tdb->lock_data);
- if (!waitflag) {
- tdb->stats.lock_nonblock++;
- if (ret != 0)
- tdb->stats.lock_nonblock_fail++;
- }
- return ret;
-static int unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
-#if 0 /* Check they matched up locks and unlocks correctly. */
- char line[80];
- FILE *locks;
- bool found = false;
- locks = fopen("/proc/locks", "r");
- while (fgets(line, 80, locks)) {
- char *p;
- int type, start, l;
- /* eg. 1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */
- p = strchr(line, ':') + 1;
- if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY ")))
- continue;
- p += strlen(" FLOCK ADVISORY ");
- if (strncmp(p, "READ ", strlen("READ ")) == 0)
- type = F_RDLCK;
- else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
- type = F_WRLCK;
- else
- abort();
- p += 6;
- if (atoi(p) != getpid())
- continue;
- p = strchr(strchr(p, ' ') + 1, ' ') + 1;
- start = atoi(p);
- p = strchr(p, ' ') + 1;
- if (strncmp(p, "EOF", 3) == 0)
- l = 0;
- else
- l = atoi(p) - start + 1;
- if (off == start) {
- if (len != l) {
- fprintf(stderr, "Len %u should be %u: %s",
- (int)len, l, line);
- abort();
- }
- if (type != rw) {
- fprintf(stderr, "Type %s wrong: %s",
- rw == F_RDLCK ? "READ" : "WRITE", line);
- abort();
- }
- found = true;
- break;
- }
- }
- if (!found) {
- fprintf(stderr, "Unlock on %u@%u not found!",
- (int)off, (int)len);
- abort();
- }
- fclose(locks);
- return tdb->unlock_fn(tdb->file->fd, rw, off, len, tdb->lock_data);
-/* a byte range locking function - return 0 on success
- this functions locks len bytes at the specified offset.
- note that a len of zero means lock to end of file
-static enum TDB_ERROR tdb_brlock(struct tdb_context *tdb,
- int rw_type, tdb_off_t offset, tdb_off_t len,
- enum tdb_lock_flags flags)
- int ret;
- if (tdb->flags & TDB_NOLOCK) {
- return TDB_SUCCESS;
- }
- if (rw_type == F_WRLCK && (tdb->flags & TDB_RDONLY)) {
- return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
- "Write lock attempted on read-only database");
- }
- /* A 32 bit system cannot open a 64-bit file, but it could have
- * expanded since then: check here. */
- if ((size_t)(offset + len) != offset + len) {
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_brlock: lock on giant offset %llu",
- (long long)(offset + len));
- }
- ret = lock(tdb, rw_type, offset, len, flags & TDB_LOCK_WAIT);
- if (ret != 0) {
- /* Generic lock error. errno set by fcntl.
- * EAGAIN is an expected return from non-blocking
- * locks. */
- if (!(flags & TDB_LOCK_PROBE)
- && (errno != EAGAIN && errno != EINTR)) {
- tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_brlock failed (fd=%d) at"
- " offset %zu rw_type=%d flags=%d len=%zu:"
- " %s",
- tdb->file->fd, (size_t)offset, rw_type,
- flags, (size_t)len, strerror(errno));
- }
- return TDB_ERR_LOCK;
- }
- return TDB_SUCCESS;
-static enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb,
- int rw_type, tdb_off_t offset, size_t len)
- if (tdb->flags & TDB_NOLOCK) {
- return TDB_SUCCESS;
- }
- if (!check_lock_pid(tdb, "tdb_brunlock", true))
- return TDB_ERR_LOCK;
- if (unlock(tdb, rw_type, offset, len) == -1) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_brunlock failed (fd=%d) at offset %zu"
- " rw_type=%d len=%zu: %s",
- tdb->file->fd, (size_t)offset, rw_type,
- (size_t)len, strerror(errno));
- }
- return TDB_SUCCESS;
- upgrade a read lock to a write lock. This needs to be handled in a
- special way as some OSes (such as solaris) have too conservative
- deadlock detection and claim a deadlock when progress can be
- made. For those OSes we may loop for a while.
-enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb, off_t start)
- int count = 1000;
- if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
- return TDB_ERR_LOCK;
- if (tdb->file->allrecord_lock.count != 1) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_allrecord_upgrade failed:"
- " count %u too high",
- tdb->file->allrecord_lock.count);
- }
- if (tdb->file-> != 1) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_allrecord_upgrade failed:"
- " already upgraded?");
- }
- if (tdb->file->allrecord_lock.owner != tdb) {
- return owner_conflict(tdb, "tdb_allrecord_upgrade");
- }
- while (count--) {
- struct timeval tv;
- if (tdb_brlock(tdb, F_WRLCK, start, 0,
- tdb->file->allrecord_lock.ltype = F_WRLCK;
- tdb->file-> = 0;
- return TDB_SUCCESS;
- }
- if (errno != EDEADLK) {
- break;
- }
- /* sleep for as short a time as we can - more portable than usleep() */
- tv.tv_sec = 0;
- tv.tv_usec = 1;
- select(0, NULL, NULL, NULL, &tv);
- }
- if (errno != EAGAIN && errno != EINTR)
- tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_allrecord_upgrade failed");
- return TDB_ERR_LOCK;
-static struct tdb_lock *find_nestlock(struct tdb_context *tdb, tdb_off_t offset,
- const struct tdb_context *owner)
- unsigned int i;
- for (i=0; i<tdb->file->num_lockrecs; i++) {
- if (tdb->file->lockrecs[i].off == offset) {
- if (owner && tdb->file->lockrecs[i].owner != owner)
- return NULL;
- return &tdb->file->lockrecs[i];
- }
- }
- return NULL;
-enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb)
- enum TDB_ERROR ecode;
- if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
- return TDB_ERR_LOCK;
- ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK,
- false);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
- if (ecode != TDB_SUCCESS) {
- tdb_allrecord_unlock(tdb, F_WRLCK);
- return ecode;
- }
- ecode = tdb_transaction_recover(tdb);
- tdb_unlock_open(tdb, F_WRLCK);
- tdb_allrecord_unlock(tdb, F_WRLCK);
- return ecode;
-/* lock an offset in the database. */
-static enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb,
- tdb_off_t offset, int ltype,
- enum tdb_lock_flags flags)
- struct tdb_lock *new_lck;
- enum TDB_ERROR ecode;
- + tdb->file->map_size / 8)) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_nest_lock: invalid offset %zu ltype=%d",
- (size_t)offset, ltype);
- }
- if (tdb->flags & TDB_NOLOCK)
- return TDB_SUCCESS;
- if (!check_lock_pid(tdb, "tdb_nest_lock", true)) {
- return TDB_ERR_LOCK;
- }
- tdb->stats.locks++;
- new_lck = find_nestlock(tdb, offset, NULL);
- if (new_lck) {
- if (new_lck->owner != tdb) {
- return owner_conflict(tdb, "tdb_nest_lock");
- }
- if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_nest_lock:"
- " offset %zu has read lock",
- (size_t)offset);
- }
- /* Just increment the struct, posix locks don't stack. */
- new_lck->count++;
- return TDB_SUCCESS;
- }
-#if 0
- if (tdb->file->num_lockrecs
- && offset >= TDB_HASH_LOCK_START
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_nest_lock: already have a hash lock?");
- }
- new_lck = (struct tdb_lock *)realloc(
- tdb->file->lockrecs,
- sizeof(*tdb->file->lockrecs) * (tdb->file->num_lockrecs+1));
- if (new_lck == NULL) {
- return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_nest_lock:"
- " unable to allocate %zu lock struct",
- tdb->file->num_lockrecs + 1);
- }
- tdb->file->lockrecs = new_lck;
- /* Since fcntl locks don't nest, we do a lock for the first one,
- and simply bump the count for future ones */
- ecode = tdb_brlock(tdb, ltype, offset, 1, flags);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* First time we grab a lock, perhaps someone died in commit? */
- if (!(flags & TDB_LOCK_NOCHECK)
- && tdb->file->num_lockrecs == 0) {
- tdb_bool_err berr = tdb_needs_recovery(tdb);
- if (berr != false) {
- tdb_brunlock(tdb, ltype, offset, 1);
- if (berr < 0)
- return TDB_OFF_TO_ERR(berr);
- ecode = tdb_lock_and_recover(tdb);
- if (ecode == TDB_SUCCESS) {
- ecode = tdb_brlock(tdb, ltype, offset, 1,
- flags);
- }
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- }
- }
- tdb->file->lockrecs[tdb->file->num_lockrecs].owner = tdb;
- tdb->file->lockrecs[tdb->file->num_lockrecs].off = offset;
- tdb->file->lockrecs[tdb->file->num_lockrecs].count = 1;
- tdb->file->lockrecs[tdb->file->num_lockrecs].ltype = ltype;
- tdb->file->num_lockrecs++;
- return TDB_SUCCESS;
-static enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb,
- tdb_off_t off, int ltype)
- struct tdb_lock *lck;
- enum TDB_ERROR ecode;
- if (tdb->flags & TDB_NOLOCK)
- return TDB_SUCCESS;
- lck = find_nestlock(tdb, off, tdb);
- if ((lck == NULL) || (lck->count == 0)) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_nest_unlock: no lock for %zu",
- (size_t)off);
- }
- if (lck->count > 1) {
- lck->count--;
- return TDB_SUCCESS;
- }
- /*
- * This lock has count==1 left, so we need to unlock it in the
- * kernel. We don't bother with decrementing the in-memory array
- * element, we're about to overwrite it with the last array element
- * anyway.
- */
- ecode = tdb_brunlock(tdb, ltype, off, 1);
- /*
- * Shrink the array by overwriting the element just unlocked with the
- * last array element.
- */
- *lck = tdb->file->lockrecs[--tdb->file->num_lockrecs];
- return ecode;
- get the transaction lock
- */
-enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype)
- return tdb_nest_lock(tdb, TDB_TRANSACTION_LOCK, ltype, TDB_LOCK_WAIT);
- release the transaction lock
- */
-void tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
- tdb_nest_unlock(tdb, TDB_TRANSACTION_LOCK, ltype);
-/* We only need to lock individual bytes, but Linux merges consecutive locks
- * so we lock in contiguous ranges. */
-static enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb,
- int ltype, enum tdb_lock_flags flags,
- tdb_off_t off, tdb_off_t len)
- enum TDB_ERROR ecode;
- enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
- if (len <= 1) {
- /* 0 would mean to end-of-file... */
- assert(len != 0);
- /* Single hash. Just do blocking lock. */
- return tdb_brlock(tdb, ltype, off, len, flags);
- }
- /* First we try non-blocking. */
- ecode = tdb_brlock(tdb, ltype, off, len, nb_flags);
- if (ecode != TDB_ERR_LOCK) {
- return ecode;
- }
- /* Try locking first half, then second. */
- ecode = tdb_lock_gradual(tdb, ltype, flags, off, len / 2);
- if (ecode != TDB_SUCCESS)
- return ecode;
- ecode = tdb_lock_gradual(tdb, ltype, flags,
- off + len / 2, len - len / 2);
- if (ecode != TDB_SUCCESS) {
- tdb_brunlock(tdb, ltype, off, len / 2);
- }
- return ecode;
-/* lock/unlock entire database. It can only be upgradable if you have some
- * other way of guaranteeing exclusivity (ie. transaction write lock). */
-enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
- enum tdb_lock_flags flags, bool upgradable)
- enum TDB_ERROR ecode;
- tdb_bool_err berr;
- if (tdb->flags & TDB_NOLOCK)
- return TDB_SUCCESS;
- if (!check_lock_pid(tdb, "tdb_allrecord_lock", true)) {
- return TDB_ERR_LOCK;
- }
- if (tdb->file->allrecord_lock.count) {
- if (tdb->file->allrecord_lock.owner != tdb) {
- return owner_conflict(tdb, "tdb_allrecord_lock");
- }
- if (ltype == F_RDLCK
- || tdb->file->allrecord_lock.ltype == F_WRLCK) {
- tdb->file->allrecord_lock.count++;
- return TDB_SUCCESS;
- }
- /* a global lock of a different type exists */
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "tdb_allrecord_lock: already have %s lock",
- tdb->file->allrecord_lock.ltype == F_RDLCK
- ? "read" : "write");
- }
- if (tdb_has_hash_locks(tdb)) {
- /* can't combine global and chain locks */
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "tdb_allrecord_lock:"
- " already have chain lock");
- }
- if (upgradable && ltype != F_RDLCK) {
- /* tdb error: you can't upgrade a write lock! */
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_allrecord_lock:"
- " can't upgrade a write lock");
- }
- tdb->stats.locks++;
- /* Lock hashes, gradually. */
- ecode = tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
- if (ecode != TDB_SUCCESS)
- return ecode;
- /* Lock free tables: there to end of file. */
- ecode = tdb_brlock(tdb, ltype,
- 0, flags);
- if (ecode != TDB_SUCCESS) {
- tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
- return ecode;
- }
- tdb->file->allrecord_lock.owner = tdb;
- tdb->file->allrecord_lock.count = 1;
- /* If it's upgradable, it's actually exclusive so we can treat
- * it as a write lock. */
- tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
- tdb->file-> = upgradable;
- /* Now check for needing recovery. */
- if (flags & TDB_LOCK_NOCHECK)
- return TDB_SUCCESS;
- berr = tdb_needs_recovery(tdb);
- if (likely(berr == false))
- return TDB_SUCCESS;
- tdb_allrecord_unlock(tdb, ltype);
- if (berr < 0)
- return TDB_OFF_TO_ERR(berr);
- ecode = tdb_lock_and_recover(tdb);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- goto again;
-enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
- int ltype, enum tdb_lock_flags flags)
- return tdb_nest_lock(tdb, TDB_OPEN_LOCK, ltype, flags);
-void tdb_unlock_open(struct tdb_context *tdb, int ltype)
- tdb_nest_unlock(tdb, TDB_OPEN_LOCK, ltype);
-bool tdb_has_open_lock(struct tdb_context *tdb)
- return !(tdb->flags & TDB_NOLOCK)
- && find_nestlock(tdb, TDB_OPEN_LOCK, tdb) != NULL;
-enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype)
- /* Lock doesn't protect data, so don't check (we recurse if we do!) */
- return tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype,
-void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
- tdb_nest_unlock(tdb, TDB_EXPANSION_LOCK, ltype);
-/* unlock entire db */
-void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
- if (tdb->flags & TDB_NOLOCK)
- return;
- if (tdb->file->allrecord_lock.count == 0) {
- tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "tdb_allrecord_unlock: not locked!");
- return;
- }
- if (tdb->file->allrecord_lock.owner != tdb) {
- tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "tdb_allrecord_unlock: not locked by us!");
- return;
- }
- /* Upgradable locks are marked as write locks. */
- if (tdb->file->allrecord_lock.ltype != ltype
- && (!tdb->file-> || ltype != F_RDLCK)) {
- tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_allrecord_unlock: have %s lock",
- tdb->file->allrecord_lock.ltype == F_RDLCK
- ? "read" : "write");
- return;
- }
- if (tdb->file->allrecord_lock.count > 1) {
- tdb->file->allrecord_lock.count--;
- return;
- }
- tdb->file->allrecord_lock.count = 0;
- tdb->file->allrecord_lock.ltype = 0;
- tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 0);
-bool tdb_has_expansion_lock(struct tdb_context *tdb)
- return find_nestlock(tdb, TDB_EXPANSION_LOCK, tdb) != NULL;
-bool tdb_has_hash_locks(struct tdb_context *tdb)
- unsigned int i;
- for (i=0; i<tdb->file->num_lockrecs; i++) {
- if (tdb->file->lockrecs[i].off >= TDB_HASH_LOCK_START
- && tdb->file->lockrecs[i].off < (TDB_HASH_LOCK_START
- return true;
- }
- return false;
-static bool tdb_has_free_lock(struct tdb_context *tdb)
- unsigned int i;
- if (tdb->flags & TDB_NOLOCK)
- return false;
- for (i=0; i<tdb->file->num_lockrecs; i++) {
- if (tdb->file->lockrecs[i].off
- return true;
- }
- return false;
-enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
- tdb_off_t hash_lock,
- tdb_len_t hash_range,
- int ltype, enum tdb_lock_flags waitflag)
- /* FIXME: Do this properly, using hlock_range */
- unsigned l = TDB_HASH_LOCK_START
- + (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
- /* a allrecord lock allows us to avoid per chain locks */
- if (tdb->file->allrecord_lock.count) {
- if (!check_lock_pid(tdb, "tdb_lock_hashes", true))
- return TDB_ERR_LOCK;
- if (tdb->file->allrecord_lock.owner != tdb)
- return owner_conflict(tdb, "tdb_lock_hashes");
- if (ltype == tdb->file->allrecord_lock.ltype
- || ltype == F_RDLCK) {
- return TDB_SUCCESS;
- }
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "tdb_lock_hashes:"
- " already have %s allrecordlock",
- tdb->file->allrecord_lock.ltype == F_RDLCK
- ? "read" : "write");
- }
- if (tdb_has_free_lock(tdb)) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_lock_hashes: already have free lock");
- }
- if (tdb_has_expansion_lock(tdb)) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_lock_hashes:"
- " already have expansion lock");
- }
- return tdb_nest_lock(tdb, l, ltype, waitflag);
-enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
- tdb_off_t hash_lock,
- tdb_len_t hash_range, int ltype)
- unsigned l = TDB_HASH_LOCK_START
- + (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
- if (tdb->flags & TDB_NOLOCK)
- return 0;
- /* a allrecord lock allows us to avoid per chain locks */
- if (tdb->file->allrecord_lock.count) {
- if (tdb->file->allrecord_lock.ltype == F_RDLCK
- && ltype == F_WRLCK) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_unlock_hashes RO allrecord!");
- }
- if (tdb->file->allrecord_lock.owner != tdb) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "tdb_unlock_hashes:"
- " not locked by us!");
- }
- return TDB_SUCCESS;
- }
- return tdb_nest_unlock(tdb, l, ltype);
-/* Hash locks use TDB_HASH_LOCK_START + the next 30 bits.
- * Then we begin; bucket offsets are sizeof(tdb_len_t) apart, so we divide.
- * The result is that on 32 bit systems we don't use lock values > 2^31 on
- * files that are less than 4GB.
- */
-static tdb_off_t free_lock_off(tdb_off_t b_off)
- + b_off / sizeof(tdb_off_t);
-enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
- enum tdb_lock_flags waitflag)
- assert(b_off >= sizeof(struct tdb_header));
- if (tdb->flags & TDB_NOLOCK)
- return 0;
- /* a allrecord lock allows us to avoid per chain locks */
- if (tdb->file->allrecord_lock.count) {
- if (!check_lock_pid(tdb, "tdb_lock_free_bucket", true))
- return TDB_ERR_LOCK;
- if (tdb->file->allrecord_lock.owner != tdb) {
- return owner_conflict(tdb, "tdb_lock_free_bucket");
- }
- if (tdb->file->allrecord_lock.ltype == F_WRLCK)
- return 0;
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_lock_free_bucket with"
- " read-only allrecordlock!");
- }
-#if 0 /* FIXME */
- if (tdb_has_expansion_lock(tdb)) {
- return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
- "tdb_lock_free_bucket:"
- " already have expansion lock");
- }
- return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag);
-void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off)
- if (tdb->file->allrecord_lock.count)
- return;
- tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK);
-_PUBLIC_ enum TDB_ERROR tdb_lockall(struct tdb_context *tdb)
- return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
-_PUBLIC_ void tdb_unlockall(struct tdb_context *tdb)
- tdb_allrecord_unlock(tdb, F_WRLCK);
-_PUBLIC_ enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb)
- return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
-_PUBLIC_ void tdb_unlockall_read(struct tdb_context *tdb)
- tdb_allrecord_unlock(tdb, F_RDLCK);
-void tdb_lock_cleanup(struct tdb_context *tdb)
- unsigned int i;
- /* We don't want to warn: they're allowed to close tdb after fork. */
- if (!check_lock_pid(tdb, "tdb_close", false))
- return;
- while (tdb->file->allrecord_lock.count
- && tdb->file->allrecord_lock.owner == tdb) {
- tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
- }
- for (i=0; i<tdb->file->num_lockrecs; i++) {
- if (tdb->file->lockrecs[i].owner == tdb) {
- tdb_nest_unlock(tdb,
- tdb->file->lockrecs[i].off,
- tdb->file->lockrecs[i].ltype);
- i--;
- }
- }
diff --git a/lib/tdb2/open.c b/lib/tdb2/open.c
deleted file mode 100644
index fab855b6b8..0000000000
--- a/lib/tdb2/open.c
+++ /dev/null
@@ -1,768 +0,0 @@
- /*
- Trivial Database 2: opening and closing TDBs
- Copyright (C) Rusty Russell 2010
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include "private.h"
-#include <ccan/build_assert/build_assert.h>
-#include <assert.h>
-/* all tdbs, to detect double-opens (fcntl file don't nest!) */
-static struct tdb_context *tdbs = NULL;
-static struct tdb_file *find_file(dev_t device, ino_t ino)
- struct tdb_context *i;
- for (i = tdbs; i; i = i->next) {
- if (i->file->device == device && i->file->inode == ino) {
- i->file->refcnt++;
- return i->file;
- }
- }
- return NULL;
-static bool read_all(int fd, void *buf, size_t len)
- while (len) {
- ssize_t ret;
- ret = read(fd, buf, len);
- if (ret < 0)
- return false;
- if (ret == 0) {
- /* ETOOSHORT? */
- errno = EWOULDBLOCK;
- return false;
- }
- buf = (char *)buf + ret;
- len -= ret;
- }
- return true;
-static uint64_t random_number(struct tdb_context *tdb)
- int fd;
- uint64_t ret = 0;
- struct timeval now;
- fd = open("/dev/urandom", O_RDONLY);
- if (fd >= 0) {
- if (read_all(fd, &ret, sizeof(ret))) {
- close(fd);
- return ret;
- }
- close(fd);
- }
- /* FIXME: Untested! Based on Wikipedia protocol description! */
- fd = open("/dev/egd-pool", O_RDWR);
- if (fd >= 0) {
- /* Command is 1, next byte is size we want to read. */
- char cmd[2] = { 1, sizeof(uint64_t) };
- if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
- char reply[1 + sizeof(uint64_t)];
- int r = read(fd, reply, sizeof(reply));
- if (r > 1) {
- /* Copy at least some bytes. */
- memcpy(&ret, reply+1, r - 1);
- if (reply[0] == sizeof(uint64_t)
- && r == sizeof(reply)) {
- close(fd);
- return ret;
- }
- }
- }
- close(fd);
- }
- /* Fallback: pid and time. */
- gettimeofday(&now, NULL);
- ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
- tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
- "tdb_open: random from getpid and time");
- return ret;
-static void tdb2_context_init(struct tdb_context *tdb)
- /* Initialize the TDB2 fields here */
- tdb_io_init(tdb);
- tdb->direct_access = 0;
- tdb->transaction = NULL;
- tdb->access = NULL;
-struct new_database {
- struct tdb_header hdr;
- struct tdb_freetable ftable;
-/* initialise a new database */
-static enum TDB_ERROR tdb_new_database(struct tdb_context *tdb,
- struct tdb_attribute_seed *seed,
- struct tdb_header *hdr)
- /* We make it up in memory, then write it out if not internal */
- struct new_database newdb;
- unsigned int magic_len;
- ssize_t rlen;
- enum TDB_ERROR ecode;
- /* Fill in the header */
- newdb.hdr.version = TDB_VERSION;
- if (seed)
- newdb.hdr.hash_seed = seed->seed;
- else
- newdb.hdr.hash_seed = random_number(tdb);
- newdb.hdr.hash_test = TDB_HASH_MAGIC;
- newdb.hdr.hash_test = tdb->hash_fn(&newdb.hdr.hash_test,
- sizeof(newdb.hdr.hash_test),
- newdb.hdr.hash_seed,
- tdb->hash_data);
- newdb.hdr.recovery = 0;
- newdb.hdr.features_used = newdb.hdr.features_offered = TDB_FEATURE_MASK;
- newdb.hdr.seqnum = 0;
- newdb.hdr.capabilities = 0;
- memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved));
- /* Initial hashes are empty. */
- memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
- /* Free is empty. */
- newdb.hdr.free_table = offsetof(struct new_database, ftable);
- memset(&newdb.ftable, 0, sizeof(newdb.ftable));
- ecode = set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
- sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
- sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
- 0);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- /* Magic food */
- memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
- strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
- /* This creates an endian-converted database, as if read from disk */
- magic_len = sizeof(newdb.hdr.magic_food);
- tdb_convert(tdb,
- (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len);
- *hdr = newdb.hdr;
- if (tdb->flags & TDB_INTERNAL) {
- tdb->file->map_size = sizeof(newdb);
- tdb->file->map_ptr = malloc(tdb->file->map_size);
- if (!tdb->file->map_ptr) {
- return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_new_database:"
- " failed to allocate");
- }
- memcpy(tdb->file->map_ptr, &newdb, tdb->file->map_size);
- return TDB_SUCCESS;
- }
- if (lseek(tdb->file->fd, 0, SEEK_SET) == -1) {
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_new_database:"
- " failed to seek: %s", strerror(errno));
- }
- if (ftruncate(tdb->file->fd, 0) == -1) {
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_new_database:"
- " failed to truncate: %s", strerror(errno));
- }
- rlen = write(tdb->file->fd, &newdb, sizeof(newdb));
- if (rlen != sizeof(newdb)) {
- if (rlen >= 0)
- errno = ENOSPC;
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_new_database: %zi writing header: %s",
- rlen, strerror(errno));
- }
- return TDB_SUCCESS;
-static enum TDB_ERROR tdb_new_file(struct tdb_context *tdb)
- tdb->file = malloc(sizeof(*tdb->file));
- if (!tdb->file)
- return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_open: cannot alloc tdb_file structure");
- tdb->file->num_lockrecs = 0;
- tdb->file->lockrecs = NULL;
- tdb->file->allrecord_lock.count = 0;
- tdb->file->refcnt = 1;
- tdb->file->map_ptr = NULL;
- return TDB_SUCCESS;
-_PUBLIC_ enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
- const union tdb_attribute *attr)
- switch (attr->base.attr) {
- tdb->log_fn = attr->log.fn;
- tdb->log_data = attr->;
- break;
- return tdb->last_error
- = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_set_attribute:"
- " cannot set %s after opening",
- attr->base.attr == TDB_ATTRIBUTE_HASH
- : attr->base.attr == TDB_ATTRIBUTE_SEED
- return tdb->last_error
- = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_set_attribute:"
- " cannot set TDB_ATTRIBUTE_STATS");
- tdb->lock_fn = attr->flock.lock;
- tdb->unlock_fn = attr->flock.unlock;
- tdb->lock_data = attr->;
- break;
- default:
- return tdb->last_error
- = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_set_attribute:"
- " unknown attribute type %u",
- attr->base.attr);
- }
- return TDB_SUCCESS;
-_PUBLIC_ enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
- union tdb_attribute *attr)
- switch (attr->base.attr) {
- if (!tdb->log_fn)
- return tdb->last_error = TDB_ERR_NOEXIST;
- attr->log.fn = tdb->log_fn;
- attr-> = tdb->log_data;
- break;
- attr->hash.fn = tdb->hash_fn;
- attr-> = tdb->hash_data;
- break;
- attr->seed.seed = tdb->hash_seed;
- break;
- if (!tdb->openhook)
- return tdb->last_error = TDB_ERR_NOEXIST;
- attr->openhook.fn = tdb->openhook;
- attr-> = tdb->openhook_data;
- break;
- size_t size = attr->stats.size;
- if (size > tdb->stats.size)
- size = tdb->stats.size;
- memcpy(&attr->stats, &tdb->stats, size);
- break;
- }
- attr->flock.lock = tdb->lock_fn;
- attr->flock.unlock = tdb->unlock_fn;
- attr-> = tdb->lock_data;
- break;
- default:
- return tdb->last_error
- = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_get_attribute:"
- " unknown attribute type %u",
- attr->base.attr);
- }
- attr-> = NULL;
- return TDB_SUCCESS;
-_PUBLIC_ void tdb_unset_attribute(struct tdb_context *tdb,
- enum tdb_attribute_type type)
- switch (type) {
- tdb->log_fn = NULL;
- break;
- tdb->openhook = NULL;
- break;
- tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
- "tdb_unset_attribute: cannot unset %s after opening",
- break;
- tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_unset_attribute:"
- "cannot unset TDB_ATTRIBUTE_STATS");
- break;
- tdb->lock_fn = tdb_fcntl_lock;
- tdb->unlock_fn = tdb_fcntl_unlock;
- break;
- default:
- tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_unset_attribute: unknown attribute type %u",
- type);
- }
-/* The top three bits of the capability tell us whether it matters. */
-enum TDB_ERROR unknown_capability(struct tdb_context *tdb, const char *caller,
- tdb_off_t type)
- if (type & TDB_CAP_NOOPEN) {
- return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "%s: file has unknown capability %llu",
- caller, type & TDB_CAP_NOOPEN);
- }
- if ((type & TDB_CAP_NOWRITE) && !(tdb->flags & TDB_RDONLY)) {
- return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_ERROR,
- "%s: file has unknown capability %llu"
- " (cannot write to it)",
- caller, type & TDB_CAP_NOOPEN);
- }
- if (type & TDB_CAP_NOCHECK) {
- tdb->flags |= TDB_CANT_CHECK;
- }
- return TDB_SUCCESS;
-static enum TDB_ERROR capabilities_ok(struct tdb_context *tdb,
- tdb_off_t capabilities)
- tdb_off_t off, next;
- enum TDB_ERROR ecode = TDB_SUCCESS;
- const struct tdb_capability *cap;
- /* Check capability list. */
- for (off = capabilities; off && ecode == TDB_SUCCESS; off = next) {
- cap = tdb_access_read(tdb, off, sizeof(*cap), true);
- if (TDB_PTR_IS_ERR(cap)) {
- return TDB_PTR_ERR(cap);
- }
- switch (cap->type & TDB_CAP_TYPE_MASK) {
- /* We don't understand any capabilities (yet). */
- default:
- ecode = unknown_capability(tdb, "tdb_open", cap->type);
- }
- next = cap->next;
- tdb_access_release(tdb, cap);
- }
- return ecode;
-_PUBLIC_ struct tdb_context *tdb_open(const char *name, int tdb_flags,
- int open_flags, mode_t mode,
- union tdb_attribute *attr)
- struct tdb_context *tdb;
- struct stat st;
- int saved_errno = 0;
- uint64_t hash_test;
- unsigned v;
- ssize_t rlen;
- struct tdb_header hdr;
- struct tdb_attribute_seed *seed = NULL;
- tdb_bool_err berr;
- enum TDB_ERROR ecode;
- int openlock;
- tdb = malloc(sizeof(*tdb) + (name ? strlen(name) + 1 : 0));
- if (!tdb) {
- /* Can't log this */
- errno = ENOMEM;
- return NULL;
- }
- /* Set name immediately for logging functions. */
- if (name) {
- tdb->name = strcpy((char *)(tdb + 1), name);
- } else {
- tdb->name = NULL;
- }
- tdb->flags = tdb_flags;
- tdb->log_fn = NULL;
- tdb->open_flags = open_flags;
- tdb->last_error = TDB_SUCCESS;
- tdb->file = NULL;
- tdb->openhook = NULL;
- tdb->lock_fn = tdb_fcntl_lock;
- tdb->unlock_fn = tdb_fcntl_unlock;
- tdb->hash_fn = tdb_jenkins_hash;
- memset(&tdb->stats, 0, sizeof(tdb->stats));
- tdb->stats.base.attr = TDB_ATTRIBUTE_STATS;
- tdb->stats.size = sizeof(tdb->stats);
- while (attr) {
- switch (attr->base.attr) {
- tdb->hash_fn = attr->hash.fn;
- tdb->hash_data = attr->;
- break;
- seed = &attr->seed;
- break;
- tdb->openhook = attr->openhook.fn;
- tdb->openhook_data = attr->;
- break;
- default:
- /* These are set as normal. */
- ecode = tdb_set_attribute(tdb, attr);
- if (ecode != TDB_SUCCESS)
- goto fail;
- }
- attr = attr->;
- }
- | TDB_RDONLY)) {
- ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
- "tdb_open: unknown flags %u", tdb_flags);
- goto fail;
- }
- if (seed) {
- if (!(tdb_flags & TDB_INTERNAL) && !(open_flags & O_CREAT)) {
- ecode = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_open:"
- " cannot set TDB_ATTRIBUTE_SEED"
- " without O_CREAT.");
- goto fail;
- }
- }
- if ((open_flags & O_ACCMODE) == O_WRONLY) {
- ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
- "tdb_open: can't open tdb %s write-only",
- name);
- goto fail;
- }
- if ((open_flags & O_ACCMODE) == O_RDONLY) {
- openlock = F_RDLCK;
- tdb->flags |= TDB_RDONLY;
- } else {
- if (tdb_flags & TDB_RDONLY) {
- ecode = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_open: can't use TDB_RDONLY"
- " without O_RDONLY");
- goto fail;
- }
- openlock = F_WRLCK;
- }
- /* internal databases don't need any of the rest. */
- if (tdb->flags & TDB_INTERNAL) {
- tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
- ecode = tdb_new_file(tdb);
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- tdb->file->fd = -1;
- ecode = tdb_new_database(tdb, seed, &hdr);
- if (ecode == TDB_SUCCESS) {
- tdb_convert(tdb, &hdr.hash_seed,
- sizeof(hdr.hash_seed));
- tdb->hash_seed = hdr.hash_seed;
- tdb2_context_init(tdb);
- tdb_ftable_init(tdb);
- }
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- return tdb;
- }
- if (stat(name, &st) != -1)
- tdb->file = find_file(st.st_dev, st.st_ino);
- if (!tdb->file) {
- int fd;
- if ((fd = open(name, open_flags, mode)) == -1) {
- /* errno set by open(2) */
- saved_errno = errno;
- tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open: could not open file %s: %s",
- name, strerror(errno));
- goto fail_errno;
- }
- /* on exec, don't inherit the fd */
- v = fcntl(fd, F_GETFD, 0);
- fcntl(fd, F_SETFD, v | FD_CLOEXEC);
- if (fstat(fd, &st) == -1) {
- saved_errno = errno;
- tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open: could not stat open %s: %s",
- name, strerror(errno));
- close(fd);
- goto fail_errno;
- }
- ecode = tdb_new_file(tdb);
- if (ecode != TDB_SUCCESS) {
- close(fd);
- goto fail;
- }
- tdb->file->fd = fd;
- tdb->file->device = st.st_dev;
- tdb->file->inode = st.st_ino;
- tdb->file->map_ptr = NULL;
- tdb->file->map_size = 0;
- }
- /* ensure there is only one process initialising at once */
- ecode = tdb_lock_open(tdb, openlock, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
- if (ecode != TDB_SUCCESS) {
- saved_errno = errno;
- goto fail_errno;
- }
- /* call their open hook if they gave us one. */
- if (tdb->openhook) {
- ecode = tdb->openhook(tdb->file->fd, tdb->openhook_data);
- if (ecode != TDB_SUCCESS) {
- tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
- "tdb_open: open hook failed");
- goto fail;
- }
- open_flags |= O_CREAT;
- }
- /* If they used O_TRUNC, read will return 0. */
- rlen = pread(tdb->file->fd, &hdr, sizeof(hdr), 0);
- if (rlen == 0 && (open_flags & O_CREAT)) {
- ecode = tdb_new_database(tdb, seed, &hdr);
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- } else if (rlen < 0) {
- ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open: error %s reading %s",
- strerror(errno), name);
- goto fail;
- } else if (rlen < sizeof(hdr)
- || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
- ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open: %s is not a tdb2 file", name);
- goto fail;
- }
- if (hdr.version != TDB_VERSION) {
- if (hdr.version == bswap_64(TDB_VERSION))
- tdb->flags |= TDB_CONVERT;
- else {
- /* wrong version */
- ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open:"
- " %s is unknown version 0x%llx",
- name, (long long)hdr.version);
- goto fail;
- }
- } else if (tdb->flags & TDB_CONVERT) {
- ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open:"
- " %s does not need TDB_CONVERT",
- name);
- goto fail;
- }
- tdb2_context_init(tdb);
- tdb_convert(tdb, &hdr, sizeof(hdr));
- tdb->hash_seed = hdr.hash_seed;
- hash_test = TDB_HASH_MAGIC;
- hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
- if (hdr.hash_test != hash_test) {
- /* wrong hash variant */
- ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open:"
- " %s uses a different hash function",
- name);
- goto fail;
- }
- ecode = capabilities_ok(tdb, hdr.capabilities);
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- /* Clear any features we don't understand. */
- if ((open_flags & O_ACCMODE) != O_RDONLY) {
- hdr.features_used &= TDB_FEATURE_MASK;
- ecode = tdb_write_convert(tdb, offsetof(struct tdb_header,
- features_used),
- &hdr.features_used,
- sizeof(hdr.features_used));
- if (ecode != TDB_SUCCESS)
- goto fail;
- }
- tdb_unlock_open(tdb, openlock);
- /* This makes sure we have current map_size and mmap. */
- ecode = tdb->io->oob(tdb, tdb->file->map_size, 1, true);
- if (unlikely(ecode != TDB_SUCCESS))
- goto fail;
- /* Now it's fully formed, recover if necessary. */
- berr = tdb_needs_recovery(tdb);
- if (unlikely(berr != false)) {
- if (berr < 0) {
- ecode = TDB_OFF_TO_ERR(berr);
- goto fail;
- }
- ecode = tdb_lock_and_recover(tdb);
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- }
- ecode = tdb_ftable_init(tdb);
- if (ecode != TDB_SUCCESS) {
- goto fail;
- }
- tdb->next = tdbs;
- tdbs = tdb;
- return tdb;
- fail:
- /* Map ecode to some logical errno. */
- switch (TDB_ERR_TO_OFF(ecode)) {
- saved_errno = EIO;
- break;
- saved_errno = EWOULDBLOCK;
- break;
- saved_errno = ENOMEM;
- break;
- saved_errno = EINVAL;
- break;
- default:
- saved_errno = EINVAL;
- break;
- }
-#ifdef TDB_TRACE
- close(tdb->tracefd);
- if (tdb->file) {
- tdb_lock_cleanup(tdb);
- if (--tdb->file->refcnt == 0) {
- assert(tdb->file->num_lockrecs == 0);
- if (tdb->file->map_ptr) {
- if (tdb->flags & TDB_INTERNAL) {
- free(tdb->file->map_ptr);
- } else
- tdb_munmap(tdb->file);
- }
- if (close(tdb->file->fd) != 0)
- tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
- "tdb_open: failed to close tdb fd"
- " on error: %s", strerror(errno));
- free(tdb->file->lockrecs);
- free(tdb->file);
- }
- }
- free(tdb);
- errno = saved_errno;
- return NULL;
-_PUBLIC_ int tdb_close(struct tdb_context *tdb)
- int ret = 0;
- struct tdb_context **i;
- tdb_trace(tdb, "tdb_close");
- if (tdb->transaction) {
- tdb_transaction_cancel(tdb);
- }
- if (tdb->file->map_ptr) {
- if (tdb->flags & TDB_INTERNAL)
- free(tdb->file->map_ptr);
- else
- tdb_munmap(tdb->file);
- }
- if (tdb->file) {
- tdb_lock_cleanup(tdb);
- if (--tdb->file->refcnt == 0) {
- ret = close(tdb->file->fd);
- free(tdb->file->lockrecs);
- free(tdb->file);
- }
- }
- /* Remove from tdbs list */
- for (i = &tdbs; *i; i = &(*i)->next) {
- if (*i == tdb) {
- *i = tdb->next;
- break;
- }
- }
-#ifdef TDB_TRACE
- close(tdb->tracefd);
- free(tdb);
- return ret;
-_PUBLIC_ void tdb_foreach_(int (*fn)(struct tdb_context *, void *), void *p)
- struct tdb_context *i;
- for (i = tdbs; i; i = i->next) {
- if (fn(i, p) != 0)
- break;
- }
diff --git a/lib/tdb2/private.h b/lib/tdb2/private.h
deleted file mode 100644
index 8c917a70b2..0000000000
--- a/lib/tdb2/private.h
+++ /dev/null
@@ -1,657 +0,0 @@
-#ifndef TDB_PRIVATE_H
-#define TDB_PRIVATE_H
- /*
- Trivial Database 2: private types and prototypes
- Copyright (C) Rusty Russell 2010
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include "config.h"
-#ifndef HAVE_CCAN
-#error You need ccan to build tdb2!
-#include "tdb2.h"
-#include <ccan/compiler/compiler.h>
-#include <ccan/likely/likely.h>
-#include <ccan/endian/endian.h>
-#include "replace.h"
-#include "system/filesys.h"
-#include "system/time.h"
-#include "system/shmem.h"
-#include "system/select.h"
-#include "system/wait.h"
-#include <stdint.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <sys/time.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdio.h>
-#include <utime.h>
-#include <unistd.h>
-#ifndef TEST_IT
-#define TEST_IT(cond)
-/* #define TDB_TRACE 1 */
-#ifndef __STRING
-#define __STRING(x) #x
-#define __STRINGSTRING(x) __STRING(x)
-#ifndef __location__
-#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
-typedef uint64_t tdb_len_t;
-typedef uint64_t tdb_off_t;
-#define TDB_MAGIC_FOOD "TDB file\n"
-#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
-#define TDB_USED_MAGIC ((uint64_t)0x1999)
-#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
-#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
-#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
-#define TDB_CAP_MAGIC ((uint64_t)0x1555)
-#define TDB_FREE_MAGIC ((uint64_t)0xFE)
-#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
-#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
-/* Capability bits. */
-#define TDB_CAP_NOCHECK 0x8000000000000000ULL
-#define TDB_CAP_NOWRITE 0x4000000000000000ULL
-#define TDB_CAP_NOOPEN 0x2000000000000000ULL
-#define TDB_OFF_IS_ERR(off) unlikely(off >= (tdb_off_t)(long)TDB_ERR_LAST)
-#define TDB_OFF_TO_ERR(off) ((enum TDB_ERROR)(long)(off))
-#define TDB_ERR_TO_OFF(ecode) ((tdb_off_t)(long)(ecode))
-/* Packing errors into pointers and v.v. */
-#define TDB_PTR_IS_ERR(ptr) \
- unlikely((unsigned long)(ptr) >= (unsigned long)TDB_ERR_LAST)
-#define TDB_PTR_ERR(p) ((enum TDB_ERROR)(long)(p))
-#define TDB_ERR_PTR(err) ((void *)(long)(err))
-/* Common case of returning true, false or -ve error. */
-typedef int tdb_bool_err;
-/* Prevent others from opening the file. */
-#define TDB_OPEN_LOCK 0
-/* Expanding file. */
-/* Doing a transaction. */
-/* Hash chain locks. */
-/* Range for hash locks. */
-/* We have 1024 entries in the top level. */
-/* And 64 entries in each sub-level: thus 64 bits exactly after 9 levels. */
-/* And 8 entries in each group, ie 8 groups per sublevel. */
-/* This is currently 10: beyond this we chain. */
-/* Extend file by least 100 times larger than needed. */
-/* We steal bits from the offsets to store hash info. */
-/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
-/* The bit number where we store extra hash bits. */
-/* Additional features we understand. Currently: none. */
-#define TDB_FEATURE_MASK ((uint64_t)0)
-/* The bit number where we store the extra hash bits. */
-/* Convenience mask to get actual offset. */
-#define TDB_OFF_MASK \
-/* How many buckets in a free list: see size_to_bucket(). */
-/* We have to be able to fit a free record here. */
-#define TDB_MIN_DATA_LEN \
- (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
-/* Indicates this entry is not on an flist (can happen during coalescing) */
-struct tdb_used_record {
- /* For on-disk compatibility, we avoid bitfields:
- magic: 16, (highest)
- key_len_bits: 5,
- extra_padding: 32
- hash_bits: 11
- */
- uint64_t magic_and_meta;
- /* The bottom key_len_bits*2 are key length, rest is data length. */
- uint64_t key_and_data_len;
-static inline unsigned rec_key_bits(const struct tdb_used_record *r)
- return ((r->magic_and_meta >> 43) & ((1 << 5)-1)) * 2;
-static inline uint64_t rec_key_length(const struct tdb_used_record *r)
- return r->key_and_data_len & ((1ULL << rec_key_bits(r)) - 1);
-static inline uint64_t rec_data_length(const struct tdb_used_record *r)
- return r->key_and_data_len >> rec_key_bits(r);
-static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
- return (r->magic_and_meta >> 11) & 0xFFFFFFFF;
-static inline uint32_t rec_hash(const struct tdb_used_record *r)
- return r->magic_and_meta & ((1 << 11) - 1);
-static inline uint16_t rec_magic(const struct tdb_used_record *r)
- return (r->magic_and_meta >> 48);
-struct tdb_free_record {
- uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
- uint64_t ftable_and_len; /* Len not counting these two fields. */
- /* This is why the minimum record size is 8 bytes. */
- uint64_t next;
-static inline uint64_t frec_prev(const struct tdb_free_record *f)
- return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
-static inline uint64_t frec_magic(const struct tdb_free_record *f)
- return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
-static inline uint64_t frec_len(const struct tdb_free_record *f)
- return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
-static inline unsigned frec_ftable(const struct tdb_free_record *f)
- return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL);
-struct tdb_recovery_record {
- uint64_t magic;
- /* Length of record (add this header to get total length). */
- uint64_t max_len;
- /* Length used. */
- uint64_t len;
- /* Old length of file before transaction. */
- uint64_t eof;
-/* If we bottom out of the subhashes, we chain. */
-struct tdb_chain {
- tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
- tdb_off_t next;
-/* this is stored at the front of every database */
-struct tdb_header {
- char magic_food[64]; /* for /etc/magic */
- /* FIXME: Make me 32 bit? */
- uint64_t version; /* version of the code */
- uint64_t hash_test; /* result of hashing HASH_MAGIC. */
- uint64_t hash_seed; /* "random" seed written at creation time. */
- tdb_off_t free_table; /* (First) free table. */
- tdb_off_t recovery; /* Transaction recovery area. */
- uint64_t features_used; /* Features all writers understand */
- uint64_t features_offered; /* Features offered */
- uint64_t seqnum; /* Sequence number for TDB_SEQNUM */
- tdb_off_t capabilities; /* Optional linked list of capabilities. */
- tdb_off_t reserved[22];
- /* Top level hash table. */
- tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
-struct tdb_freetable {
- struct tdb_used_record hdr;
- tdb_off_t next;
- tdb_off_t buckets[TDB_FREE_BUCKETS];
-struct tdb_capability {
- struct tdb_used_record hdr;
- tdb_off_t type;
- tdb_off_t next;
- /* ... */
-/* Information about a particular (locked) hash entry. */
-struct hash_info {
- /* Full hash value of entry. */
- uint64_t h;
- /* Start and length of lock acquired. */
- tdb_off_t hlock_start;
- tdb_len_t hlock_range;
- /* Start of hash group. */
- tdb_off_t group_start;
- /* Bucket we belong in. */
- unsigned int home_bucket;
- /* Bucket we (or an empty space) were found in. */
- unsigned int found_bucket;
- /* How many bits of the hash are already used. */
- unsigned int hash_used;
- /* Current working group. */
- tdb_off_t group[1 << TDB_HASH_GROUP_BITS];
-struct traverse_info {
- struct traverse_level {
- tdb_off_t hashtable;
- /* We ignore groups here, and treat it as a big array. */
- unsigned entry;
- unsigned int total_buckets;
- } levels[TDB_MAX_LEVELS + 1];
- unsigned int num_levels;
- unsigned int toplevel_group;
- /* This makes delete-everything-inside-traverse work as expected. */
- tdb_off_t prev;
-enum tdb_lock_flags {
- /* If set, don't log an error on failure. */
- /* If set, don't check for recovery (used by recovery code). */
-struct tdb_lock {
- struct tdb_context *owner;
- off_t off;
- uint32_t count;
- uint32_t ltype;
-/* This is only needed for tdb_access_commit, but used everywhere to
- * simplify. */
-struct tdb_access_hdr {
- struct tdb_access_hdr *next;
- tdb_off_t off;
- tdb_len_t len;
- bool convert;
-struct tdb_file {
- /* How many are sharing us? */
- unsigned int refcnt;
- /* Mmap (if any), or malloc (for TDB_INTERNAL). */
- void *map_ptr;
- /* How much space has been mapped (<= current file size) */
- tdb_len_t map_size;
- /* The file descriptor (-1 for TDB_INTERNAL). */
- int fd;
- /* Lock information */
- pid_t locker;
- struct tdb_lock allrecord_lock;
- size_t num_lockrecs;
- struct tdb_lock *lockrecs;
- /* Identity of this file. */
- dev_t device;
- ino_t inode;
-struct tdb_methods {
- enum TDB_ERROR (*tread)(struct tdb_context *, tdb_off_t, void *,
- tdb_len_t);
- enum TDB_ERROR (*twrite)(struct tdb_context *, tdb_off_t, const void *,
- tdb_len_t);
- enum TDB_ERROR (*oob)(struct tdb_context *, tdb_off_t, tdb_len_t, bool);
- enum TDB_ERROR (*expand_file)(struct tdb_context *, tdb_len_t);
- void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
- internal prototypes
-/* hash.c: */
-uint64_t tdb_jenkins_hash(const void *key, size_t length, uint64_t seed,
- void *unused);
-enum TDB_ERROR first_in_hash(struct tdb_context *tdb,
- struct traverse_info *tinfo,
- TDB_DATA *kbuf, size_t *dlen);
-enum TDB_ERROR next_in_hash(struct tdb_context *tdb,
- struct traverse_info *tinfo,
- TDB_DATA *kbuf, size_t *dlen);
-/* Hash random memory. */
-uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len);
-/* Hash on disk. */
-uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off);
-/* Find and lock a hash entry (or where it would be). */
-tdb_off_t find_and_lock(struct tdb_context *tdb,
- struct tdb_data key,
- int ltype,
- struct hash_info *h,
- struct tdb_used_record *rec,
- struct traverse_info *tinfo);
-enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
- struct hash_info *h,
- tdb_off_t new_off);
-enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
- tdb_off_t new_off);
-enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
-/* For tdb_check */
-bool is_subhash(tdb_off_t val);
-enum TDB_ERROR unknown_capability(struct tdb_context *tdb, const char *caller,
- tdb_off_t type);
-/* free.c: */
-enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb);
-/* check.c needs these to iterate through free lists. */
-tdb_off_t first_ftable(struct tdb_context *tdb);
-tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
-/* This returns space or -ve error number. */
-tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
- uint64_t hash, unsigned magic, bool growing);
-/* Put this record in a free list. */
-enum TDB_ERROR add_free_record(struct tdb_context *tdb,
- tdb_off_t off, tdb_len_t len_with_header,
- enum tdb_lock_flags waitflag,
- bool coalesce_ok);
-/* Set up header for a used/ftable/htable/chain/capability record. */
-enum TDB_ERROR set_header(struct tdb_context *tdb,
- struct tdb_used_record *rec,
- unsigned magic, uint64_t keylen, uint64_t datalen,
- uint64_t actuallen, unsigned hashlow);
-/* Used by tdb_check to verify. */
-unsigned int size_to_bucket(tdb_len_t data_len);
-tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
-/* Used by tdb_summary */
-tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off);
-/* Adjust expansion, used by create_recovery_area */
-tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size);
-/* io.c: */
-/* Initialize tdb->methods. */
-void tdb_io_init(struct tdb_context *tdb);
-/* Convert endian of the buffer if required. */
-void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
-/* Unmap and try to map the tdb. */
-void tdb_munmap(struct tdb_file *file);
-enum TDB_ERROR tdb_mmap(struct tdb_context *tdb);
-/* Either alloc a copy, or give direct access. Release frees or noop. */
-const void *tdb_access_read(struct tdb_context *tdb,
- tdb_off_t off, tdb_len_t len, bool convert);
-void *tdb_access_write(struct tdb_context *tdb,
- tdb_off_t off, tdb_len_t len, bool convert);
-/* Release result of tdb_access_read/write. */
-void tdb_access_release(struct tdb_context *tdb, const void *p);
-/* Commit result of tdb_acces_write. */
-enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p);
-/* Convenience routine to get an offset. */
-tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off);
-/* Write an offset at an offset. */
-enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, tdb_off_t off,
- tdb_off_t val);
-/* Clear an ondisk area. */
-enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len);
-/* Return a non-zero offset between >= start < end in this array (or end). */
-tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb,
- tdb_off_t base,
- uint64_t start,
- uint64_t end);
-/* Return a zero offset in this array, or num. */
-tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
- uint64_t num);
-/* Allocate and make a copy of some offset. */
-void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
-/* Writes a converted copy of a record. */
-enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
- const void *rec, size_t len);
-/* Reads record and converts it */
-enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
- void *rec, size_t len);
-/* Bump the seqnum (caller checks for tdb->flags & TDB_SEQNUM) */
-void tdb_inc_seqnum(struct tdb_context *tdb);
-/* lock.c: */
-/* Print message because another tdb owns a lock we want. */
-enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call);
-/* If we fork, we no longer really own locks. */
-bool check_lock_pid(struct tdb_context *tdb, const char *call, bool log);
-/* Lock/unlock a range of hashes. */
-enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
- tdb_off_t hash_lock, tdb_len_t hash_range,
- int ltype, enum tdb_lock_flags waitflag);
-enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
- tdb_off_t hash_lock,
- tdb_len_t hash_range, int ltype);
-/* For closing the file. */
-void tdb_lock_cleanup(struct tdb_context *tdb);
-/* Lock/unlock a particular free bucket. */
-enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
- enum tdb_lock_flags waitflag);
-void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off);
-/* Serialize transaction start. */
-enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype);
-void tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
-/* Do we have any hash locks (ie. via tdb_chainlock) ? */
-bool tdb_has_hash_locks(struct tdb_context *tdb);
-/* Lock entire database. */
-enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
- enum tdb_lock_flags flags, bool upgradable);
-void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
-enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb, off_t start);
-/* Serialize db open. */
-enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
- int ltype, enum tdb_lock_flags flags);
-void tdb_unlock_open(struct tdb_context *tdb, int ltype);
-bool tdb_has_open_lock(struct tdb_context *tdb);
-/* Serialize db expand. */
-enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype);
-void tdb_unlock_expand(struct tdb_context *tdb, int ltype);
-bool tdb_has_expansion_lock(struct tdb_context *tdb);
-/* If it needs recovery, grab all the locks and do it. */
-enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb);
-/* Default lock and unlock functions. */
-int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *);
-int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *);
-/* transaction.c: */
-enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb);
-tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb);
-struct tdb_context {
- /* Single list of all TDBs, to detect multiple opens. */
- struct tdb_context *next;
- /* Filename of the database. */
- const char *name;
- /* Logging function */
- void (*log_fn)(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message,
- void *data);
- void *log_data;
- /* Open flags passed to tdb_open. */
- int open_flags;
- /* low level (fnctl) lock functions. */
- int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *);
- int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *);
- void *lock_data;
- /* the tdb flags passed to tdb_open. */
- uint32_t flags;
- /* Our statistics. */
- struct tdb_attribute_stats stats;
- /* The actual file information */
- struct tdb_file *file;
- /* Hash function. */
- uint64_t (*hash_fn)(const void *key, size_t len, uint64_t seed, void *);
- void *hash_data;
- uint64_t hash_seed;
- /* Our open hook, if any. */
- enum TDB_ERROR (*openhook)(int fd, void *data);
- void *openhook_data;
- /* Last error we returned. */
- enum TDB_ERROR last_error;
- /* Are we accessing directly? (debugging check). */
- int direct_access;
- /* Set if we are in a transaction. */
- struct tdb_transaction *transaction;
- /* What free table are we using? */
- tdb_off_t ftable_off;
- unsigned int ftable;
- /* IO methods: changes for transactions. */
- const struct tdb_methods *io;
- /* Direct access information */
- struct tdb_access_hdr *access;
-/* tdb.c: */
- tdb_logerr(struct tdb_context *tdb,
- enum TDB_ERROR ecode,
- enum tdb_log_level level,
- const char *fmt, ...);
-#ifdef TDB_TRACE
-void tdb_trace(struct tdb_context *tdb, const char *op);
-void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
-void tdb_trace_open(struct tdb_context *tdb, const char *op,
- unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
-void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
-void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
-void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
- TDB_DATA rec);
-void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
- TDB_DATA rec, int ret);
-void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
- TDB_DATA rec, TDB_DATA ret);
-void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
- TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
- int ret);
-void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
- TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
-#define tdb_trace(tdb, op)
-#define tdb_trace_seqnum(tdb, seqnum, op)
-#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
-#define tdb_trace_ret(tdb, op, ret)
-#define tdb_trace_retrec(tdb, op, ret)
-#define tdb_trace_1rec(tdb, op, rec)
-#define tdb_trace_1rec_ret(tdb, op, rec, ret)
-#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
-#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
-#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
-#endif /* !TDB_TRACE */
diff --git a/lib/tdb2/pytdb.c b/lib/tdb2/pytdb.c
deleted file mode 100644
index 1fa4e5828b..0000000000
--- a/lib/tdb2/pytdb.c
+++ /dev/null
@@ -1,591 +0,0 @@
- Unix SMB/CIFS implementation.
- Python interface to tdb2. Simply modified from tdb1 version.
- Copyright (C) 2004-2006 Tim Potter <>
- Copyright (C) 2007-2008 Jelmer Vernooij <>
- Copyright (C) 2011 Rusty Russell <>
- ** NOTE! The following LGPL license applies to the tdb
- ** library. This does NOT imply that all of Samba is released
- ** under the LGPL
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include <Python.h>
-#include "replace.h"
-#include "system/filesys.h"
-#ifndef Py_RETURN_NONE
-#define Py_RETURN_NONE return Py_INCREF(Py_None), Py_None
-/* Include tdb headers */
-#include <tdb2.h>
-typedef struct {
- PyObject_HEAD
- struct tdb_context *ctx;
- bool closed;
-} PyTdbObject;
-staticforward PyTypeObject PyTdb;
-static void PyErr_SetTDBError(enum TDB_ERROR e)
- PyErr_SetObject(PyExc_RuntimeError,
- Py_BuildValue("(i,s)", e, tdb_errorstr(e)));
-static TDB_DATA PyString_AsTDB_DATA(PyObject *data)
- TDB_DATA ret;
- ret.dptr = (unsigned char *)PyString_AsString(data);
- ret.dsize = PyString_Size(data);
- return ret;
-static PyObject *PyString_FromTDB_DATA(TDB_DATA data)
- PyObject *ret = PyString_FromStringAndSize((const char *)data.dptr,
- data.dsize);
- free(data.dptr);
- return ret;
-#define PyErr_TDB_ERROR_IS_ERR_RAISE(ret) \
- if (ret != TDB_SUCCESS) { \
- PyErr_SetTDBError(ret); \
- return NULL; \
- }
-static void stderr_log(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message,
- void *data)
- fprintf(stderr, "%s:%s:%s\n",
- tdb_name(tdb), tdb_errorstr(ecode), message);
-static PyObject *py_tdb_open(PyTypeObject *type, PyObject *args, PyObject *kwargs)
- char *name = NULL;
- int tdb_flags = TDB_DEFAULT, flags = O_RDWR, mode = 0600;
- struct tdb_context *ctx;
- PyTdbObject *ret;
- union tdb_attribute logattr;
- const char *kwnames[] = { "name", "tdb_flags", "flags", "mode", NULL };
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|siii", cast_const2(char **, kwnames), &name, &tdb_flags, &flags, &mode))
- return NULL;
- if (name == NULL) {
- tdb_flags |= TDB_INTERNAL;
- }
- logattr.log.base.attr = TDB_ATTRIBUTE_LOG;
- = NULL;
- logattr.log.fn = stderr_log;
- ctx = tdb_open(name, tdb_flags, flags, mode, &logattr);
- if (ctx == NULL) {
- PyErr_SetFromErrno(PyExc_IOError);
- return NULL;
- }
- ret = PyObject_New(PyTdbObject, &PyTdb);
- if (!ret) {
- tdb_close(ctx);
- return NULL;
- }
- ret->ctx = ctx;
- ret->closed = false;
- return (PyObject *)ret;
-static PyObject *obj_transaction_cancel(PyTdbObject *self)
- tdb_transaction_cancel(self->ctx);
-static PyObject *obj_transaction_commit(PyTdbObject *self)
- enum TDB_ERROR ret = tdb_transaction_commit(self->ctx);
-static PyObject *obj_transaction_prepare_commit(PyTdbObject *self)
- enum TDB_ERROR ret = tdb_transaction_prepare_commit(self->ctx);
-static PyObject *obj_transaction_start(PyTdbObject *self)
- enum TDB_ERROR ret = tdb_transaction_start(self->ctx);
-static PyObject *obj_lockall(PyTdbObject *self)
- enum TDB_ERROR ret = tdb_lockall(self->ctx);
-static PyObject *obj_unlockall(PyTdbObject *self)
- tdb_unlockall(self->ctx);
-static PyObject *obj_lockall_read(PyTdbObject *self)
- enum TDB_ERROR ret = tdb_lockall_read(self->ctx);
-static PyObject *obj_unlockall_read(PyTdbObject *self)
- tdb_unlockall_read(self->ctx);
-static PyObject *obj_close(PyTdbObject *self)
- int ret;
- if (self->closed)
- ret = tdb_close(self->ctx);
- self->closed = true;
- if (ret != 0) {
- PyErr_SetTDBError(TDB_ERR_IO);
- return NULL;
- }
-static PyObject *obj_get(PyTdbObject *self, PyObject *args)
- TDB_DATA key, data;
- PyObject *py_key;
- enum TDB_ERROR ret;
- if (!PyArg_ParseTuple(args, "O", &py_key))
- return NULL;
- key = PyString_AsTDB_DATA(py_key);
- ret = tdb_fetch(self->ctx, key, &data);
- if (ret == TDB_ERR_NOEXIST)
- return PyString_FromTDB_DATA(data);
-static PyObject *obj_append(PyTdbObject *self, PyObject *args)
- TDB_DATA key, data;
- PyObject *py_key, *py_data;
- enum TDB_ERROR ret;
- if (!PyArg_ParseTuple(args, "OO", &py_key, &py_data))
- return NULL;
- key = PyString_AsTDB_DATA(py_key);
- data = PyString_AsTDB_DATA(py_data);
- ret = tdb_append(self->ctx, key, data);
-static PyObject *obj_firstkey(PyTdbObject *self)
- enum TDB_ERROR ret;
- TDB_DATA key;
- ret = tdb_firstkey(self->ctx, &key);
- if (ret == TDB_ERR_NOEXIST)
- return PyString_FromTDB_DATA(key);
-static PyObject *obj_nextkey(PyTdbObject *self, PyObject *args)
- TDB_DATA key;
- PyObject *py_key;
- enum TDB_ERROR ret;
- if (!PyArg_ParseTuple(args, "O", &py_key))
- return NULL;
- /* Malloc here, since tdb_nextkey frees. */
- key.dsize = PyString_Size(py_key);
- key.dptr = malloc(key.dsize);
- memcpy(key.dptr, PyString_AsString(py_key), key.dsize);
- ret = tdb_nextkey(self->ctx, &key);
- if (ret == TDB_ERR_NOEXIST)
- return PyString_FromTDB_DATA(key);
-static PyObject *obj_delete(PyTdbObject *self, PyObject *args)
- TDB_DATA key;
- PyObject *py_key;
- enum TDB_ERROR ret;
- if (!PyArg_ParseTuple(args, "O", &py_key))
- return NULL;
- key = PyString_AsTDB_DATA(py_key);
- ret = tdb_delete(self->ctx, key);
-static PyObject *obj_has_key(PyTdbObject *self, PyObject *args)
- TDB_DATA key;
- PyObject *py_key;
- if (!PyArg_ParseTuple(args, "O", &py_key))
- return NULL;
- key = PyString_AsTDB_DATA(py_key);
- if (tdb_exists(self->ctx, key))
- return Py_True;
- if (tdb_error(self->ctx) != TDB_ERR_NOEXIST)
- PyErr_TDB_ERROR_IS_ERR_RAISE(tdb_error(self->ctx));
- return Py_False;
-static PyObject *obj_store(PyTdbObject *self, PyObject *args)
- TDB_DATA key, value;
- enum TDB_ERROR ret;
- int flag = TDB_REPLACE;
- PyObject *py_key, *py_value;
- if (!PyArg_ParseTuple(args, "OO|i", &py_key, &py_value, &flag))
- return NULL;
- key = PyString_AsTDB_DATA(py_key);
- value = PyString_AsTDB_DATA(py_value);
- ret = tdb_store(self->ctx, key, value, flag);
-static PyObject *obj_add_flag(PyTdbObject *self, PyObject *args)
- unsigned flag;
- if (!PyArg_ParseTuple(args, "I", &flag))
- return NULL;
- tdb_add_flag(self->ctx, flag);
-static PyObject *obj_remove_flag(PyTdbObject *self, PyObject *args)
- unsigned flag;
- if (!PyArg_ParseTuple(args, "I", &flag))
- return NULL;
- tdb_remove_flag(self->ctx, flag);
-typedef struct {
- PyObject_HEAD
- TDB_DATA current;
- bool end;
- PyTdbObject *iteratee;
-} PyTdbIteratorObject;
-static PyObject *tdb_iter_next(PyTdbIteratorObject *self)
- enum TDB_ERROR e;
- PyObject *ret;
- if (self->end)
- return NULL;
- ret = PyString_FromStringAndSize((const char *)self->current.dptr,
- self->current.dsize);
- e = tdb_nextkey(self->iteratee->ctx, &self->current);
- if (e == TDB_ERR_NOEXIST)
- self->end = true;
- else
- return ret;
-static void tdb_iter_dealloc(PyTdbIteratorObject *self)
- Py_DECREF(self->iteratee);
- PyObject_Del(self);
-PyTypeObject PyTdbIterator = {
- .tp_name = "Iterator",
- .tp_basicsize = sizeof(PyTdbIteratorObject),
- .tp_iternext = (iternextfunc)tdb_iter_next,
- .tp_dealloc = (destructor)tdb_iter_dealloc,
- .tp_flags = Py_TPFLAGS_DEFAULT,
- .tp_iter = PyObject_SelfIter,
-static PyObject *tdb_object_iter(PyTdbObject *self)
- PyTdbIteratorObject *ret;
- enum TDB_ERROR e;
- ret = PyObject_New(PyTdbIteratorObject, &PyTdbIterator);
- if (!ret)
- return NULL;
- e = tdb_firstkey(self->ctx, &ret->current);
- if (e == TDB_ERR_NOEXIST) {
- ret->end = true;
- } else {
- ret->end = false;
- }
- ret->iteratee = self;
- Py_INCREF(self);
- return (PyObject *)ret;
-static PyObject *obj_clear(PyTdbObject *self)
- enum TDB_ERROR ret = tdb_wipe_all(self->ctx);
-static PyObject *obj_enable_seqnum(PyTdbObject *self)
- tdb_add_flag(self->ctx, TDB_SEQNUM);
-static PyMethodDef tdb_object_methods[] = {
- { "transaction_cancel", (PyCFunction)obj_transaction_cancel, METH_NOARGS,
- "S.transaction_cancel() -> None\n"
- "Cancel the currently active transaction." },
- { "transaction_commit", (PyCFunction)obj_transaction_commit, METH_NOARGS,
- "S.transaction_commit() -> None\n"
- "Commit the currently active transaction." },
- { "transaction_prepare_commit", (PyCFunction)obj_transaction_prepare_commit, METH_NOARGS,
- "S.transaction_prepare_commit() -> None\n"
- "Prepare to commit the currently active transaction" },
- { "transaction_start", (PyCFunction)obj_transaction_start, METH_NOARGS,
- "S.transaction_start() -> None\n"
- "Start a new transaction." },
- { "lock_all", (PyCFunction)obj_lockall, METH_NOARGS, NULL },
- { "unlock_all", (PyCFunction)obj_unlockall, METH_NOARGS, NULL },
- { "read_lock_all", (PyCFunction)obj_lockall_read, METH_NOARGS, NULL },
- { "read_unlock_all", (PyCFunction)obj_unlockall_read, METH_NOARGS, NULL },
- { "close", (PyCFunction)obj_close, METH_NOARGS, NULL },
- { "get", (PyCFunction)obj_get, METH_VARARGS, "S.get(key) -> value\n"
- "Fetch a value." },
- { "append", (PyCFunction)obj_append, METH_VARARGS, "S.append(key, value) -> None\n"
- "Append data to an existing key." },
- { "firstkey", (PyCFunction)obj_firstkey, METH_NOARGS, "S.firstkey() -> data\n"
- "Return the first key in this database." },
- { "nextkey", (PyCFunction)obj_nextkey, METH_NOARGS, "S.nextkey(key) -> data\n"
- "Return the next key in this database." },
- { "delete", (PyCFunction)obj_delete, METH_VARARGS, "S.delete(key) -> None\n"
- "Delete an entry." },
- { "has_key", (PyCFunction)obj_has_key, METH_VARARGS, "S.has_key(key) -> None\n"
- "Check whether key exists in this database." },
- { "store", (PyCFunction)obj_store, METH_VARARGS, ", data, flag=REPLACE) -> None"
- "Store data." },
- { "add_flag", (PyCFunction)obj_add_flag, METH_VARARGS, "S.add_flag(flag) -> None" },
- { "remove_flag", (PyCFunction)obj_remove_flag, METH_VARARGS, "S.remove_flag(flag) -> None" },
- { "iterkeys", (PyCFunction)tdb_object_iter, METH_NOARGS, "S.iterkeys() -> iterator" },
- { "clear", (PyCFunction)obj_clear, METH_NOARGS, "S.clear() -> None\n"
- "Wipe the entire database." },
- { "enable_seqnum", (PyCFunction)obj_enable_seqnum, METH_NOARGS,
- "S.enable_seqnum() -> None" },
- { NULL }
-static PyObject *obj_get_flags(PyTdbObject *self, void *closure)
- return PyInt_FromLong(tdb_get_flags(self->ctx));
-static PyObject *obj_get_filename(PyTdbObject *self, void *closure)
- return PyString_FromString(tdb_name(self->ctx));
-static PyObject *obj_get_seqnum(PyTdbObject *self, void *closure)
- return PyInt_FromLong(tdb_get_seqnum(self->ctx));
-static PyGetSetDef tdb_object_getsetters[] = {
- { cast_const(char *, "flags"), (getter)obj_get_flags, NULL, NULL },
- { cast_const(char *, "filename"), (getter)obj_get_filename, NULL,
- cast_const(char *, "The filename of this TDB file.")},
- { cast_const(char *, "seqnum"), (getter)obj_get_seqnum, NULL, NULL },
- { NULL }
-static PyObject *tdb_object_repr(PyTdbObject *self)
- if (tdb_get_flags(self->ctx) & TDB_INTERNAL) {
- return PyString_FromString("Tdb(<internal>)");
- } else {
- return PyString_FromFormat("Tdb('%s')", tdb_name(self->ctx));
- }
-static void tdb_object_dealloc(PyTdbObject *self)
- if (!self->closed)
- tdb_close(self->ctx);
- self->ob_type->tp_free(self);
-static PyObject *obj_getitem(PyTdbObject *self, PyObject *key)
- TDB_DATA tkey, val;
- enum TDB_ERROR ret;
- if (!PyString_Check(key)) {
- PyErr_SetString(PyExc_TypeError, "Expected string as key");
- return NULL;
- }
- tkey.dptr = (unsigned char *)PyString_AsString(key);
- tkey.dsize = PyString_Size(key);
- ret = tdb_fetch(self->ctx, tkey, &val);
- if (ret == TDB_ERR_NOEXIST) {
- PyErr_SetString(PyExc_KeyError, "No such TDB entry");
- return NULL;
- } else {
- return PyString_FromTDB_DATA(val);
- }
-static int obj_setitem(PyTdbObject *self, PyObject *key, PyObject *value)
- TDB_DATA tkey, tval;
- enum TDB_ERROR ret;
- if (!PyString_Check(key)) {
- PyErr_SetString(PyExc_TypeError, "Expected string as key");
- return -1;
- }
- tkey = PyString_AsTDB_DATA(key);
- if (value == NULL) {
- ret = tdb_delete(self->ctx, tkey);
- } else {
- if (!PyString_Check(value)) {
- PyErr_SetString(PyExc_TypeError, "Expected string as value");
- return -1;
- }
- tval = PyString_AsTDB_DATA(value);
- ret = tdb_store(self->ctx, tkey, tval, TDB_REPLACE);
- }
- if (ret != TDB_SUCCESS) {
- PyErr_SetTDBError(ret);
- return -1;
- }
- return ret;
-static PyMappingMethods tdb_object_mapping = {
- .mp_subscript = (binaryfunc)obj_getitem,
- .mp_ass_subscript = (objobjargproc)obj_setitem,
-static PyTypeObject PyTdb = {
- .tp_name = "tdb.Tdb",
- .tp_basicsize = sizeof(PyTdbObject),
- .tp_methods = tdb_object_methods,
- .tp_getset = tdb_object_getsetters,
- .tp_new = py_tdb_open,
- .tp_doc = "A TDB file",
- .tp_repr = (reprfunc)tdb_object_repr,
- .tp_dealloc = (destructor)tdb_object_dealloc,
- .tp_as_mapping = &tdb_object_mapping,
- .tp_iter = (getiterfunc)tdb_object_iter,
-static PyMethodDef tdb_methods[] = {
- { "open", (PyCFunction)py_tdb_open, METH_VARARGS|METH_KEYWORDS, "open(name, hash_size=0, tdb_flags=TDB_DEFAULT, flags=O_RDWR, mode=0600)\n"
- "Open a TDB file." },
- { NULL }
-void inittdb(void);
-void inittdb(void)
- PyObject *m;
- if (PyType_Ready(&PyTdb) < 0)
- return;
- if (PyType_Ready(&PyTdbIterator) < 0)
- return;
- m = Py_InitModule3("tdb", tdb_methods, "TDB is a simple key-value database similar to GDBM that supports multiple writers.");
- if (m == NULL)
- return;
- PyModule_AddObject(m, "REPLACE", PyInt_FromLong(TDB_REPLACE));
- PyModule_AddObject(m, "INSERT", PyInt_FromLong(TDB_INSERT));
- PyModule_AddObject(m, "MODIFY", PyInt_FromLong(TDB_MODIFY));
- PyModule_AddObject(m, "DEFAULT", PyInt_FromLong(TDB_DEFAULT));
- PyModule_AddObject(m, "INTERNAL", PyInt_FromLong(TDB_INTERNAL));
- PyModule_AddObject(m, "NOLOCK", PyInt_FromLong(TDB_NOLOCK));
- PyModule_AddObject(m, "NOMMAP", PyInt_FromLong(TDB_NOMMAP));
- PyModule_AddObject(m, "CONVERT", PyInt_FromLong(TDB_CONVERT));
- PyModule_AddObject(m, "NOSYNC", PyInt_FromLong(TDB_NOSYNC));
- PyModule_AddObject(m, "SEQNUM", PyInt_FromLong(TDB_SEQNUM));
- PyModule_AddObject(m, "ALLOW_NESTING", PyInt_FromLong(TDB_ALLOW_NESTING));
- PyModule_AddObject(m, "__docformat__", PyString_FromString("restructuredText"));
- PyModule_AddObject(m, "__version__", PyString_FromString(PACKAGE_VERSION));
- Py_INCREF(&PyTdb);
- PyModule_AddObject(m, "Tdb", (PyObject *)&PyTdb);
- Py_INCREF(&PyTdbIterator);
diff --git a/lib/tdb2/summary.c b/lib/tdb2/summary.c
deleted file mode 100644
index c7e93284e0..0000000000
--- a/lib/tdb2/summary.c
+++ /dev/null
@@ -1,330 +0,0 @@
- /*
- Trivial Database 2: human-readable summary code
- Copyright (C) Rusty Russell 2010
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include "private.h"
-#include <assert.h>
-#include <ccan/tally/tally.h>
- "Size of file/data: %zu/%zu\n" \
- "Number of records: %zu\n" \
- "Smallest/average/largest keys: %zu/%zu/%zu\n%s" \
- "Smallest/average/largest data: %zu/%zu/%zu\n%s" \
- "Smallest/average/largest padding: %zu/%zu/%zu\n%s" \
- "Number of free records: %zu\n" \
- "Smallest/average/largest free records: %zu/%zu/%zu\n%s" \
- "Number of uncoalesced records: %zu\n" \
- "Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
- "Toplevel hash used: %u of %u\n" \
- "Number of chains: %zu\n" \
- "Number of subhashes: %zu\n" \
- "Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
- "Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
- "Free bucket %zu: total entries %zu.\n" \
- "Smallest/average/largest length: %zu/%zu/%zu\n%s"
- "Free bucket %zu-%zu: total entries %zu.\n" \
- "Smallest/average/largest length: %zu/%zu/%zu\n%s"
- "Capability %llu%s\n"
-#define HISTO_WIDTH 70
-#define HISTO_HEIGHT 20
-static tdb_off_t count_hash(struct tdb_context *tdb,
- tdb_off_t hash_off, unsigned bits)
- const tdb_off_t *h;
- tdb_off_t count = 0;
- unsigned int i;
- h = tdb_access_read(tdb, hash_off, sizeof(*h) << bits, true);
- if (TDB_PTR_IS_ERR(h)) {
- return TDB_ERR_TO_OFF(TDB_PTR_ERR(h));
- }
- for (i = 0; i < (1 << bits); i++)
- count += (h[i] != 0);
- tdb_access_release(tdb, h);
- return count;
-static enum TDB_ERROR summarize(struct tdb_context *tdb,
- struct tally *hashes,
- struct tally *ftables,
- struct tally *fr,
- struct tally *keys,
- struct tally *data,
- struct tally *extra,
- struct tally *uncoal,
- struct tally *chains,
- size_t *num_caps)
- tdb_off_t off;
- tdb_len_t len;
- tdb_len_t unc = 0;
- for (off = sizeof(struct tdb_header);
- off < tdb->file->map_size;
- off += len) {
- const union {
- struct tdb_used_record u;
- struct tdb_free_record f;
- struct tdb_recovery_record r;
- } *p;
- /* We might not be able to get the whole thing. */
- p = tdb_access_read(tdb, off, sizeof(p->f), true);
- if (TDB_PTR_IS_ERR(p)) {
- return TDB_PTR_ERR(p);
- }
- if (frec_magic(&p->f) != TDB_FREE_MAGIC) {
- if (unc > 1) {
- tally_add(uncoal, unc);
- unc = 0;
- }
- }
- if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
- || p->r.magic == TDB_RECOVERY_MAGIC) {
- len = sizeof(p->r) + p->r.max_len;
- } else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
- len = frec_len(&p->f);
- tally_add(fr, len);
- len += sizeof(p->u);
- unc++;
- } else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
- len = sizeof(p->u)
- + rec_key_length(&p->u)
- + rec_data_length(&p->u)
- + rec_extra_padding(&p->u);
- tally_add(keys, rec_key_length(&p->u));
- tally_add(data, rec_data_length(&p->u));
- tally_add(extra, rec_extra_padding(&p->u));
- } else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
- tdb_off_t count = count_hash(tdb,
- off + sizeof(p->u),
- if (TDB_OFF_IS_ERR(count)) {
- return TDB_OFF_TO_ERR(count);
- }
- tally_add(hashes, count);
- tally_add(extra, rec_extra_padding(&p->u));
- len = sizeof(p->u)
- + rec_data_length(&p->u)
- + rec_extra_padding(&p->u);
- } else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
- len = sizeof(p->u)
- + rec_data_length(&p->u)
- + rec_extra_padding(&p->u);
- tally_add(ftables, rec_data_length(&p->u));
- tally_add(extra, rec_extra_padding(&p->u));
- } else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
- len = sizeof(p->u)
- + rec_data_length(&p->u)
- + rec_extra_padding(&p->u);
- tally_add(chains, 1);
- tally_add(extra, rec_extra_padding(&p->u));
- } else if (rec_magic(&p->u) == TDB_CAP_MAGIC) {
- len = sizeof(p->u)
- + rec_data_length(&p->u)
- + rec_extra_padding(&p->u);
- (*num_caps)++;
- } else {
- len = dead_space(tdb, off);
- if (TDB_OFF_IS_ERR(len)) {
- return TDB_OFF_TO_ERR(len);
- }
- }
- tdb_access_release(tdb, p);
- }
- if (unc)
- tally_add(uncoal, unc);
- return TDB_SUCCESS;
-static void add_capabilities(struct tdb_context *tdb, char *summary)
- tdb_off_t off, next;
- const struct tdb_capability *cap;
- size_t count = 0;
- /* Append to summary. */
- summary += strlen(summary);
- off = tdb_read_off(tdb, offsetof(struct tdb_header, capabilities));
- if (TDB_OFF_IS_ERR(off))
- return;
- /* Walk capability list. */
- for (; off; off = next) {
- cap = tdb_access_read(tdb, off, sizeof(*cap), true);
- if (TDB_PTR_IS_ERR(cap)) {
- break;
- }
- count++;
- sprintf(summary, CAPABILITY_FORMAT,
- cap->type & TDB_CAP_TYPE_MASK,
- /* Noopen? How did we get here? */
- (cap->type & TDB_CAP_NOOPEN) ? " (unopenable)"
- : ((cap->type & TDB_CAP_NOWRITE)
- && (cap->type & TDB_CAP_NOCHECK)) ? " (uncheckable,read-only)"
- : (cap->type & TDB_CAP_NOWRITE) ? " (read-only)"
- : (cap->type & TDB_CAP_NOCHECK) ? " (uncheckable)"
- : "");
- summary += strlen(summary);
- next = cap->next;
- tdb_access_release(tdb, cap);
- }
-_PUBLIC_ enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
- enum tdb_summary_flags flags,
- char **summary)
- tdb_len_t len;
- size_t num_caps = 0;
- struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
- *chains;
- char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg;
- enum TDB_ERROR ecode;
- hashesg = freeg = keysg = datag = extrag = uncoalg = NULL;
- ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
- if (ecode != TDB_SUCCESS) {
- return tdb->last_error = ecode;
- }
- ecode = tdb_lock_expand(tdb, F_RDLCK);
- if (ecode != TDB_SUCCESS) {
- tdb_allrecord_unlock(tdb, F_RDLCK);
- return tdb->last_error = ecode;
- }
- /* Start stats off empty. */
- ftables = tally_new(HISTO_HEIGHT);
- hashes = tally_new(HISTO_HEIGHT);
- freet = tally_new(HISTO_HEIGHT);
- keys = tally_new(HISTO_HEIGHT);
- data = tally_new(HISTO_HEIGHT);
- extra = tally_new(HISTO_HEIGHT);
- uncoal = tally_new(HISTO_HEIGHT);
- chains = tally_new(HISTO_HEIGHT);
- if (!ftables || !hashes || !freet || !keys || !data || !extra
- || !uncoal || !chains) {
- ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_summary: failed to allocate"
- " tally structures");
- goto unlock;
- }
- ecode = summarize(tdb, hashes, ftables, freet, keys, data, extra,
- uncoal, chains, &num_caps);
- if (ecode != TDB_SUCCESS) {
- goto unlock;
- }
- hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT);
- freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT);
- keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT);
- datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT);
- extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT);
- uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT);
- }
- /* 20 is max length of a %llu. */
- len = strlen(SUMMARY_FORMAT) + 33*20 + 1
- + (hashesg ? strlen(hashesg) : 0)
- + (freeg ? strlen(freeg) : 0)
- + (keysg ? strlen(keysg) : 0)
- + (datag ? strlen(datag) : 0)
- + (extrag ? strlen(extrag) : 0)
- + (uncoalg ? strlen(uncoalg) : 0)
- + num_caps * (strlen(CAPABILITY_FORMAT) + 20
- + strlen(" (uncheckable,read-only)"));
- *summary = malloc(len);
- if (!*summary) {
- ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_summary: failed to allocate string");
- goto unlock;
- }
- sprintf(*summary, SUMMARY_FORMAT,
- (size_t)tdb->file->map_size,
- tally_total(keys, NULL) + tally_total(data, NULL),
- tally_num(keys),
- tally_min(keys), tally_mean(keys), tally_max(keys),
- keysg ? keysg : "",
- tally_min(data), tally_mean(data), tally_max(data),
- datag ? datag : "",
- tally_min(extra), tally_mean(extra), tally_max(extra),
- extrag ? extrag : "",
- tally_num(freet),
- tally_min(freet), tally_mean(freet), tally_max(freet),
- freeg ? freeg : "",
- tally_total(uncoal, NULL),
- tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal),
- uncoalg ? uncoalg : "",
- (unsigned)count_hash(tdb, offsetof(struct tdb_header,
- hashtable),
- tally_num(chains),
- tally_num(hashes),
- tally_min(hashes), tally_mean(hashes), tally_max(hashes),
- hashesg ? hashesg : "",
- tally_total(keys, NULL) * 100.0 / tdb->file->map_size,
- tally_total(data, NULL) * 100.0 / tdb->file->map_size,
- tally_total(extra, NULL) * 100.0 / tdb->file->map_size,
- tally_total(freet, NULL) * 100.0 / tdb->file->map_size,
- (tally_num(keys) + tally_num(freet) + tally_num(hashes))
- * sizeof(struct tdb_used_record) * 100.0 / tdb->file->map_size,
- tally_num(ftables) * sizeof(struct tdb_freetable)
- * 100.0 / tdb->file->map_size,
- (tally_num(hashes)
- * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
- + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
- + sizeof(struct tdb_chain) * tally_num(chains))
- * 100.0 / tdb->file->map_size);
- add_capabilities(tdb, *summary);
- free(hashesg);
- free(freeg);
- free(keysg);
- free(datag);
- free(extrag);
- free(uncoalg);
- free(hashes);
- free(freet);
- free(keys);
- free(data);
- free(extra);
- free(uncoal);
- free(ftables);
- free(chains);
- tdb_allrecord_unlock(tdb, F_RDLCK);
- tdb_unlock_expand(tdb, F_RDLCK);
- return tdb->last_error = ecode;
diff --git a/lib/tdb2/tdb.c b/lib/tdb2/tdb.c
deleted file mode 100644
index 5257aa17e3..0000000000
--- a/lib/tdb2/tdb.c
+++ /dev/null
@@ -1,605 +0,0 @@
- /*
- Trivial Database 2: fetch, store and misc routines.
- Copyright (C) Rusty Russell 2010
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-#include "private.h"
-#include <ccan/asprintf/asprintf.h>
-#include <stdarg.h>
-static enum TDB_ERROR update_rec_hdr(struct tdb_context *tdb,
- tdb_off_t off,
- tdb_len_t keylen,
- tdb_len_t datalen,
- struct tdb_used_record *rec,
- uint64_t h)
- uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
- enum TDB_ERROR ecode;
- ecode = set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
- keylen + dataroom, h);
- if (ecode == TDB_SUCCESS) {
- ecode = tdb_write_convert(tdb, off, rec, sizeof(*rec));
- }
- return ecode;
-static enum TDB_ERROR replace_data(struct tdb_context *tdb,
- struct hash_info *h,
- struct tdb_data key, struct tdb_data dbuf,
- tdb_off_t old_off, tdb_len_t old_room,
- bool growing)
- tdb_off_t new_off;
- enum TDB_ERROR ecode;
- /* Allocate a new record. */
- new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
- growing);
- if (TDB_OFF_IS_ERR(new_off)) {
- return TDB_OFF_TO_ERR(new_off);
- }
- /* We didn't like the existing one: remove it. */
- if (old_off) {
- tdb->stats.frees++;
- ecode = add_free_record(tdb, old_off,
- sizeof(struct tdb_used_record)
- + key.dsize + old_room,
- TDB_LOCK_WAIT, true);
- if (ecode == TDB_SUCCESS)
- ecode = replace_in_hash(tdb, h, new_off);
- } else {
- ecode = add_to_hash(tdb, h, new_off);
- }
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- new_off += sizeof(struct tdb_used_record);
- ecode = tdb->io->twrite(tdb, new_off, key.dptr, key.dsize);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- new_off += key.dsize;
- ecode = tdb->io->twrite(tdb, new_off, dbuf.dptr, dbuf.dsize);
- if (ecode != TDB_SUCCESS) {
- return ecode;
- }
- if (tdb->flags & TDB_SEQNUM)
- tdb_inc_seqnum(tdb);
- return TDB_SUCCESS;
-static enum TDB_ERROR update_data(struct tdb_context *tdb,
- tdb_off_t off,
- struct tdb_data dbuf,
- tdb_len_t extra)
- enum TDB_ERROR ecode;
- ecode = tdb->io->twrite(tdb, off, dbuf.dptr, dbuf.dsize);
- if (ecode == TDB_SUCCESS && extra) {
- /* Put a zero in; future versions may append other data. */
- ecode = tdb->io->twrite(tdb, off + dbuf.dsize, "", 1);
- }
- if (tdb->flags & TDB_SEQNUM)
- tdb_inc_seqnum(tdb);
- return ecode;
-_PUBLIC_ enum TDB_ERROR tdb_store(struct tdb_context *tdb,
- struct tdb_data key, struct tdb_data dbuf, int flag)
- struct hash_info h;
- tdb_off_t off;
- tdb_len_t old_room = 0;
- struct tdb_used_record rec;
- enum TDB_ERROR ecode;
- off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
- if (TDB_OFF_IS_ERR(off)) {
- return tdb->last_error = TDB_OFF_TO_ERR(off);
- }
- /* Now we have lock on this hash bucket. */
- if (flag == TDB_INSERT) {
- if (off) {
- ecode = TDB_ERR_EXISTS;
- goto out;
- }
- } else {
- if (off) {
- old_room = rec_data_length(&rec)
- + rec_extra_padding(&rec);
- if (old_room >= dbuf.dsize) {
- /* Can modify in-place. Easy! */
- ecode = update_rec_hdr(tdb, off,
- key.dsize, dbuf.dsize,
- &rec, h.h);
- if (ecode != TDB_SUCCESS) {
- goto out;
- }
- ecode = update_data(tdb,
- off + sizeof(rec)
- + key.dsize, dbuf,
- old_room - dbuf.dsize);
- if (ecode != TDB_SUCCESS) {
- goto out;
- }
- tdb_unlock_hashes(tdb, h.hlock_start,
- h.hlock_range, F_WRLCK);
- return tdb->last_error = TDB_SUCCESS;
- }
- } else {
- if (flag == TDB_MODIFY) {
- /* if the record doesn't exist and we
- are in TDB_MODIFY mode then we should fail
- the store */
- ecode = TDB_ERR_NOEXIST;
- goto out;
- }
- }
- }
- /* If we didn't use the old record, this implies we're growing. */
- ecode = replace_data(tdb, &h, key, dbuf, off, old_room, off);
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
- return tdb->last_error = ecode;
-_PUBLIC_ enum TDB_ERROR tdb_append(struct tdb_context *tdb,
- struct tdb_data key, struct tdb_data dbuf)
- struct hash_info h;
- tdb_off_t off;
- struct tdb_used_record rec;
- tdb_len_t old_room = 0, old_dlen;
- unsigned char *newdata;
- struct tdb_data new_dbuf;
- enum TDB_ERROR ecode;
- off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
- if (TDB_OFF_IS_ERR(off)) {
- return tdb->last_error = TDB_OFF_TO_ERR(off);
- }
- if (off) {
- old_dlen = rec_data_length(&rec);
- old_room = old_dlen + rec_extra_padding(&rec);
- /* Fast path: can append in place. */
- if (rec_extra_padding(&rec) >= dbuf.dsize) {
- ecode = update_rec_hdr(tdb, off, key.dsize,
- old_dlen + dbuf.dsize, &rec,
- h.h);
- if (ecode != TDB_SUCCESS) {
- goto out;
- }
- off += sizeof(rec) + key.dsize + old_dlen;
- ecode = update_data(tdb, off, dbuf,
- rec_extra_padding(&rec));
- goto out;
- }
- /* Slow path. */
- newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
- if (!newdata) {
- ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_append:"
- " failed to allocate %zu bytes",
- (size_t)(key.dsize + old_dlen
- + dbuf.dsize));
- goto out;
- }
- ecode = tdb->io->tread(tdb, off + sizeof(rec) + key.dsize,
- newdata, old_dlen);
- if (ecode != TDB_SUCCESS) {
- goto out_free_newdata;
- }
- memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize);
- new_dbuf.dptr = newdata;
- new_dbuf.dsize = old_dlen + dbuf.dsize;
- } else {
- newdata = NULL;
- new_dbuf = dbuf;
- }
- /* If they're using tdb_append(), it implies they're growing record. */
- ecode = replace_data(tdb, &h, key, new_dbuf, off, old_room, true);
- free(newdata);
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
- return tdb->last_error = ecode;
-_PUBLIC_ enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
- struct tdb_data *data)
- tdb_off_t off;
- struct tdb_used_record rec;
- struct hash_info h;
- enum TDB_ERROR ecode;
- off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
- if (TDB_OFF_IS_ERR(off)) {
- return tdb->last_error = TDB_OFF_TO_ERR(off);
- }
- if (!off) {
- ecode = TDB_ERR_NOEXIST;
- } else {
- data->dsize = rec_data_length(&rec);
- data->dptr = tdb_alloc_read(tdb, off + sizeof(rec) + key.dsize,
- data->dsize);
- if (TDB_PTR_IS_ERR(data->dptr)) {
- ecode = TDB_PTR_ERR(data->dptr);
- } else
- ecode = TDB_SUCCESS;
- }
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
- return tdb->last_error = ecode;
-_PUBLIC_ bool tdb_exists(struct tdb_context *tdb, TDB_DATA key)
- tdb_off_t off;
- struct tdb_used_record rec;
- struct hash_info h;
- off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
- if (TDB_OFF_IS_ERR(off)) {
- tdb->last_error = TDB_OFF_TO_ERR(off);
- return false;
- }
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
- tdb->last_error = TDB_SUCCESS;
- return off ? true : false;
-_PUBLIC_ enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key)
- tdb_off_t off;
- struct tdb_used_record rec;
- struct hash_info h;
- enum TDB_ERROR ecode;
- off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
- if (TDB_OFF_IS_ERR(off)) {
- return tdb->last_error = TDB_OFF_TO_ERR(off);
- }
- if (!off) {
- ecode = TDB_ERR_NOEXIST;
- goto unlock;
- }
- ecode = delete_from_hash(tdb, &h);
- if (ecode != TDB_SUCCESS) {
- goto unlock;
- }
- /* Free the deleted entry. */
- tdb->stats.frees++;
- ecode = add_free_record(tdb, off,
- sizeof(struct tdb_used_record)
- + rec_key_length(&rec)
- + rec_data_length(&rec)
- + rec_extra_padding(&rec),
- TDB_LOCK_WAIT, true);
- if (tdb->flags & TDB_SEQNUM)
- tdb_inc_seqnum(tdb);
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
- return tdb->last_error = ecode;
-_PUBLIC_ unsigned int tdb_get_flags(struct tdb_context *tdb)
- return tdb->flags;
-static bool inside_transaction(const struct tdb_context *tdb)
- return tdb->transaction != NULL;
-static bool readonly_changable(struct tdb_context *tdb, const char *caller)
- if (inside_transaction(tdb)) {
- tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "%s: can't change"
- " TDB_RDONLY inside transaction",
- caller);
- return false;
- }
- return true;
-_PUBLIC_ void tdb_add_flag(struct tdb_context *tdb, unsigned flag)
- if (tdb->flags & TDB_INTERNAL) {
- tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_add_flag: internal db");
- return;
- }
- switch (flag) {
- case TDB_NOLOCK:
- tdb->flags |= TDB_NOLOCK;
- break;
- case TDB_NOMMAP:
- tdb->flags |= TDB_NOMMAP;
- tdb_munmap(tdb->file);
- break;
- case TDB_NOSYNC:
- tdb->flags |= TDB_NOSYNC;
- break;
- case TDB_SEQNUM:
- tdb->flags |= TDB_SEQNUM;
- break;
- tdb->flags |= TDB_ALLOW_NESTING;
- break;
- case TDB_RDONLY:
- if (readonly_changable(tdb, "tdb_add_flag"))
- tdb->flags |= TDB_RDONLY;
- break;
- default:
- tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_add_flag: Unknown flag %u",
- flag);
- }
-_PUBLIC_ void tdb_remove_flag(struct tdb_context *tdb, unsigned flag)
- if (tdb->flags & TDB_INTERNAL) {
- tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_remove_flag: internal db");
- return;
- }
- switch (flag) {
- case TDB_NOLOCK:
- tdb->flags &= ~TDB_NOLOCK;
- break;
- case TDB_NOMMAP:
- tdb->flags &= ~TDB_NOMMAP;
- /* If mmap incoherent, we were mmaping anyway. */
- tdb_mmap(tdb);
- break;
- case TDB_NOSYNC:
- tdb->flags &= ~TDB_NOSYNC;
- break;
- case TDB_SEQNUM:
- tdb->flags &= ~TDB_SEQNUM;
- break;
- tdb->flags &= ~TDB_ALLOW_NESTING;
- break;
- case TDB_RDONLY:
- if ((tdb->open_flags & O_ACCMODE) == O_RDONLY) {
- tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_remove_flag: can't"
- " remove TDB_RDONLY on tdb"
- " opened with O_RDONLY");
- break;
- }
- if (readonly_changable(tdb, "tdb_remove_flag"))
- tdb->flags &= ~TDB_RDONLY;
- break;
- default:
- tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
- "tdb_remove_flag: Unknown flag %u",
- flag);
- }
-_PUBLIC_ const char *tdb_errorstr(enum TDB_ERROR ecode)
- /* Gcc warns if you miss a case in the switch, so use that. */
- switch (TDB_ERR_TO_OFF(ecode)) {
- case TDB_ERR_TO_OFF(TDB_SUCCESS): return "Success";
- case TDB_ERR_TO_OFF(TDB_ERR_CORRUPT): return "Corrupt database";
- case TDB_ERR_TO_OFF(TDB_ERR_IO): return "IO Error";
- case TDB_ERR_TO_OFF(TDB_ERR_LOCK): return "Locking error";
- case TDB_ERR_TO_OFF(TDB_ERR_OOM): return "Out of memory";
- case TDB_ERR_TO_OFF(TDB_ERR_EXISTS): return "Record exists";
- case TDB_ERR_TO_OFF(TDB_ERR_EINVAL): return "Invalid parameter";
- case TDB_ERR_TO_OFF(TDB_ERR_NOEXIST): return "Record does not exist";
- case TDB_ERR_TO_OFF(TDB_ERR_RDONLY): return "write not permitted";
- }
- return "Invalid error code";
-_PUBLIC_ enum TDB_ERROR tdb_error(struct tdb_context *tdb)
- return tdb->last_error;
-enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb,
- enum TDB_ERROR ecode,
- enum tdb_log_level level,
- const char *fmt, ...)
- char *message;
- va_list ap;
- size_t len;
- /* tdb_open paths care about errno, so save it. */
- int saved_errno = errno;
- if (!tdb->log_fn)
- return ecode;
- va_start(ap, fmt);
- len = vasprintf(&message, fmt, ap);
- va_end(ap);
- if (len < 0) {
- tdb->log_fn(tdb, TDB_LOG_ERROR, TDB_ERR_OOM,
- "out of memory formatting message:", tdb->log_data);
- tdb->log_fn(tdb, level, ecode, fmt, tdb->log_data);
- } else {
- tdb->log_fn(tdb, level, ecode, message, tdb->log_data);
- free(message);
- }
- errno = saved_errno;
- return ecode;
-_PUBLIC_ enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
- TDB_DATA key,
- enum TDB_ERROR (*parse)(TDB_DATA k,
- void *data),
- void *data)
- tdb_off_t off;
- struct tdb_used_record rec;
- struct hash_info h;
- enum TDB_ERROR ecode;
- off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
- if (TDB_OFF_IS_ERR(off)) {
- return tdb->last_error = TDB_OFF_TO_ERR(off);
- }
- if (!off) {
- ecode = TDB_ERR_NOEXIST;
- } else {
- const void *dptr;
- dptr = tdb_access_read(tdb, off + sizeof(rec) + key.dsize,
- rec_data_length(&rec), false);
- if (TDB_PTR_IS_ERR(dptr)) {
- ecode = TDB_PTR_ERR(dptr);
- } else {
- TDB_DATA d = tdb_mkdata(dptr, rec_data_length(&rec));
- ecode = parse(key, d, data);
- tdb_access_release(tdb, dptr);
- }
- }
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
- return tdb->last_error = ecode;
-_PUBLIC_ const char *tdb_name(const struct tdb_context *tdb)
- return tdb->name;
-_PUBLIC_ int64_t tdb_get_seqnum(struct tdb_context *tdb)
- tdb_off_t off;
- off = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
- if (TDB_OFF_IS_ERR(off))
- tdb->last_error = TDB_OFF_TO_ERR(off);
- else
- tdb->last_error = TDB_SUCCESS;
- return off;
-_PUBLIC_ int tdb_fd(const struct tdb_context *tdb)
- return tdb->file->fd;
-struct traverse_state {
- enum TDB_ERROR error;
- struct tdb_context *dest_db;
- traverse function for repacking
- */
-static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
- struct traverse_state *state)
- state->error = tdb_store(state->dest_db, key, data, TDB_INSERT);
- if (state->error != TDB_SUCCESS) {
- return -1;
- }
- return 0;
-_PUBLIC_ enum TDB_ERROR tdb_repack(struct tdb_context *tdb)
- struct tdb_context *tmp_db;
- struct traverse_state state;
- state.error = tdb_transaction_start(tdb);
- if (state.error != TDB_SUCCESS) {
- return state.error;
- }
- tmp_db = tdb_open("tmpdb", TDB_INTERNAL, O_RDWR|O_CREAT, 0, NULL);
- if (tmp_db == NULL) {
- state.error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- __location__
- " Failed to create tmp_db");
- tdb_transaction_cancel(tdb);
- return tdb->last_error = state.error;
- }
- state.dest_db = tmp_db;
- if (tdb_traverse(tdb, repack_traverse, &state) < 0) {
- goto fail;
- }
- state.error = tdb_wipe_all(tdb);
- if (state.error != TDB_SUCCESS) {
- goto fail;
- }
- state.dest_db = tdb;
- if (tdb_traverse(tmp_db, repack_traverse, &state) < 0) {
- goto fail;
- }
- tdb_close(tmp_db);
- return tdb_transaction_commit(tdb);
- tdb_transaction_cancel(tdb);
- tdb_close(tmp_db);
- return state.error;
diff --git a/lib/tdb2/ b/lib/tdb2/
deleted file mode 100644
index 75e69d7363..0000000000
--- a/lib/tdb2/
+++ /dev/null
@@ -1,11 +0,0 @@
-Name: tdb
-Description: A trivial database
-Libs: @LIB_RPATH@ -L${libdir} -ltdb
-Cflags: -I${includedir}
diff --git a/lib/tdb2/tdb2.h b/lib/tdb2/tdb2.h
deleted file mode 100644
index f7aa0cc310..0000000000
--- a/lib/tdb2/tdb2.h
+++ /dev/null
@@ -1,897 +0,0 @@
-#ifndef CCAN_TDB2_H
-#define CCAN_TDB2_H
- TDB version 2: trivial database library
- Copyright (C) Andrew Tridgell 1999-2004
- Copyright (C) Rusty Russell 2010-2011
- ** NOTE! The following LGPL license applies to the tdb
- ** library. This does NOT imply that all of Samba is released
- ** under the LGPL
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-#ifdef __cplusplus
-extern "C" {
-#include <replace.h>
-#define _FILE_OFFSET_BITS 64
-/* For mode_t */
-#include <sys/types.h>
-/* For O_* flags. */
-#include <sys/stat.h>
-/* For sig_atomic_t. */
-#include <signal.h>
-/* For uint64_t */
-#include <stdint.h>
-/* For bool */
-#include <stdbool.h>
-/* For memcmp */
-#include <string.h>
-#include <ccan/compiler/compiler.h>
-#include <ccan/typesafe_cb/typesafe_cb.h>
-#include <ccan/cast/cast.h>
-#ifndef typesafe_cb_preargs
-/* Failing to have CCAN just means less typesafe protection, etc. */
-#define typesafe_cb_preargs(rtype, atype, fn, arg, ...) \
- ((rtype (*)(__VA_ARGS__, atype))(fn))
-#ifndef cast_const
-#if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
-#define cast_const(type, expr) ((type)((intptr_t)(expr)))
-#define cast_const(type, expr) ((type *)(expr))
-#endif /* !HAVE_CCAN */
-union tdb_attribute;
-struct tdb_context;
- * tdb_open - open a database file
- * @name: the file name (can be NULL if flags contains TDB_INTERNAL)
- * @tdb_flags: options for this database
- * @open_flags: flags argument for tdb's open() call.
- * @mode: mode argument for tdb's open() call.
- * @attributes: linked list of extra attributes for this tdb.
- *
- * This call opens (and potentially creates) a database file.
- * Multiple processes can have the TDB file open at once.
- *
- * On failure it will return NULL, and set errno: it may also call
- * any log attribute found in @attributes.
- *
- * See also:
- * union tdb_attribute
- */
-struct tdb_context *tdb_open(const char *name, int tdb_flags,
- int open_flags, mode_t mode,
- union tdb_attribute *attributes);
-/* flags for tdb_open() */
-#define TDB_DEFAULT 0 /* just a readability place holder */
-#define TDB_INTERNAL 2 /* don't store on disk */
-#define TDB_NOLOCK 4 /* don't do any locking */
-#define TDB_NOMMAP 8 /* don't use mmap */
-#define TDB_CONVERT 16 /* convert endian */
-#define TDB_NOSYNC 64 /* don't use synchronous transactions */
-#define TDB_SEQNUM 128 /* maintain a sequence number */
-#define TDB_ALLOW_NESTING 256 /* fake nested transactions */
-#define TDB_RDONLY 512 /* implied by O_RDONLY */
-#define TDB_CANT_CHECK 2048 /* has a feature which we don't understand */
- * tdb_close - close and free a tdb.
- * @tdb: the tdb context returned from tdb_open()
- *
- * This always succeeds, in that @tdb is unusable after this call. But if
- * some unexpected error occurred while closing, it will return non-zero
- * (the only clue as to cause will be via the log attribute).
- */
-int tdb_close(struct tdb_context *tdb);
- * struct tdb_data - representation of keys or values.
- * @dptr: the data pointer
- * @dsize: the size of the data pointed to by dptr.
- *
- * This is the "blob" representation of keys and data used by TDB.
- */
-typedef struct tdb_data {
- unsigned char *dptr;
- size_t dsize;
- * enum TDB_ERROR - error returns for TDB
- *
- * See Also:
- * tdb_errorstr()
- */
-enum TDB_ERROR {
- TDB_SUCCESS = 0, /* No error. */
- TDB_ERR_CORRUPT = -1, /* We read the db, and it was bogus. */
- TDB_ERR_IO = -2, /* We couldn't read/write the db. */
- TDB_ERR_LOCK = -3, /* Locking failed. */
- TDB_ERR_OOM = -4, /* Out of Memory. */
- TDB_ERR_EXISTS = -5, /* The key already exists. */
- TDB_ERR_NOEXIST = -6, /* The key does not exist. */
- TDB_ERR_EINVAL = -7, /* You're using it wrong. */
- TDB_ERR_RDONLY = -8, /* The database is read-only. */
- * tdb_store - store a key/value pair in a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key
- * @dbuf: the data to associate with the key.
- *
- * This inserts (or overwrites) a key/value pair in the TDB. If flag
- * is TDB_REPLACE, it doesn't matter whether the key exists or not;
- * TDB_INSERT means it must not exist (returns TDB_ERR_EXISTS otherwise),
- * and TDB_MODIFY means it must exist (returns TDB_ERR_NOEXIST otherwise).
- *
- * On success, this returns TDB_SUCCESS.
- *
- * See also:
- * tdb_fetch, tdb_transaction_start, tdb_append, tdb_delete.
- */
-enum TDB_ERROR tdb_store(struct tdb_context *tdb,
- struct tdb_data key,
- struct tdb_data dbuf,
- int flag);
-/* flags to tdb_store() */
-#define TDB_REPLACE 1 /* A readability place holder */
-#define TDB_INSERT 2 /* Don't overwrite an existing entry */
-#define TDB_MODIFY 3 /* Don't create a new entry: the key must already exist */
- * tdb_fetch - fetch a value from a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key
- * @data: pointer to data.
- *
- * This looks up a key in the database and sets it in @data.
- *
- * If it returns TDB_SUCCESS, the key was found: it is your
- * responsibility to call free() on @data->dptr.
- *
- * Otherwise, it returns an error (usually, TDB_ERR_NOEXIST) and @data is
- * undefined.
- */
-enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
- struct tdb_data *data);
- * tdb_errorstr - map the tdb error onto a constant readable string
- * @ecode: the enum TDB_ERROR to map.
- *
- * This is useful for displaying errors to users.
- */
-const char *tdb_errorstr(enum TDB_ERROR ecode);
- * tdb_append - append a value to a key/value pair in a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key
- * @dbuf: the data to append.
- *
- * This is equivalent to fetching a record, reallocating .dptr to add the
- * data, and writing it back, only it's much more efficient. If the key
- * doesn't exist, it's equivalent to tdb_store (with an additional hint that
- * you expect to expand the record in future).
- *
- * See Also:
- * tdb_fetch(), tdb_store()
- */
-enum TDB_ERROR tdb_append(struct tdb_context *tdb,
- struct tdb_data key, struct tdb_data dbuf);
- * tdb_delete - delete a key from a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to delete.
- *
- * Returns TDB_SUCCESS on success, or an error (usually TDB_ERR_NOEXIST).
- *
- * See Also:
- * tdb_fetch(), tdb_store()
- */
-enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key);
- * tdb_exists - does a key exist in the database?
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to search for.
- *
- * Returns true if it exists, or false if it doesn't or any other error.
- */
-bool tdb_exists(struct tdb_context *tdb, TDB_DATA key);
- * tdb_deq - are struct tdb_data equal?
- * @a: one struct tdb_data
- * @b: another struct tdb_data
- */
-static inline bool tdb_deq(struct tdb_data a, struct tdb_data b)
- return a.dsize == b.dsize && memcmp(a.dptr, b.dptr, a.dsize) == 0;
- * tdb_mkdata - make a struct tdb_data from const data
- * @p: the constant pointer
- * @len: the length
- *
- * As the dptr member of struct tdb_data is not constant, you need to
- * cast it. This function keeps those casts in one place, as well as
- * suppressing the warning some compilers give when casting away a
- * qualifier (eg. gcc with -Wcast-qual)
- */
-static inline struct tdb_data tdb_mkdata(const void *p, size_t len)
- struct tdb_data d;
- d.dptr = cast_const(void *, p);
- d.dsize = len;
- return d;
- * tdb_transaction_start - start a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This begins a series of atomic operations. Other processes will be able
- * to read the tdb, but not alter it (they will block), nor will they see
- * any changes until tdb_transaction_commit() is called.
- *
- * Note that if the TDB_ALLOW_NESTING flag is set, a tdb_transaction_start()
- * within a transaction will succeed, but it's not a real transaction:
- * (1) An inner transaction which is committed is not actually committed until
- * the outer transaction is; if the outer transaction is cancelled, the
- * inner ones are discarded.
- * (2) tdb_transaction_cancel() marks the outer transaction as having an error,
- * so the final tdb_transaction_commit() will fail.
- * (3) the outer transaction will see the results of the inner transaction.
- *
- * See Also:
- * tdb_transaction_cancel, tdb_transaction_commit.
- */
-enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb);
- * tdb_transaction_cancel - abandon a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This aborts a transaction, discarding any changes which were made.
- * tdb_close() does this implicitly.
- */
-void tdb_transaction_cancel(struct tdb_context *tdb);
- * tdb_transaction_commit - commit a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This completes a transaction, writing any changes which were made.
- *
- * fsync() is used to commit the transaction (unless TDB_NOSYNC is set),
- * making it robust against machine crashes, but very slow compared to
- * other TDB operations.
- *
- * A failure can only be caused by unexpected errors (eg. I/O or
- * memory); there is no point looping on transaction failure.
- *
- * See Also:
- * tdb_transaction_prepare_commit()
- */
-enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb);
- * tdb_transaction_prepare_commit - prepare to commit a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This ensures we have the resources to commit a transaction (using
- * tdb_transaction_commit): if this succeeds then a transaction will only
- * fail if the write() or fsync() calls fail.
- *
- * If this fails you must still call tdb_transaction_cancel() to cancel
- * the transaction.
- *
- * See Also:
- * tdb_transaction_commit()
- */
-enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb);
- * tdb_traverse - traverse a TDB
- * @tdb: the tdb context returned from tdb_open()
- * @fn: the function to call for every key/value pair (or NULL)
- * @p: the pointer to hand to @fn
- *
- * This walks the TDB until all the keys have been traversed, or @fn
- * returns non-zero. If the traverse function or other processes are
- * changing data or adding or deleting keys, the traverse may be
- * unreliable: keys may be skipped or (rarely) visited twice.
- *
- * There is one specific exception: the special case of deleting the
- * current key does not undermine the reliability of the traversal.
- *
- * On success, returns the number of keys iterated. On error returns
- * a negative enum TDB_ERROR value.
- */
-#define tdb_traverse(tdb, fn, p) \
- tdb_traverse_(tdb, typesafe_cb_preargs(int, void *, (fn), (p), \
- struct tdb_context *, \
-int64_t tdb_traverse_(struct tdb_context *tdb,
- int (*fn)(struct tdb_context *,
- TDB_DATA, TDB_DATA, void *), void *p);
- * tdb_parse_record - operate directly on data in the database.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key whose record we should hand to @parse
- * @parse: the function to call for the data
- * @data: the private pointer to hand to @parse (types must match).
- *
- * This avoids a copy for many cases, by handing you a pointer into
- * the memory-mapped database. It also locks the record to prevent
- * other accesses at the same time.
- *
- * Do not alter the data handed to parse()!
- */
-#define tdb_parse_record(tdb, key, parse, data) \
- tdb_parse_record_((tdb), (key), \
- typesafe_cb_preargs(enum TDB_ERROR, void *, \
- (parse), (data), \
- TDB_DATA, TDB_DATA), (data))
-enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
- TDB_DATA key,
- enum TDB_ERROR (*parse)(TDB_DATA k,
- void *data),
- void *data);
- * tdb_get_seqnum - get a database sequence number
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns a sequence number: any change to the database from a
- * tdb context opened with the TDB_SEQNUM flag will cause that number
- * to increment. Note that the incrementing is unreliable (it is done
- * without locking), so this is only useful as an optimization.
- *
- * For example, you may have a regular database backup routine which
- * does not operate if the sequence number is unchanged. In the
- * unlikely event of a failed increment, it will be backed up next
- * time anyway.
- *
- * Returns an enum TDB_ERROR (ie. negative) on error.
- */
-int64_t tdb_get_seqnum(struct tdb_context *tdb);
- * tdb_firstkey - get the "first" key in a TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: pointer to key.
- *
- * This returns an arbitrary key in the database; with tdb_nextkey() it allows
- * open-coded traversal of the database, though it is slightly less efficient
- * than tdb_traverse.
- *
- * It is your responsibility to free @key->dptr on success.
- *
- * Returns TDB_ERR_NOEXIST if the database is empty.
- */
-enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key);
- * tdb_nextkey - get the "next" key in a TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: a key returned by tdb_firstkey() or tdb_nextkey().
- *
- * This returns another key in the database; it will free @key.dptr for
- * your convenience.
- *
- * Returns TDB_ERR_NOEXIST if there are no more keys.
- */
-enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key);
- * tdb_chainlock - lock a record in the TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to lock.
- *
- * This prevents any access occurring to a group of keys including @key,
- * even if @key does not exist. This allows primitive atomic updates of
- * records without using transactions.
- *
- * You cannot begin a transaction while holding a tdb_chainlock(), nor can
- * you do any operations on any other keys in the database. This also means
- * that you cannot hold more than one tdb_chainlock() at a time.
- *
- * See Also:
- * tdb_chainunlock()
- */
-enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key);
- * tdb_chainunlock - unlock a record in the TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to unlock.
- *
- * The key must have previously been locked by tdb_chainlock().
- */
-void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key);
- * tdb_chainlock_read - lock a record in the TDB, for reading
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to lock.
- *
- * This prevents any changes from occurring to a group of keys including @key,
- * even if @key does not exist. This allows primitive atomic updates of
- * records without using transactions.
- *
- * You cannot begin a transaction while holding a tdb_chainlock_read(), nor can
- * you do any operations on any other keys in the database. This also means
- * that you cannot hold more than one tdb_chainlock()/read() at a time.
- *
- * See Also:
- * tdb_chainlock()
- */
-enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key);
- * tdb_chainunlock_read - unlock a record in the TDB for reading
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to unlock.
- *
- * The key must have previously been locked by tdb_chainlock_read().
- */
-void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key);
- * tdb_lockall - lock the entire TDB
- * @tdb: the tdb context returned from tdb_open()
- *
- * You cannot hold a tdb_chainlock while calling this. It nests, so you
- * must call tdb_unlockall as many times as you call tdb_lockall.
- */
-enum TDB_ERROR tdb_lockall(struct tdb_context *tdb);
- * tdb_unlockall - unlock the entire TDB
- * @tdb: the tdb context returned from tdb_open()
- */
-void tdb_unlockall(struct tdb_context *tdb);
- * tdb_lockall_read - lock the entire TDB for reading
- * @tdb: the tdb context returned from tdb_open()
- *
- * This prevents others writing to the database, eg. tdb_delete, tdb_store,
- * tdb_append, but not tdb_fetch.
- *
- * You cannot hold a tdb_chainlock while calling this. It nests, so you
- * must call tdb_unlockall_read as many times as you call tdb_lockall_read.
- */
-enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb);
- * tdb_unlockall_read - unlock the entire TDB for reading
- * @tdb: the tdb context returned from tdb_open()
- */
-void tdb_unlockall_read(struct tdb_context *tdb);
- * tdb_wipe_all - wipe the database clean
- * @tdb: the tdb context returned from tdb_open()
- *
- * Completely erase the database. This is faster than iterating through
- * each key and doing tdb_delete.
- */
-enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb);
- * tdb_repack - repack the database
- * @tdb: the tdb context returned from tdb_open()
- *
- * This repacks the database; if it is suffering from a great deal of
- * fragmentation this might help. However, it can take twice the
- * memory of the existing TDB.
- */
-enum TDB_ERROR tdb_repack(struct tdb_context *tdb);
- * tdb_check - check a TDB for consistency
- * @tdb: the tdb context returned from tdb_open()
- * @check: function to check each key/data pair (or NULL)
- * @data: argument for @check, must match type.
- *
- * This performs a consistency check of the open database, optionally calling
- * a check() function on each record so you can do your own data consistency
- * checks as well. If check() returns an error, that is returned from
- * tdb_check().
- *
- * Note that the TDB uses a feature which we don't understand which
- * indicates we can't run tdb_check(), this will log a warning to that
- * effect and return TDB_SUCCESS. You can detect this condition by
- * looking for TDB_CANT_CHECK in tdb_get_flags().
- *
- * Returns TDB_SUCCESS or an error.
- */
-#define tdb_check(tdb, check, data) \
- tdb_check_((tdb), typesafe_cb_preargs(enum TDB_ERROR, void *, \
- (check), (data), \
- struct tdb_data, \
- struct tdb_data), \
- (data))
-enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
- enum TDB_ERROR (*check)(struct tdb_data k,
- struct tdb_data d,
- void *data),
- void *data);
- * tdb_error - get the last error (not threadsafe)
- * @tdb: the tdb context returned from tdb_open()
- *
- * Returns the last error returned by a TDB function.
- *
- * This makes porting from TDB1 easier, but note that the last error is not
- * reliable in threaded programs.
- */
-enum TDB_ERROR tdb_error(struct tdb_context *tdb);
- * enum tdb_summary_flags - flags for tdb_summary.
- */
-enum tdb_summary_flags {
- TDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. */
- * tdb_summary - return a string describing the TDB state
- * @tdb: the tdb context returned from tdb_open()
- * @flags: flags to control the summary output.
- * @summary: pointer to string to allocate.
- *
- * This returns a developer-readable string describing the overall
- * state of the tdb, such as the percentage used and sizes of records.
- * It is designed to provide information about the tdb at a glance
- * without displaying any keys or data in the database.
- *
- * On success, sets @summary to point to a malloc()'ed nul-terminated
- * multi-line string. It is your responsibility to free() it.
- */
-enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
- enum tdb_summary_flags flags,
- char **summary);
- * tdb_get_flags - return the flags for a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns the flags on the current tdb. Some of these are caused by
- * the flags argument to tdb_open(), others (such as TDB_CONVERT) are
- * intuited.
- */
-unsigned int tdb_get_flags(struct tdb_context *tdb);
- * tdb_add_flag - set a flag for a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * You can use this to set a flag on the TDB. You cannot set these flags
- * on a TDB_INTERNAL tdb.
- */
-void tdb_add_flag(struct tdb_context *tdb, unsigned flag);
- * tdb_remove_flag - unset a flag for a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * You can use this to clear a flag on the TDB. You cannot clear flags
- * on a TDB_INTERNAL tdb.
- */
-void tdb_remove_flag(struct tdb_context *tdb, unsigned flag);
- * enum tdb_attribute_type - descriminator for union tdb_attribute.
- */
-enum tdb_attribute_type {
- * tdb_get_attribute - get an attribute for an existing tdb
- * @tdb: the tdb context returned from tdb_open()
- * @attr: the union tdb_attribute to set.
- *
- * This gets an attribute from a TDB which has previously been set (or
- * may return the default values). Set @attr.base.attr to the
- * attribute type you want get.
- */
-enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
- union tdb_attribute *attr);
- * tdb_set_attribute - set an attribute for an existing tdb
- * @tdb: the tdb context returned from tdb_open()
- * @attr: the union tdb_attribute to set.
- *
- * This sets an attribute on a TDB, overriding any previous attribute
- * of the same type. It returns TDB_ERR_EINVAL if the attribute is
- * unknown or invalid.
- *
- * TDB_ATTRIBUTE_OPENHOOK cannot currently be set after tdb_open.
- */
-enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
- const union tdb_attribute *attr);
- * tdb_unset_attribute - reset an attribute for an existing tdb
- * @tdb: the tdb context returned from tdb_open()
- * @type: the attribute type to unset.
- *
- * This unsets an attribute on a TDB, returning it to the defaults
- * (where applicable).
- *
- * Note that it only makes sense for TDB_ATTRIBUTE_LOG and TDB_ATTRIBUTE_FLOCK
- * to be unset.
- */
-void tdb_unset_attribute(struct tdb_context *tdb,
- enum tdb_attribute_type type);
- * tdb_name - get the name of a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns a copy of the name string, made at tdb_open() time. If that
- * argument was NULL (possible for a TDB_INTERNAL db) this will return NULL.
- *
- * This is mostly useful for logging.
- */
-const char *tdb_name(const struct tdb_context *tdb);
- * tdb_fd - get the file descriptor of a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns the file descriptor for the underlying database file, or -1
- */
-int tdb_fd(const struct tdb_context *tdb);
- * tdb_foreach - iterate through every open TDB.
- * @fn: the function to call for every TDB
- * @p: the pointer to hand to @fn
- *
- * TDB internally keeps track of all open TDBs; this function allows you to
- * iterate through them. If @fn returns non-zero, traversal stops.
- */
-#define tdb_foreach(fn, p) \
- tdb_foreach_(typesafe_cb_preargs(int, void *, (fn), (p), \
- struct tdb_context *), (p))
-void tdb_foreach_(int (*fn)(struct tdb_context *, void *), void *p);
- * struct tdb_attribute_base - common fields for all tdb attributes.
- */
-struct tdb_attribute_base {
- enum tdb_attribute_type attr;
- union tdb_attribute *next;
- * enum tdb_log_level - log levels for tdb_attribute_log
- * @TDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors
- * or internal consistency failures.
- * @TDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters
- * or writing to a read-only database.
- * @TDB_LOG_WARNING: used for informational messages on issues which
- * are unusual but handled by TDB internally, such
- * as a failure to mmap or failure to open /dev/urandom.
- */
-enum tdb_log_level {
- * struct tdb_attribute_log - log function attribute
- *
- * This attribute provides a hook for you to log errors.
- */
-struct tdb_attribute_log {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
- void (*fn)(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message,
- void *data);
- void *data;
- * struct tdb_attribute_hash - hash function attribute
- *
- * This attribute allows you to provide an alternative hash function.
- * This hash function will be handed keys from the database; it will also
- * be handed the 8-byte TDB_HASH_MAGIC value for checking the header (the
- * tdb_open() will fail if the hash value doesn't match the header).
- *
- * Note that if your hash function gives different results on
- * different machine endians, your tdb will no longer work across
- * different architectures!
- */
-struct tdb_attribute_hash {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
- uint64_t (*fn)(const void *key, size_t len, uint64_t seed,
- void *data);
- void *data;
- * struct tdb_attribute_seed - hash function seed attribute
- *
- * The hash function seed is normally taken from /dev/urandom (or equivalent)
- * but can be set manually here. This is mainly for testing purposes.
- */
-struct tdb_attribute_seed {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_SEED */
- uint64_t seed;
- * struct tdb_attribute_stats - tdb operational statistics
- *
- * This attribute records statistics of various low-level TDB operations.
- * This can be used to assist performance evaluation. This is only
- * useful for tdb_get_attribute().
- *
- * New fields will be added at the end, hence the "size" argument which
- * indicates how large your structure is: it must be filled in before
- * calling tdb_get_attribute(), which will overwrite it with the size
- * tdb knows about.
- */
-struct tdb_attribute_stats {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
- size_t size; /* = sizeof(struct tdb_attribute_stats) */
- uint64_t allocs;
- uint64_t alloc_subhash;
- uint64_t alloc_chain;
- uint64_t alloc_bucket_exact;
- uint64_t alloc_bucket_max;
- uint64_t alloc_leftover;
- uint64_t alloc_coalesce_tried;
- uint64_t alloc_coalesce_iterate_clash;
- uint64_t alloc_coalesce_lockfail;
- uint64_t alloc_coalesce_race;
- uint64_t alloc_coalesce_succeeded;
- uint64_t alloc_coalesce_num_merged;
- uint64_t compares;
- uint64_t compare_wrong_bucket;
- uint64_t compare_wrong_offsetbits;
- uint64_t compare_wrong_keylen;
- uint64_t compare_wrong_rechash;
- uint64_t compare_wrong_keycmp;
- uint64_t transactions;
- uint64_t transaction_cancel;
- uint64_t transaction_nest;
- uint64_t transaction_expand_file;
- uint64_t transaction_read_direct;
- uint64_t transaction_read_direct_fail;
- uint64_t transaction_write_direct;
- uint64_t transaction_write_direct_fail;
- uint64_t expands;
- uint64_t frees;
- uint64_t locks;
- uint64_t lock_lowlevel;
- uint64_t lock_nonblock;
- uint64_t lock_nonblock_fail;
- * struct tdb_attribute_openhook - tdb special effects hook for open
- *
- * This attribute contains a function to call once we have the OPEN_LOCK
- * for the tdb, but before we've examined its contents. If this succeeds,
- * the tdb will be populated if it's then zero-length.
- *
- * This is a hack to allow support for TDB1-style TDB_CLEAR_IF_FIRST
- * behaviour.
- */
-struct tdb_attribute_openhook {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_OPENHOOK */
- enum TDB_ERROR (*fn)(int fd, void *data);
- void *data;
- * struct tdb_attribute_flock - tdb special effects hook for file locking
- *
- * This attribute contains function to call to place locks on a file; it can
- * be used to support non-blocking operations or lock proxying.
- *
- * They should return 0 on success, -1 on failure and set errno.
- *
- * An error will be logged on error if errno is neither EAGAIN nor EINTR
- * (normally it would only return EAGAIN if waitflag is false, and
- * loop internally on EINTR).
- */
-struct tdb_attribute_flock {
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */
- int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *);
- int (*unlock)(int fd, int rw, off_t off, off_t len, void *);
- void *data;
- * union tdb_attribute - tdb attributes.
- *
- * This represents all the known attributes.
- *
- * See also:
- * struct tdb_attribute_log, struct tdb_attribute_hash,
- * struct tdb_attribute_seed, struct tdb_attribute_stats,
- * struct tdb_attribute_openhook, struct tdb_attribute_flock.
- */
-union tdb_attribute {
- struct tdb_attribute_base base;
- struct tdb_attribute_log log;
- struct tdb_attribute_hash hash;
- struct tdb_attribute_seed seed;
- struct tdb_attribute_stats stats;
- struct tdb_attribute_openhook openhook;
- struct tdb_attribute_flock flock;
-#ifdef __cplusplus
-#endif /* tdb2.h */
diff --git a/lib/tdb2/test/api-12-store.c b/lib/tdb2/test/api-12-store.c
deleted file mode 100644
index 6a9dd95f5f..0000000000
--- a/lib/tdb2/test/api-12-store.c
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <ccan/hash/hash.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-/* We use the same seed which we saw a failure on. */
-static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
- return hash64_stable((const unsigned char *)key, len,
- *(uint64_t *)p);
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- uint64_t seed = 16014841315512641303ULL;
- union tdb_attribute fixed_hattr
- = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = fixedhash,
- .data = &seed } };
- struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
- struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
- = &tap_log_attr;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-12-store.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* We seemed to lose some keys.
- * Insert and check they're in there! */
- for (j = 0; j < 500; j++) {
- struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(tdb_deq(d, data));
- free(d.dptr);
- }
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-13-delete.c b/lib/tdb2/test/api-13-delete.c
deleted file mode 100644
index 279b38645b..0000000000
--- a/lib/tdb2/test/api-13-delete.c
+++ /dev/null
@@ -1,205 +0,0 @@
-#include "private.h" // For TDB_TOPLEVEL_HASH_BITS
-#include <ccan/hash/hash.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "tdb2.h"
-#include "tap-interface.h"
-#include "logging.h"
-/* We rig the hash so adjacent-numbered records always clash. */
-static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
- return ((uint64_t)*(const unsigned int *)key)
- << (64 - TDB_TOPLEVEL_HASH_BITS - 1);
-/* We use the same seed which we saw a failure on. */
-static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
- return hash64_stable((const unsigned char *)key, len,
- *(uint64_t *)p);
-static bool store_records(struct tdb_context *tdb)
- int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data d, data = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < 1000; i++) {
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- return false;
- tdb_fetch(tdb, key, &d);
- if (!tdb_deq(d, data))
- return false;
- free(d.dptr);
- }
- return true;
-static void test_val(struct tdb_context *tdb, uint64_t val)
- uint64_t v;
- struct tdb_data key = { (unsigned char *)&v, sizeof(v) };
- struct tdb_data d, data = { (unsigned char *)&v, sizeof(v) };
- /* Insert an entry, then delete it. */
- v = val;
- /* Delete should fail. */
- ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Insert should succeed. */
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Delete should succeed. */
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Re-add it, then add collision. */
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- v = val + 1;
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Can find both? */
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- v = val;
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- /* Delete second one. */
- v = val + 1;
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Re-add */
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Now, try deleting first one. */
- v = val;
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Can still find second? */
- v = val + 1;
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- /* Now, this will be ideally placed. */
- v = val + 2;
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* This will collide with both. */
- v = val;
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- /* We can still find them all, right? */
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- v = val + 1;
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- v = val + 2;
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- /* And if we delete val + 1, that val + 2 should not move! */
- v = val + 1;
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- v = val;
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- v = val + 2;
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == data.dsize);
- free(d.dptr);
- /* Delete those two, so we are empty. */
- ok1(tdb_delete(tdb, key) == 0);
- v = val;
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- uint64_t seed = 16014841315512641303ULL;
- union tdb_attribute clash_hattr
- = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = clash } };
- union tdb_attribute fixed_hattr
- = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = fixedhash,
- .data = &seed } };
- /* These two values gave trouble before. */
- int vals[] = { 755, 837 };
- = &tap_log_attr;
- = &tap_log_attr;
- plan_tests(sizeof(flags) / sizeof(flags[0])
- * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-13-delete.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* Check start of hash table. */
- test_val(tdb, 0);
- /* Check end of hash table. */
- test_val(tdb, -1ULL);
- /* Check mixed bitpattern. */
- test_val(tdb, 0x123456789ABCDEF0ULL);
- ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0));
- tdb_close(tdb);
- /* Deleting these entries in the db gave problems. */
- tdb = tdb_open("run-13-delete.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(store_records(tdb));
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) {
- struct tdb_data key;
- key.dptr = (unsigned char *)&vals[j];
- key.dsize = sizeof(vals[j]);
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- }
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-14-exists.c b/lib/tdb2/test/api-14-exists.c
deleted file mode 100644
index 801c295893..0000000000
--- a/lib/tdb2/test/api-14-exists.c
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-static bool test_records(struct tdb_context *tdb)
- int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < 1000; i++) {
- if (tdb_exists(tdb, key))
- return false;
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- return false;
- if (!tdb_exists(tdb, key))
- return false;
- }
- for (i = 0; i < 1000; i++) {
- if (!tdb_exists(tdb, key))
- return false;
- if (tdb_delete(tdb, key) != 0)
- return false;
- if (tdb_exists(tdb, key))
- return false;
- }
- return true;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-14-exists.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (ok1(tdb))
- ok1(test_records(tdb));
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-16-wipe_all.c b/lib/tdb2/test/api-16-wipe_all.c
deleted file mode 100644
index 3dfcc7a419..0000000000
--- a/lib/tdb2/test/api-16-wipe_all.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-static bool add_records(struct tdb_context *tdb)
- int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < 1000; i++) {
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- return false;
- }
- return true;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-16-wipe_all.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (ok1(tdb)) {
- struct tdb_data key;
- ok1(add_records(tdb));
- ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
- ok1(tdb_firstkey(tdb, &key) == TDB_ERR_NOEXIST);
- tdb_close(tdb);
- }
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-21-parse_record.c b/lib/tdb2/test/api-21-parse_record.c
deleted file mode 100644
index 150e1c9dd0..0000000000
--- a/lib/tdb2/test/api-21-parse_record.c
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-static enum TDB_ERROR parse(TDB_DATA key, TDB_DATA data, TDB_DATA *expected)
- if (!tdb_deq(data, *expected))
- return TDB_ERR_EINVAL;
- return TDB_SUCCESS;
-static enum TDB_ERROR parse_err(TDB_DATA key, TDB_DATA data, void *unused)
- return 100;
-static bool test_records(struct tdb_context *tdb)
- int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < 1000; i++) {
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- return false;
- }
- for (i = 0; i < 1000; i++) {
- if (tdb_parse_record(tdb, key, parse, &data) != TDB_SUCCESS)
- return false;
- }
- if (tdb_parse_record(tdb, key, parse, &data) != TDB_ERR_NOEXIST)
- return false;
- /* Test error return from parse function. */
- i = 0;
- if (tdb_parse_record(tdb, key, parse_err, NULL) != 100)
- return false;
- return true;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("api-21-parse_record.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (ok1(tdb))
- ok1(test_records(tdb));
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-55-transaction.c b/lib/tdb2/test/api-55-transaction.c
deleted file mode 100644
index c474c6abc3..0000000000
--- a/lib/tdb2/test/api-55-transaction.c
+++ /dev/null
@@ -1,73 +0,0 @@
-#include "private.h" // struct tdb_context
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- unsigned char *buffer;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data;
- buffer = malloc(1000);
- for (i = 0; i < 1000; i++)
- buffer[i] = i;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-55-transaction.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(tdb_transaction_start(tdb) == 0);
- data.dptr = buffer;
- data.dsize = 1000;
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
- ok1(data.dsize == 1000);
- ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
- free(data.dptr);
- /* Cancelling a transaction means no store */
- tdb_transaction_cancel(tdb);
- ok1(tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST);
- /* Commit the transaction. */
- ok1(tdb_transaction_start(tdb) == 0);
- data.dptr = buffer;
- data.dsize = 1000;
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
- ok1(data.dsize == 1000);
- ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
- free(data.dptr);
- ok1(tdb_transaction_commit(tdb) == 0);
- ok1(tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
- ok1(data.dsize == 1000);
- ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
- free(data.dptr);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- free(buffer);
- return exit_status();
diff --git a/lib/tdb2/test/api-80-tdb_fd.c b/lib/tdb2/test/api-80-tdb_fd.c
deleted file mode 100644
index 63967b8aa6..0000000000
--- a/lib/tdb2/test/api-80-tdb_fd.c
+++ /dev/null
@@ -1,32 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("api-80-tdb_fd.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- continue;
- if (flags[i] & TDB_INTERNAL)
- ok1(tdb_fd(tdb) == -1);
- else
- ok1(tdb_fd(tdb) > 2);
- tdb_close(tdb);
- ok1(tap_log_messages == 0);
- }
- return exit_status();
diff --git a/lib/tdb2/test/api-81-seqnum.c b/lib/tdb2/test/api-81-seqnum.c
deleted file mode 100644
index 8bf261d635..0000000000
--- a/lib/tdb2/test/api-81-seqnum.c
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i, seq;
- struct tdb_context *tdb;
- struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 4 * 13);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("api-81-seqnum.tdb", flags[i]|TDB_SEQNUM,
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- continue;
- seq = 0;
- ok1(tdb_get_seqnum(tdb) == seq);
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- /* Fetch doesn't change seqnum */
- if (ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
- free(d.dptr);
- ok1(tdb_get_seqnum(tdb) == seq);
- ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- /* Empty append works */
- ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- if (!(flags[i] & TDB_INTERNAL)) {
- ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
- ok1(tdb_get_seqnum(tdb) == ++seq);
- ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS);
- ok1(tdb_get_seqnum(tdb) == seq);
- ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_get_seqnum(tdb) == seq + 1);
- tdb_transaction_cancel(tdb);
- ok1(tdb_get_seqnum(tdb) == seq);
- }
- tdb_close(tdb);
- ok1(tap_log_messages == 0);
- }
- return exit_status();
diff --git a/lib/tdb2/test/api-82-lockattr.c b/lib/tdb2/test/api-82-lockattr.c
deleted file mode 100644
index b229eab83c..0000000000
--- a/lib/tdb2/test/api-82-lockattr.c
+++ /dev/null
@@ -1,237 +0,0 @@
-#include "private.h" // for tdb_fcntl_unlock
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include "logging.h"
-static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
- void *_err)
- int *lock_err = _err;
- struct flock fl;
- int ret;
- if (*lock_err) {
- errno = *lock_err;
- return -1;
- }
- do {
- fl.l_type = rw;
- fl.l_whence = SEEK_SET;
- fl.l_start = off;
- fl.l_len = len;
- if (waitflag)
- ret = fcntl(fd, F_SETLKW, &fl);
- else
- ret = fcntl(fd, F_SETLK, &fl);
- } while (ret != 0 && errno == EINTR);
- return ret;
-static int trav_err;
-static int trav(struct tdb_context *tdb, TDB_DATA k, TDB_DATA d, int *terr)
- *terr = trav_err;
- return 0;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- union tdb_attribute lock_attr;
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- int lock_err;
- lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
- = &tap_log_attr;
- lock_attr.flock.lock = mylock;
- lock_attr.flock.unlock = tdb_fcntl_unlock;
- = &lock_err;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 80);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- struct tdb_data d;
- /* Nonblocking open; expect no error message. */
- lock_err = EAGAIN;
- tdb = tdb_open("run-82-lockattr.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
- ok(errno == lock_err, "Errno is %u", errno);
- ok1(!tdb);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- tdb = tdb_open("run-82-lockattr.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
- ok(errno == lock_err, "Errno is %u", errno);
- ok1(!tdb);
- ok1(tap_log_messages == 0);
- /* Forced fail open. */
- lock_err = ENOMEM;
- tdb = tdb_open("run-82-lockattr.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
- ok1(errno == lock_err);
- ok1(!tdb);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- lock_err = 0;
- tdb = tdb_open("run-82-lockattr.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
- if (!ok1(tdb))
- continue;
- ok1(tap_log_messages == 0);
- /* Nonblocking store. */
- lock_err = EAGAIN;
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- /* Nonblocking fetch. */
- lock_err = EAGAIN;
- ok1(!tdb_exists(tdb, key));
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(!tdb_exists(tdb, key));
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(!tdb_exists(tdb, key));
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- lock_err = EAGAIN;
- ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- /* Nonblocking delete. */
- lock_err = EAGAIN;
- ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- /* Nonblocking locks. */
- lock_err = EAGAIN;
- ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- lock_err = EAGAIN;
- ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- lock_err = EAGAIN;
- ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
- /* This actually does divide and conquer. */
- ok1(tap_log_messages > 0);
- tap_log_messages = 0;
- lock_err = EAGAIN;
- ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages > 0);
- tap_log_messages = 0;
- /* Nonblocking traverse; go nonblock partway through. */
- lock_err = 0;
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
- trav_err = EAGAIN;
- ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- trav_err = EINTR;
- lock_err = 0;
- ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- trav_err = ENOMEM;
- lock_err = 0;
- ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- /* Nonblocking transactions. */
- lock_err = EAGAIN;
- ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = EINTR;
- ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = ENOMEM;
- ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- tap_log_messages = 0;
- /* Nonblocking transaction prepare. */
- lock_err = 0;
- ok1(tdb_transaction_start(tdb) == 0);
- ok1(tdb_delete(tdb, key) == 0);
- lock_err = EAGAIN;
- ok1(tdb_transaction_prepare_commit(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- lock_err = 0;
- ok1(tdb_transaction_prepare_commit(tdb) == 0);
- ok1(tdb_transaction_commit(tdb) == 0);
- /* And the transaction was committed, right? */
- ok1(!tdb_exists(tdb, key));
- tdb_close(tdb);
- ok1(tap_log_messages == 0);
- }
- return exit_status();
diff --git a/lib/tdb2/test/api-83-openhook.c b/lib/tdb2/test/api-83-openhook.c
deleted file mode 100644
index 191cf068c1..0000000000
--- a/lib/tdb2/test/api-83-openhook.c
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include "external-agent.h"
-#include "logging.h"
-static enum TDB_ERROR clear_if_first(int fd, void *arg)
-/* We hold a lock offset 4 always, so we can tell if anyone is holding it.
- * (This is compatible with tdb1's TDB_CLEAR_IF_FIRST flag). */
- struct flock fl;
- if (arg != clear_if_first)
- fl.l_type = F_WRLCK;
- fl.l_whence = SEEK_SET;
- fl.l_start = 4;
- fl.l_len = 1;
- if (fcntl(fd, F_SETLK, &fl) == 0) {
- /* We must be first ones to open it! */
- diag("truncating file!");
- if (ftruncate(fd, 0) != 0) {
- return TDB_ERR_IO;
- }
- }
- fl.l_type = F_RDLCK;
- if (fcntl(fd, F_SETLKW, &fl) != 0) {
- return TDB_ERR_IO;
- }
- return TDB_SUCCESS;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- struct agent *agent;
- union tdb_attribute cif;
- struct tdb_data key = tdb_mkdata("key", 3);
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
- = &tap_log_attr;
- cif.openhook.fn = clear_if_first;
- = clear_if_first;
- agent = prepare_external_agent();
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 13);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- /* Create it */
- tdb = tdb_open("run-83-openhook.tdb", flags[i],
- ok1(tdb);
- ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
- tdb_close(tdb);
- /* Now, open with CIF, should clear it. */
- tdb = tdb_open("run-83-openhook.tdb", flags[i],
- O_RDWR, 0, &cif);
- ok1(tdb);
- ok1(!tdb_exists(tdb, key));
- ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
- /* Agent should not clear it, since it's still open. */
- ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
- "run-83-openhook.tdb") == SUCCESS);
- ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
- ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
- /* Still exists for us too. */
- ok1(tdb_exists(tdb, key));
- /* Close it, now agent should clear it. */
- tdb_close(tdb);
- ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
- "run-83-openhook.tdb") == SUCCESS);
- ok1(external_agent_operation(agent, FETCH, "key") == FAILED);
- ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
- ok1(tap_log_messages == 0);
- }
- free_external_agent(agent);
- return exit_status();
diff --git a/lib/tdb2/test/api-91-get-stats.c b/lib/tdb2/test/api-91-get-stats.c
deleted file mode 100644
index 395db3fb18..0000000000
--- a/lib/tdb2/test/api-91-get-stats.c
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- union tdb_attribute *attr;
- struct tdb_data key = tdb_mkdata("key", 3);
- tdb = tdb_open("run-91-get-stats.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
- /* Use malloc so valgrind will catch overruns. */
- attr = malloc(sizeof *attr);
- attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
- attr->stats.size = sizeof(*attr);
- ok1(tdb_get_attribute(tdb, attr) == 0);
- ok1(attr->stats.size == sizeof(*attr));
- ok1(attr->stats.allocs > 0);
- ok1(attr->stats.expands > 0);
- ok1(attr->stats.locks > 0);
- free(attr);
- /* Try short one. */
- attr = malloc(offsetof(struct tdb_attribute_stats, allocs)
- + sizeof(attr->stats.allocs));
- attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
- attr->stats.size = offsetof(struct tdb_attribute_stats, allocs)
- + sizeof(attr->stats.allocs);
- ok1(tdb_get_attribute(tdb, attr) == 0);
- ok1(attr->stats.size == sizeof(*attr));
- ok1(attr->stats.allocs > 0);
- free(attr);
- ok1(tap_log_messages == 0);
- tdb_close(tdb);
- }
- return exit_status();
diff --git a/lib/tdb2/test/api-92-get-set-readonly.c b/lib/tdb2/test/api-92-get-set-readonly.c
deleted file mode 100644
index 46aea7ae0d..0000000000
--- a/lib/tdb2/test/api-92-get-set-readonly.c
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 48);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- /* RW -> R0 */
- tdb = tdb_open("run-92-get-set-readonly.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- ok1(!(tdb_get_flags(tdb) & TDB_RDONLY));
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS);
- tdb_add_flag(tdb, TDB_RDONLY);
- ok1(tdb_get_flags(tdb) & TDB_RDONLY);
- /* Can't store, append, delete. */
- ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 1);
- ok1(tdb_append(tdb, key, data) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 2);
- ok1(tdb_delete(tdb, key) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 3);
- /* Can't start a transaction, or any write lock. */
- ok1(tdb_transaction_start(tdb) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 4);
- ok1(tdb_chainlock(tdb, key) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 5);
- ok1(tdb_lockall(tdb) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 6);
- ok1(tdb_wipe_all(tdb) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 7);
- /* Back to RW. */
- tdb_remove_flag(tdb, TDB_RDONLY);
- ok1(!(tdb_get_flags(tdb) & TDB_RDONLY));
- ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_SUCCESS);
- ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
- ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
- ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS);
- ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS);
- ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS);
- tdb_chainunlock(tdb, key);
- ok1(tdb_lockall(tdb) == TDB_SUCCESS);
- tdb_unlockall(tdb);
- ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
- ok1(tap_log_messages == 7);
- tdb_close(tdb);
- /* R0 -> RW */
- tdb = tdb_open("run-92-get-set-readonly.tdb", flags[i],
- O_RDONLY, 0600, &tap_log_attr);
- ok1(tdb);
- ok1(tdb_get_flags(tdb) & TDB_RDONLY);
- /* Can't store, append, delete. */
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 8);
- ok1(tdb_append(tdb, key, data) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 9);
- ok1(tdb_delete(tdb, key) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 10);
- /* Can't start a transaction, or any write lock. */
- ok1(tdb_transaction_start(tdb) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 11);
- ok1(tdb_chainlock(tdb, key) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 12);
- ok1(tdb_lockall(tdb) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 13);
- ok1(tdb_wipe_all(tdb) == TDB_ERR_RDONLY);
- ok1(tap_log_messages == 14);
- /* Can't remove TDB_RDONLY since we opened with O_RDONLY */
- tdb_remove_flag(tdb, TDB_RDONLY);
- ok1(tap_log_messages == 15);
- ok1(tdb_get_flags(tdb) & TDB_RDONLY);
- tdb_close(tdb);
- ok1(tap_log_messages == 15);
- tap_log_messages = 0;
- }
- return exit_status();
diff --git a/lib/tdb2/test/api-93-repack.c b/lib/tdb2/test/api-93-repack.c
deleted file mode 100644
index 910eb9b301..0000000000
--- a/lib/tdb2/test/api-93-repack.c
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-#define NUM_TESTS 1000
-static bool store_all(struct tdb_context *tdb)
- unsigned int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data dbuf = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < NUM_TESTS; i++) {
- if (tdb_store(tdb, key, dbuf, TDB_INSERT) != TDB_SUCCESS)
- return false;
- }
- return true;
-static int mark_entry(struct tdb_context *tdb,
- TDB_DATA key, TDB_DATA data, bool found[])
- unsigned int num;
- if (key.dsize != sizeof(num))
- return -1;
- memcpy(&num, key.dptr, key.dsize);
- if (num >= NUM_TESTS)
- return -1;
- if (found[num])
- return -1;
- found[num] = true;
- return 0;
-static bool is_all_set(bool found[], unsigned int num)
- unsigned int i;
- for (i = 0; i < num; i++)
- if (!found[i])
- return false;
- return true;
-int main(int argc, char *argv[])
- unsigned int i;
- bool found[NUM_TESTS];
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- };
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 6 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-93-repack.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- break;
- ok1(store_all(tdb));
- ok1(tdb_repack(tdb) == TDB_SUCCESS);
- memset(found, 0, sizeof(found));
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- ok1(tdb_traverse(tdb, mark_entry, found) == NUM_TESTS);
- ok1(is_all_set(found, NUM_TESTS));
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-add-remove-flags.c b/lib/tdb2/test/api-add-remove-flags.c
deleted file mode 100644
index a72b609fcb..0000000000
--- a/lib/tdb2/test/api-add-remove-flags.c
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "private.h" // for tdb_context
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- plan_tests(87);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-add-remove-flags.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(tdb_get_flags(tdb) == tdb->flags);
- tap_log_messages = 0;
- tdb_add_flag(tdb, TDB_NOLOCK);
- if (flags[i] & TDB_INTERNAL)
- ok1(tap_log_messages == 1);
- else {
- ok1(tap_log_messages == 0);
- ok1(tdb_get_flags(tdb) & TDB_NOLOCK);
- }
- tap_log_messages = 0;
- tdb_add_flag(tdb, TDB_NOMMAP);
- if (flags[i] & TDB_INTERNAL)
- ok1(tap_log_messages == 1);
- else {
- ok1(tap_log_messages == 0);
- ok1(tdb_get_flags(tdb) & TDB_NOMMAP);
- ok1(tdb->file->map_ptr == NULL);
- }
- tap_log_messages = 0;
- tdb_add_flag(tdb, TDB_NOSYNC);
- if (flags[i] & TDB_INTERNAL)
- ok1(tap_log_messages == 1);
- else {
- ok1(tap_log_messages == 0);
- ok1(tdb_get_flags(tdb) & TDB_NOSYNC);
- }
- ok1(tdb_get_flags(tdb) == tdb->flags);
- tap_log_messages = 0;
- tdb_remove_flag(tdb, TDB_NOLOCK);
- if (flags[i] & TDB_INTERNAL)
- ok1(tap_log_messages == 1);
- else {
- ok1(tap_log_messages == 0);
- ok1(!(tdb_get_flags(tdb) & TDB_NOLOCK));
- }
- tap_log_messages = 0;
- tdb_remove_flag(tdb, TDB_NOMMAP);
- if (flags[i] & TDB_INTERNAL)
- ok1(tap_log_messages == 1);
- else {
- ok1(tap_log_messages == 0);
- ok1(!(tdb_get_flags(tdb) & TDB_NOMMAP));
- ok1(tdb->file->map_ptr != NULL);
- }
- tap_log_messages = 0;
- tdb_remove_flag(tdb, TDB_NOSYNC);
- if (flags[i] & TDB_INTERNAL)
- ok1(tap_log_messages == 1);
- else {
- ok1(tap_log_messages == 0);
- ok1(!(tdb_get_flags(tdb) & TDB_NOSYNC));
- }
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-check-callback.c b/lib/tdb2/test/api-check-callback.c
deleted file mode 100644
index 96ef09f3bd..0000000000
--- a/lib/tdb2/test/api-check-callback.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-#define NUM_RECORDS 1000
-static bool store_records(struct tdb_context *tdb)
- int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < NUM_RECORDS; i++)
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- return false;
- return true;
-static enum TDB_ERROR check(struct tdb_data key,
- struct tdb_data data,
- bool *array)
- int val;
- if (key.dsize != sizeof(val)) {
- diag("Wrong key size: %u\n", key.dsize);
- }
- if (key.dsize != data.dsize
- || memcmp(key.dptr, data.dptr, sizeof(val)) != 0) {
- diag("Key and data differ\n");
- }
- memcpy(&val, key.dptr, sizeof(val));
- if (val >= NUM_RECORDS || val < 0) {
- diag("check value %i\n", val);
- }
- if (array[val]) {
- diag("Value %i already seen\n", val);
- }
- array[val] = true;
- return TDB_SUCCESS;
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- bool array[NUM_RECORDS];
- tdb = tdb_open("run-check-callback.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(store_records(tdb));
- for (j = 0; j < NUM_RECORDS; j++)
- array[j] = false;
- ok1(tdb_check(tdb, check, array) == TDB_SUCCESS);
- for (j = 0; j < NUM_RECORDS; j++)
- if (!array[j])
- break;
- ok1(j == NUM_RECORDS);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-firstkey-nextkey.c b/lib/tdb2/test/api-firstkey-nextkey.c
deleted file mode 100644
index e5a7c5f8b5..0000000000
--- a/lib/tdb2/test/api-firstkey-nextkey.c
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-#define NUM_RECORDS 1000
-static bool store_records(struct tdb_context *tdb)
- int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < NUM_RECORDS; i++)
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- return false;
- return true;
-struct trav_data {
- unsigned int records[NUM_RECORDS];
- unsigned int calls;
-static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
- struct trav_data *td = p;
- int val;
- memcpy(&val, dbuf.dptr, dbuf.dsize);
- td->records[td->calls++] = val;
- return 0;
-/* Since tdb_nextkey frees dptr, we need to clone it. */
-static TDB_DATA dup_key(TDB_DATA key)
- void *p = malloc(key.dsize);
- memcpy(p, key.dptr, key.dsize);
- key.dptr = p;
- return key;
-int main(int argc, char *argv[])
- unsigned int i, j;
- int num;
- struct trav_data td;
- struct tdb_context *tdb;
- union tdb_attribute seed_attr;
- enum TDB_ERROR ecode;
- seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
- = &tap_log_attr;
- seed_attr.seed.seed = 6334326220117065685ULL;
- plan_tests(sizeof(flags) / sizeof(flags[0])
- * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("api-firstkey-nextkey.tdb", flags[i],
- &seed_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(tdb_firstkey(tdb, &k) == TDB_ERR_NOEXIST);
- /* One entry... */
- k.dptr = (unsigned char *)&num;
- k.dsize = sizeof(num);
- num = 0;
- ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
- ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
- ok1(k.dsize == sizeof(num));
- ok1(memcmp(k.dptr, &num, sizeof(num)) == 0);
- ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
- /* Two entries. */
- k.dptr = (unsigned char *)&num;
- k.dsize = sizeof(num);
- num = 1;
- ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
- ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
- ok1(k.dsize == sizeof(num));
- memcpy(&num, k.dptr, sizeof(num));
- ok1(num == 0 || num == 1);
- ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
- ok1(k.dsize == sizeof(j));
- memcpy(&j, k.dptr, sizeof(j));
- ok1(j == 0 || j == 1);
- ok1(j != num);
- ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
- /* Clean up. */
- k.dptr = (unsigned char *)&num;
- k.dsize = sizeof(num);
- num = 0;
- ok1(tdb_delete(tdb, k) == 0);
- num = 1;
- ok1(tdb_delete(tdb, k) == 0);
- /* Now lots of records. */
- ok1(store_records(tdb));
- td.calls = 0;
- num = tdb_traverse(tdb, trav, &td);
- ok1(num == NUM_RECORDS);
- ok1(td.calls == NUM_RECORDS);
- /* Simple loop should match tdb_traverse */
- for (j = 0, ecode = tdb_firstkey(tdb, &k); j < td.calls; j++) {
- int val;
- ok1(ecode == TDB_SUCCESS);
- ok1(k.dsize == sizeof(val));
- memcpy(&val, k.dptr, k.dsize);
- ok1(td.records[j] == val);
- ecode = tdb_nextkey(tdb, &k);
- }
- /* But arbitrary orderings should work too. */
- for (j = td.calls-1; j > 0; j--) {
- k.dptr = (unsigned char *)&td.records[j-1];
- k.dsize = sizeof(td.records[j-1]);
- k = dup_key(k);
- ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
- ok1(k.dsize == sizeof(td.records[j]));
- ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0);
- free(k.dptr);
- }
- /* Even delete should work. */
- for (j = 0, ecode = tdb_firstkey(tdb, &k);
- ecode != TDB_ERR_NOEXIST;
- j++) {
- ok1(ecode == TDB_SUCCESS);
- ok1(k.dsize == 4);
- ok1(tdb_delete(tdb, k) == 0);
- ecode = tdb_nextkey(tdb, &k);
- }
- diag("delete using first/nextkey gave %u of %u records",
- ok1(j == NUM_RECORDS);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-fork-test.c b/lib/tdb2/test/api-fork-test.c
deleted file mode 100644
index 934c71cbe8..0000000000
--- a/lib/tdb2/test/api-fork-test.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Test forking while holding lock.
- *
- * There are only five ways to do this currently:
- * (1) grab a tdb_chainlock, then fork.
- * (2) grab a tdb_lockall, then fork.
- * (3) grab a tdb_lockall_read, then fork.
- * (4) start a transaction, then fork.
- * (5) fork from inside a tdb_parse() callback.
- *
- * Note that we don't hold a lock across tdb_traverse callbacks, so
- * that doesn't matter.
- */
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include "logging.h"
-static enum TDB_ERROR fork_in_parse(TDB_DATA key, TDB_DATA data,
- struct tdb_context *tdb)
- int status;
- if (fork() == 0) {
- /* We expect this to fail. */
- if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
- exit(1);
- if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
- exit(1);
- if (tap_log_messages != 2)
- exit(2);
- tdb_close(tdb);
- if (tap_log_messages != 2)
- exit(3);
- exit(0);
- }
- wait(&status);
- ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
- return TDB_SUCCESS;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- int status;
- tap_log_messages = 0;
- tdb = tdb_open("run-fork-test.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- continue;
- /* Put a record in here. */
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_SUCCESS);
- ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS);
- if (fork() == 0) {
- /* We expect this to fail. */
- if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
- return 1;
- if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
- return 1;
- if (tap_log_messages != 2)
- return 2;
- tdb_chainunlock(tdb, key);
- if (tap_log_messages != 3)
- return 3;
- tdb_close(tdb);
- if (tap_log_messages != 3)
- return 4;
- return 0;
- }
- wait(&status);
- ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
- tdb_chainunlock(tdb, key);
- ok1(tdb_lockall(tdb) == TDB_SUCCESS);
- if (fork() == 0) {
- /* We expect this to fail. */
- if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
- return 1;
- if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
- return 1;
- if (tap_log_messages != 2)
- return 2;
- tdb_unlockall(tdb);
- if (tap_log_messages != 2)
- return 3;
- tdb_close(tdb);
- if (tap_log_messages != 2)
- return 4;
- return 0;
- }
- wait(&status);
- ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
- tdb_unlockall(tdb);
- ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
- if (fork() == 0) {
- /* We expect this to fail. */
- /* This would always fail anyway... */
- if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
- return 1;
- if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
- return 1;
- if (tap_log_messages != 2)
- return 2;
- tdb_unlockall_read(tdb);
- if (tap_log_messages != 2)
- return 3;
- tdb_close(tdb);
- if (tap_log_messages != 2)
- return 4;
- return 0;
- }
- wait(&status);
- ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
- tdb_unlockall_read(tdb);
- ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
- /* If transactions is empty, noop "commit" succeeds. */
- ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
- if (fork() == 0) {
- /* We expect this to fail. */
- if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
- return 1;
- if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
- return 1;
- if (tap_log_messages != 2)
- return 2;
- if (tdb_transaction_commit(tdb) != TDB_ERR_LOCK)
- return 3;
- tdb_close(tdb);
- if (tap_log_messages < 3)
- return 4;
- return 0;
- }
- wait(&status);
- ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
- tdb_transaction_cancel(tdb);
- ok1(tdb_parse_record(tdb, key, fork_in_parse, tdb)
- tdb_close(tdb);
- ok1(tap_log_messages == 0);
- }
- return exit_status();
diff --git a/lib/tdb2/test/api-locktimeout.c b/lib/tdb2/test/api-locktimeout.c
deleted file mode 100644
index dabe262f25..0000000000
--- a/lib/tdb2/test/api-locktimeout.c
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include "system/wait.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <errno.h>
-#include "logging.h"
-#include "external-agent.h"
-#undef alarm
-#define alarm fast_alarm
-/* Speed things up by doing things in milliseconds. */
-static unsigned int fast_alarm(unsigned int milli_seconds)
- struct itimerval it;
- it.it_interval.tv_sec = it.it_interval.tv_usec = 0;
- it.it_value.tv_sec = milli_seconds / 1000;
- it.it_value.tv_usec = milli_seconds * 1000;
- setitimer(ITIMER_REAL, &it, NULL);
- return 0;
-#define CatchSignal(sig, handler) signal((sig), (handler))
-static void do_nothing(int signum)
-/* This example code is taken from SAMBA, so try not to change it. */
-static struct flock flock_struct;
-/* Return a value which is none of v1, v2 or v3. */
-static inline short int invalid_value(short int v1, short int v2, short int v3)
- short int try = (v1+v2+v3)^((v1+v2+v3) << 16);
- while (try == v1 || try == v2 || try == v3)
- try++;
- return try;
-/* We invalidate in as many ways as we can, so the OS rejects it */
-static void invalidate_flock_struct(int signum)
- flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK);
- flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END);
- flock_struct.l_start = -1;
- /* A large negative. */
- flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1);
-static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
- void *_timeout)
- int ret, saved_errno = errno;
- unsigned int timeout = *(unsigned int *)_timeout;
- flock_struct.l_type = rw;
- flock_struct.l_whence = SEEK_SET;
- flock_struct.l_start = off;
- flock_struct.l_len = len;
- CatchSignal(SIGALRM, invalidate_flock_struct);
- alarm(timeout);
- for (;;) {
- if (waitflag)
- ret = fcntl(fd, F_SETLKW, &flock_struct);
- else
- ret = fcntl(fd, F_SETLK, &flock_struct);
- if (ret == 0)
- break;
- /* Not signalled? Something else went wrong. */
- if (flock_struct.l_len == len) {
- if (errno == EAGAIN || errno == EINTR)
- continue;
- saved_errno = errno;
- break;
- } else {
- saved_errno = EINTR;
- break;
- }
- }
- alarm(0);
- errno = saved_errno;
- return ret;
-static int tdb_chainlock_with_timeout_internal(struct tdb_context *tdb,
- TDB_DATA key,
- unsigned int timeout,
- int rw_type)
- union tdb_attribute locking;
- enum TDB_ERROR ecode;
- if (timeout) {
- locking.base.attr = TDB_ATTRIBUTE_FLOCK;
- ecode = tdb_get_attribute(tdb, &locking);
- if (ecode != TDB_SUCCESS)
- return ecode;
- /* Replace locking function with our own. */
- = &timeout;
- locking.flock.lock = timeout_lock;
- ecode = tdb_set_attribute(tdb, &locking);
- if (ecode != TDB_SUCCESS)
- return ecode;
- }
- if (rw_type == F_RDLCK)
- ecode = tdb_chainlock_read(tdb, key);
- else
- ecode = tdb_chainlock(tdb, key);
- if (timeout) {
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
- }
- return ecode;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- TDB_DATA key = tdb_mkdata("hello", 5);
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- struct agent *agent;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 15);
- agent = prepare_external_agent();
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- enum TDB_ERROR ecode;
- tdb = tdb_open("run-locktimeout.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- break;
- /* Simple cases: should succeed. */
- ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
- ok1(ecode == TDB_SUCCESS);
- ok1(tap_log_messages == 0);
- tdb_chainunlock_read(tdb, key);
- ok1(tap_log_messages == 0);
- ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
- ok1(ecode == TDB_SUCCESS);
- ok1(tap_log_messages == 0);
- tdb_chainunlock(tdb, key);
- ok1(tap_log_messages == 0);
- /* OK, get agent to start transaction, then we should time out. */
- ok1(external_agent_operation(agent, OPEN, "run-locktimeout.tdb")
- == SUCCESS);
- ok1(external_agent_operation(agent, TRANSACTION_START, "")
- == SUCCESS);
- ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
- ok1(ecode == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- /* Even if we get a different signal, should be fine. */
- CatchSignal(SIGUSR1, do_nothing);
- external_agent_operation(agent, SEND_SIGNAL, "");
- ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
- ok1(ecode == TDB_ERR_LOCK);
- ok1(tap_log_messages == 0);
- ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "")
- == SUCCESS);
- ok1(external_agent_operation(agent, CLOSE, "")
- == SUCCESS);
- tdb_close(tdb);
- }
- free_external_agent(agent);
- return exit_status();
diff --git a/lib/tdb2/test/api-missing-entries.c b/lib/tdb2/test/api-missing-entries.c
deleted file mode 100644
index c81839bc05..0000000000
--- a/lib/tdb2/test/api-missing-entries.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Another test revealed that we lost an entry. This reproduces it. */
-#include "config.h"
-#include "tdb2.h"
-#include <ccan/hash/hash.h>
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-#define NUM_RECORDS 1189
-/* We use the same seed which we saw this failure on. */
-static uint64_t failhash(const void *key, size_t len, uint64_t seed, void *p)
- seed = 699537674708983027ULL;
- return hash64_stable((const unsigned char *)key, len, seed);
-int main(int argc, char *argv[])
- int i;
- struct tdb_context *tdb;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = failhash } };
- = &tap_log_attr;
- plan_tests(1 + NUM_RECORDS + 2);
- tdb = tdb_open("run-missing-entries.tdb", TDB_INTERNAL,
- O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
- if (ok1(tdb)) {
- for (i = 0; i < NUM_RECORDS; i++) {
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
- }
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-open-multiple-times.c b/lib/tdb2/test/api-open-multiple-times.c
deleted file mode 100644
index 38aea135ac..0000000000
--- a/lib/tdb2/test/api-open-multiple-times.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb, *tdb2;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 28);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- tdb2 = tdb_open("run-open-multiple-times.tdb", flags[i],
- O_RDWR|O_CREAT, 0600, &tap_log_attr);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb_check(tdb2, NULL, NULL) == 0);
- /* Store in one, fetch in the other. */
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
- ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
- ok1(tdb_deq(d, data));
- free(d.dptr);
- /* Vice versa, with delete. */
- ok1(tdb_delete(tdb2, key) == 0);
- ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST);
- /* OK, now close first one, check second still good. */
- ok1(tdb_close(tdb) == 0);
- ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == 0);
- ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
- ok1(tdb_deq(d, data));
- free(d.dptr);
- /* Reopen */
- tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
- O_RDWR|O_CREAT, 0600, &tap_log_attr);
- ok1(tdb);
- ok1(tdb_transaction_start(tdb2) == 0);
- /* Anything in the other one should fail. */
- ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 1);
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 2);
- ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 3);
- ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
- ok1(tap_log_messages == 4);
- /* Transaciton should work as normal. */
- ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == TDB_SUCCESS);
- /* Now... try closing with locks held. */
- ok1(tdb_close(tdb2) == 0);
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(tdb_deq(d, data));
- free(d.dptr);
- ok1(tdb_close(tdb) == 0);
- ok1(tap_log_messages == 4);
- tap_log_messages = 0;
- }
- return exit_status();
diff --git a/lib/tdb2/test/api-record-expand.c b/lib/tdb2/test/api-record-expand.c
deleted file mode 100644
index 34799ebe5e..0000000000
--- a/lib/tdb2/test/api-record-expand.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-#define MAX_SIZE 10000
-#define SIZE_STEP 131
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data;
- data.dptr = malloc(MAX_SIZE);
- memset(data.dptr, 0x24, MAX_SIZE);
- plan_tests(sizeof(flags) / sizeof(flags[0])
- * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-record-expand.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- data.dsize = 0;
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- for (data.dsize = 0;
- data.dsize < MAX_SIZE;
- data.dsize += SIZE_STEP) {
- memset(data.dptr, data.dsize, data.dsize);
- ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- }
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- free(data.dptr);
- return exit_status();
diff --git a/lib/tdb2/test/api-simple-delete.c b/lib/tdb2/test/api-simple-delete.c
deleted file mode 100644
index 48b077a6db..0000000000
--- a/lib/tdb2/test/api-simple-delete.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-simple-delete.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (tdb) {
- /* Delete should fail. */
- ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Insert should succeed. */
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Delete should now work. */
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- }
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/api-summary.c b/lib/tdb2/test/api-summary.c
deleted file mode 100644
index e9dfd270e9..0000000000
--- a/lib/tdb2/test/api-summary.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
- struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
- char *summary;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-summary.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* Put some stuff in there. */
- for (j = 0; j < 500; j++) {
- /* Make sure padding varies to we get some graphs! */
- data.dsize = j % (sizeof(j) + 1);
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- fail("Storing in tdb");
- }
- for (j = 0;
- ok1(tdb_summary(tdb, j, &summary) == TDB_SUCCESS);
- ok1(strstr(summary, "Number of records: 500\n"));
- ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n"));
- ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n"));
- ok1(strstr(summary, "|")
- && strstr(summary, "*"));
- } else {
- ok1(!strstr(summary, "|")
- && !strstr(summary, "*"));
- }
- free(summary);
- }
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/external-agent.c b/lib/tdb2/test/external-agent.c
deleted file mode 100644
index e8cff95728..0000000000
--- a/lib/tdb2/test/external-agent.c
+++ /dev/null
@@ -1,252 +0,0 @@
-#include "external-agent.h"
-#include "logging.h"
-#include "lock-tracking.h"
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <ccan/err/err.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <string.h>
-#include <errno.h>
-#include "tap-interface.h"
-#include <stdio.h>
-#include <stdarg.h>
-static struct tdb_context *tdb;
-void (*external_agent_free)(void *) = free;
-static enum TDB_ERROR clear_if_first(int fd, void *arg)
-/* We hold a lock offset 4 always, so we can tell if anyone is holding it.
- * (This is compatible with tdb1's TDB_CLEAR_IF_FIRST flag). */
- struct flock fl;
- fl.l_type = F_WRLCK;
- fl.l_whence = SEEK_SET;
- fl.l_start = 4;
- fl.l_len = 1;
- if (fcntl(fd, F_SETLK, &fl) == 0) {
- /* We must be first ones to open it! */
- diag("agent truncating file!");
- if (ftruncate(fd, 0) != 0) {
- return TDB_ERR_IO;
- }
- }
- fl.l_type = F_RDLCK;
- if (fcntl(fd, F_SETLKW, &fl) != 0) {
- return TDB_ERR_IO;
- }
- return TDB_SUCCESS;
-static enum agent_return do_operation(enum operation op, const char *name)
- enum agent_return ret;
- TDB_DATA data;
- enum TDB_ERROR ecode;
- union tdb_attribute cif;
- if (op != OPEN && op != OPEN_WITH_HOOK && !tdb) {
- diag("external: No tdb open!");
- }
- diag("external: %s", operation_name(op));
- k = tdb_mkdata(name, strlen(name));
- locking_would_block = 0;
- switch (op) {
- case OPEN:
- if (tdb) {
- diag("Already have tdb %s open", tdb_name(tdb));
- }
- tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &tap_log_attr);
- if (!tdb) {
- if (!locking_would_block)
- diag("Opening tdb gave %s", strerror(errno));
- forget_locking();
- } else
- ret = SUCCESS;
- break;
- if (tdb) {
- diag("Already have tdb %s open", tdb_name(tdb));
- }
- cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
- = &tap_log_attr;
- cif.openhook.fn = clear_if_first;
- tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &cif);
- if (!tdb) {
- if (!locking_would_block)
- diag("Opening tdb gave %s", strerror(errno));
- forget_locking();
- } else
- ret = SUCCESS;
- break;
- case FETCH:
- ecode = tdb_fetch(tdb, k, &data);
- if (ecode == TDB_ERR_NOEXIST) {
- ret = FAILED;
- } else if (ecode < 0) {
- } else if (!tdb_deq(data, k)) {
- external_agent_free(data.dptr);
- } else {
- ret = SUCCESS;
- external_agent_free(data.dptr);
- }
- break;
- case STORE:
- ret = tdb_store(tdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE;
- break;
- ret = tdb_transaction_start(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
- break;
- ret = tdb_transaction_commit(tdb)==0 ? SUCCESS : OTHER_FAILURE;
- break;
- ret = external_agent_needs_rec(tdb);
- break;
- case CHECK:
- ret = tdb_check(tdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE;
- break;
- case CLOSE:
- ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
- tdb = NULL;
- break;
- /* We do this async */
- ret = SUCCESS;
- break;
- default:
- }
- if (locking_would_block)
- return ret;
-struct agent {
- int cmdfd, responsefd;
-/* Do this before doing any tdb stuff. Return handle, or NULL. */
-struct agent *prepare_external_agent(void)
- int pid, ret;
- int command[2], response[2];
- char name[1+PATH_MAX];
- if (pipe(command) != 0 || pipe(response) != 0)
- return NULL;
- pid = fork();
- if (pid < 0)
- return NULL;
- if (pid != 0) {
- struct agent *agent = malloc(sizeof(*agent));
- close(command[0]);
- close(response[1]);
- agent->cmdfd = command[1];
- agent->responsefd = response[0];
- return agent;
- }
- close(command[1]);
- close(response[0]);
- /* We want to fail, not block. */
- nonblocking_locks = true;
- log_prefix = "external: ";
- while ((ret = read(command[0], name, sizeof(name))) > 0) {
- enum agent_return result;
- result = do_operation(name[0], name+1);
- if (write(response[1], &result, sizeof(result))
- != sizeof(result))
- err(1, "Writing response");
- if (name[0] == SEND_SIGNAL) {
- struct timeval ten_ms;
- ten_ms.tv_sec = 0;
- ten_ms.tv_usec = 10000;
- select(0, NULL, NULL, NULL, &ten_ms);
- kill(getppid(), SIGUSR1);
- }
- }
- exit(0);
-/* Ask the external agent to try to do an operation. */
-enum agent_return external_agent_operation(struct agent *agent,
- enum operation op,
- const char *name)
- enum agent_return res;
- unsigned int len;
- char *string;
- if (!name)
- name = "";
- len = 1 + strlen(name) + 1;
- string = malloc(len);
- string[0] = op;
- strcpy(string+1, name);
- if (write(agent->cmdfd, string, len) != len
- || read(agent->responsefd, &res, sizeof(res)) != sizeof(res))
- res = AGENT_DIED;
- free(string);
- return res;
-const char *agent_return_name(enum agent_return ret)
- return ret == SUCCESS ? "SUCCESS"
- : ret == AGENT_DIED ? "AGENT_DIED"
- : ret == FAILED ? "FAILED"
- : "**INVALID**";
-const char *operation_name(enum operation op)
- switch (op) {
- case OPEN: return "OPEN";
- case FETCH: return "FETCH";
- case STORE: return "STORE";
- case CHECK: return "CHECK";
- case SEND_SIGNAL: return "SEND_SIGNAL";
- case CLOSE: return "CLOSE";
- }
- return "**INVALID**";
-void free_external_agent(struct agent *agent)
- close(agent->cmdfd);
- close(agent->responsefd);
- free(agent);
diff --git a/lib/tdb2/test/external-agent.h b/lib/tdb2/test/external-agent.h
deleted file mode 100644
index c4cd2b148d..0000000000
--- a/lib/tdb2/test/external-agent.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* For locking tests, we need a different process to try things at
- * various times. */
-enum operation {
-/* Do this before doing any tdb stuff. Return handle, or -1. */
-struct agent *prepare_external_agent(void);
-enum agent_return {
- FAILED, /* For fetch, or NEEDS_RECOVERY */
-/* Ask the external agent to try to do an operation.
- * name == tdb name for OPEN/OPEN_WITH_CLEAR_IF_FIRST,
- * record name for FETCH/STORE (store stores name as data too)
- */
-enum agent_return external_agent_operation(struct agent *handle,
- enum operation op,
- const char *name);
-/* Hook into free() on tdb_data in external agent. */
-extern void (*external_agent_free)(void *);
-/* Mapping enum -> string. */
-const char *agent_return_name(enum agent_return ret);
-const char *operation_name(enum operation op);
-void free_external_agent(struct agent *agent);
-/* Internal use: */
-struct tdb_context;
-enum agent_return external_agent_needs_rec(struct tdb_context *tdb);
diff --git a/lib/tdb2/test/failtest_helper.c b/lib/tdb2/test/failtest_helper.c
deleted file mode 100644
index 386f1c2379..0000000000
--- a/lib/tdb2/test/failtest_helper.c
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "failtest_helper.h"
-#include "logging.h"
-#include <string.h>
-#include "tap-interface.h"
-bool failtest_suppress = false;
-/* FIXME: From ccan/str */
-static inline bool strends(const char *str, const char *postfix)
- if (strlen(str) < strlen(postfix))
- return false;
- return !strcmp(str + strlen(str) - strlen(postfix), postfix);
-bool failmatch(const struct failtest_call *call,
- const char *file, int line, enum failtest_call_type type)
- return call->type == type
- && call->line == line
- && ((strcmp(call->file, file) == 0)
- || (strends(call->file, file)
- && (call->file[strlen(call->file) - strlen(file) - 1]
- == '/')));
-static bool is_nonblocking_lock(const struct failtest_call *call)
- return call->type == FAILTEST_FCNTL && call->u.fcntl.cmd == F_SETLK;
-static bool is_unlock(const struct failtest_call *call)
- return call->type == FAILTEST_FCNTL
- && call->u.fcntl.arg.fl.l_type == F_UNLCK;
-bool exit_check_log(struct tlist_calls *history)
- const struct failtest_call *i;
- tlist_for_each(history, i, list) {
- if (!i->fail)
- continue;
- /* Failing the /dev/urandom open doesn't count: we fall back. */
- if (failmatch(i, URANDOM_OPEN))
- continue;
- /* Similarly with read fail. */
- if (failmatch(i, URANDOM_READ))
- continue;
- /* Initial allocation of tdb doesn't log. */
- if (failmatch(i, INITIAL_TDB_MALLOC))
- continue;
- /* We don't block "failures" on non-blocking locks. */
- if (is_nonblocking_lock(i))
- continue;
- if (!tap_log_messages)
- diag("We didn't log for %s:%u", i->file, i->line);
- return tap_log_messages != 0;
- }
- return true;
-/* Some places we soldier on despite errors: only fail them once. */
-enum failtest_result
-block_repeat_failures(struct tlist_calls *history)
- const struct failtest_call *last;
- last = tlist_tail(history, list);
- if (failtest_suppress)
- return FAIL_DONT_FAIL;
- if (failmatch(last, INITIAL_TDB_MALLOC)
- || failmatch(last, URANDOM_OPEN)
- || failmatch(last, URANDOM_READ)) {
- return FAIL_PROBE;
- }
- /* We handle mmap failing, by falling back to read/write, so
- * don't try all possible paths. */
- if (last->type == FAILTEST_MMAP)
- return FAIL_PROBE;
- /* Unlock or non-blocking lock is fail-once. */
- if (is_unlock(last) || is_nonblocking_lock(last))
- return FAIL_PROBE;
- return FAIL_OK;
diff --git a/lib/tdb2/test/failtest_helper.h b/lib/tdb2/test/failtest_helper.h
deleted file mode 100644
index 3c509e7c38..0000000000
--- a/lib/tdb2/test/failtest_helper.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <ccan/failtest/failtest.h>
-#include <stdbool.h>
-/* FIXME: Check these! */
-#define URANDOM_OPEN "open.c", 62, FAILTEST_OPEN
-#define URANDOM_READ "open.c", 42, FAILTEST_READ
-bool exit_check_log(struct tlist_calls *history);
-bool failmatch(const struct failtest_call *call,
- const char *file, int line, enum failtest_call_type type);
-enum failtest_result block_repeat_failures(struct tlist_calls *history);
-/* Set this to suppress failure. */
-extern bool failtest_suppress;
-#endif /* TDB2_TEST_LOGGING_H */
diff --git a/lib/tdb2/test/helpapi-external-agent.c b/lib/tdb2/test/helpapi-external-agent.c
deleted file mode 100644
index 59e1c6cbee..0000000000
--- a/lib/tdb2/test/helpapi-external-agent.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "external-agent.h"
-/* This isn't possible with via the tdb2 API, but this makes it link. */
-enum agent_return external_agent_needs_rec(struct tdb_context *tdb)
- return FAILED;
diff --git a/lib/tdb2/test/helprun-external-agent.c b/lib/tdb2/test/helprun-external-agent.c
deleted file mode 100644
index 9f243824fd..0000000000
--- a/lib/tdb2/test/helprun-external-agent.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "external-agent.h"
-#include "private.h"
-enum agent_return external_agent_needs_rec(struct tdb_context *tdb)
- return tdb_needs_recovery(tdb) ? SUCCESS : FAILED;
diff --git a/lib/tdb2/test/helprun-layout.c b/lib/tdb2/test/helprun-layout.c
deleted file mode 100644
index b9cd4a6432..0000000000
--- a/lib/tdb2/test/helprun-layout.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/* TDB tools to create various canned database layouts. */
-#include "layout.h"
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <ccan/err/err.h>
-#include "logging.h"
-struct tdb_layout *new_tdb_layout(void)
- struct tdb_layout *layout = malloc(sizeof(*layout));
- layout->num_elems = 0;
- layout->elem = NULL;
- return layout;
-static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
- layout->elem = realloc(layout->elem,
- sizeof(layout->elem[0])
- * (layout->num_elems+1));
- layout->elem[layout->num_elems++] = elem;
-void tdb_layout_add_freetable(struct tdb_layout *layout)
- union tdb_layout_elem elem;
- elem.base.type = FREETABLE;
- add(layout, elem);
-void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
- unsigned ftable)
- union tdb_layout_elem elem;
- elem.base.type = FREE;
- = len;
- = ftable;
- add(layout, elem);
-void tdb_layout_add_capability(struct tdb_layout *layout,
- uint64_t type,
- bool write_breaks,
- bool check_breaks,
- bool open_breaks,
- tdb_len_t extra)
- union tdb_layout_elem elem;
- elem.base.type = CAPABILITY;
- elem.capability.type = type;
- if (write_breaks)
- elem.capability.type |= TDB_CAP_NOWRITE;
- if (open_breaks)
- elem.capability.type |= TDB_CAP_NOOPEN;
- if (check_breaks)
- elem.capability.type |= TDB_CAP_NOCHECK;
- elem.capability.extra = extra;
- add(layout, elem);
-static struct tdb_data dup_key(struct tdb_data key)
- struct tdb_data ret;
- ret.dsize = key.dsize;
- ret.dptr = malloc(ret.dsize);
- memcpy(ret.dptr, key.dptr, ret.dsize);
- return ret;
-void tdb_layout_add_used(struct tdb_layout *layout,
- TDB_DATA key, TDB_DATA data,
- tdb_len_t extra)
- union tdb_layout_elem elem;
- elem.base.type = DATA;
- elem.used.key = dup_key(key);
- = dup_key(data);
- elem.used.extra = extra;
- add(layout, elem);
-static tdb_len_t free_record_len(tdb_len_t len)
- return sizeof(struct tdb_used_record) + len;
-static tdb_len_t data_record_len(struct tle_used *used)
- tdb_len_t len;
- len = sizeof(struct tdb_used_record)
- + used->key.dsize + used->data.dsize + used->extra;
- assert(len >= sizeof(struct tdb_free_record));
- return len;
-static tdb_len_t hashtable_len(struct tle_hashtable *htable)
- return sizeof(struct tdb_used_record)
- + (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
- + htable->extra;
-static tdb_len_t capability_len(struct tle_capability *cap)
- return sizeof(struct tdb_capability) + cap->extra;
-static tdb_len_t freetable_len(struct tle_freetable *ftable)
- return sizeof(struct tdb_freetable);
-static void set_free_record(void *mem, tdb_len_t len)
- /* We do all the work in add_to_freetable */
-static void add_zero_pad(struct tdb_used_record *u, size_t len, size_t extra)
- if (extra)
- ((char *)(u + 1))[len] = '\0';
-static void set_data_record(void *mem, struct tdb_context *tdb,
- struct tle_used *used)
- struct tdb_used_record *u = mem;
- set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize,
- used->key.dsize + used->data.dsize + used->extra,
- tdb_hash(tdb, used->key.dptr, used->key.dsize));
- memcpy(u + 1, used->key.dptr, used->key.dsize);
- memcpy((char *)(u + 1) + used->key.dsize,
- used->data.dptr, used->data.dsize);
- add_zero_pad(u, used->key.dsize + used->data.dsize, used->extra);
-static void set_hashtable(void *mem, struct tdb_context *tdb,
- struct tle_hashtable *htable)
- struct tdb_used_record *u = mem;
- tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
- set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0);
- memset(u + 1, 0, len);
- add_zero_pad(u, len, htable->extra);
-static void set_capability(void *mem, struct tdb_context *tdb,
- struct tle_capability *cap, struct tdb_header *hdr,
- tdb_off_t last_cap)
- struct tdb_capability *c = mem;
- tdb_len_t len = sizeof(*c) - sizeof(struct tdb_used_record) + cap->extra;
- c->type = cap->type;
- c->next = 0;
- set_header(tdb, &c->hdr, TDB_CAP_MAGIC, 0, len, len, 0);
- /* Append to capability list. */
- if (!last_cap) {
- hdr->capabilities = cap->;
- } else {
- c = (struct tdb_capability *)((char *)hdr + last_cap);
- c->next = cap->;
- }
-static void set_freetable(void *mem, struct tdb_context *tdb,
- struct tle_freetable *freetable, struct tdb_header *hdr,
- tdb_off_t last_ftable)
- struct tdb_freetable *ftable = mem;
- memset(ftable, 0, sizeof(*ftable));
- set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0,
- sizeof(*ftable) - sizeof(ftable->hdr),
- sizeof(*ftable) - sizeof(ftable->hdr), 0);
- if (last_ftable) {
- ftable = (struct tdb_freetable *)((char *)hdr + last_ftable);
- ftable->next = freetable->;
- } else {
- hdr->free_table = freetable->;
- }
-static void add_to_freetable(struct tdb_context *tdb,
- tdb_off_t eoff,
- tdb_off_t elen,
- unsigned ftable,
- struct tle_freetable *freetable)
- tdb->ftable_off = freetable->;
- tdb->ftable = ftable;
- add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen,
- TDB_LOCK_WAIT, false);
-static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup)
- return group_start
- + (ingroup % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
-/* Get bits from a value. */
-static uint32_t bits(uint64_t val, unsigned start, unsigned num)
- assert(num <= 32);
- return (val >> start) & ((1U << num) - 1);
-/* We take bits from the top: that way we can lock whole sections of the hash
- * by using lock ranges. */
-static uint32_t use_bits(uint64_t h, unsigned num, unsigned *used)
- *used += num;
- return bits(h, 64 - *used, num);
-static tdb_off_t encode_offset(tdb_off_t new_off, unsigned bucket,
- uint64_t h)
- return bucket
- | new_off
- | ((uint64_t)bits(h, 64 - TDB_OFF_UPPER_STEAL_EXTRA,
-/* FIXME: Our hash table handling here is primitive: we don't expand! */
-static void add_to_hashtable(struct tdb_context *tdb,
- tdb_off_t eoff,
- struct tdb_data key)
- uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
- tdb_off_t b_off, group_start;
- unsigned i, group, in_group;
- unsigned used = 0;
- group = use_bits(h, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, &used);
- in_group = use_bits(h, TDB_HASH_GROUP_BITS, &used);
- group_start = offsetof(struct tdb_header, hashtable)
- + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
- for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
- unsigned bucket = (in_group + i) % (1 << TDB_HASH_GROUP_BITS);
- b_off = hbucket_off(group_start, bucket);
- if (tdb_read_off(tdb, b_off) == 0) {
- tdb_write_off(tdb, b_off,
- encode_offset(eoff, in_group, h));
- return;
- }
- }
- abort();
-static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
- unsigned i;
- for (i = 0; i < layout->num_elems; i++) {
- if (layout->elem[i].base.type != FREETABLE)
- continue;
- if (num == 0)
- return &layout->elem[i].ftable;
- num--;
- }
- abort();
-/* FIXME: Support TDB_CONVERT */
-struct tdb_context *tdb_layout_get(struct tdb_layout *layout,
- void (*freefn)(void *),
- union tdb_attribute *attr)
- unsigned int i;
- tdb_off_t off, len, last_ftable, last_cap;
- char *mem;
- struct tdb_context *tdb;
- off = sizeof(struct tdb_header);
- /* First pass of layout: calc lengths */
- for (i = 0; i < layout->num_elems; i++) {
- union tdb_layout_elem *e = &layout->elem[i];
- e-> = off;
- switch (e->base.type) {
- len = freetable_len(&e->ftable);
- break;
- case FREE:
- len = free_record_len(e->free.len);
- break;
- case DATA:
- len = data_record_len(&e->used);
- break;
- len = hashtable_len(&e->hashtable);
- break;
- len = capability_len(&e->capability);
- break;
- default:
- abort();
- }
- off += len;
- }
- mem = malloc(off);
- /* Fill with some weird pattern. */
- memset(mem, 0x99, off);
- /* Now populate our header, cribbing from a real TDB header. */
- tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, attr);
- memcpy(mem, tdb->file->map_ptr, sizeof(struct tdb_header));
- /* Mug the tdb we have to make it use this. */
- freefn(tdb->file->map_ptr);
- tdb->file->map_ptr = mem;
- tdb->file->map_size = off;
- last_ftable = 0;
- last_cap = 0;
- for (i = 0; i < layout->num_elems; i++) {
- union tdb_layout_elem *e = &layout->elem[i];
- switch (e->base.type) {
- set_freetable(mem + e->, tdb, &e->ftable,
- (struct tdb_header *)mem, last_ftable);
- last_ftable = e->;
- break;
- case FREE:
- set_free_record(mem + e->, e->free.len);
- break;
- case DATA:
- set_data_record(mem + e->, tdb, &e->used);
- break;
- set_hashtable(mem + e->, tdb, &e->hashtable);
- break;
- set_capability(mem + e->, tdb, &e->capability,
- (struct tdb_header *)mem, last_cap);
- last_cap = e->;
- break;
- }
- }
- /* Must have a free table! */
- assert(last_ftable);
- /* Now fill the free and hash tables. */
- for (i = 0; i < layout->num_elems; i++) {
- union tdb_layout_elem *e = &layout->elem[i];
- switch (e->base.type) {
- case FREE:
- add_to_freetable(tdb, e->, e->free.len,
- e->free.ftable_num,
- find_ftable(layout, e->free.ftable_num));
- break;
- case DATA:
- add_to_hashtable(tdb, e->, e->used.key);
- break;
- default:
- break;
- }
- }
- tdb->ftable_off = find_ftable(layout, 0)->;
- return tdb;
-void tdb_layout_write(struct tdb_layout *layout, void (*freefn)(void *),
- union tdb_attribute *attr, const char *filename)
- struct tdb_context *tdb = tdb_layout_get(layout, freefn, attr);
- int fd;
- fd = open(filename, O_WRONLY|O_TRUNC|O_CREAT, 0600);
- if (fd < 0)
- err(1, "opening %s for writing", filename);
- if (write(fd, tdb->file->map_ptr, tdb->file->map_size)
- != tdb->file->map_size)
- err(1, "writing %s", filename);
- close(fd);
- tdb_close(tdb);
-void tdb_layout_free(struct tdb_layout *layout)
- unsigned int i;
- for (i = 0; i < layout->num_elems; i++) {
- if (layout->elem[i].base.type == DATA) {
- free(layout->elem[i].used.key.dptr);
- free(layout->elem[i];
- }
- }
- free(layout->elem);
- free(layout);
diff --git a/lib/tdb2/test/layout.h b/lib/tdb2/test/layout.h
deleted file mode 100644
index 3aadf20ee2..0000000000
--- a/lib/tdb2/test/layout.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "private.h"
-struct tdb_layout *new_tdb_layout(void);
-void tdb_layout_add_freetable(struct tdb_layout *layout);
-void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
- unsigned ftable);
-void tdb_layout_add_used(struct tdb_layout *layout,
- TDB_DATA key, TDB_DATA data,
- tdb_len_t extra);
-void tdb_layout_add_capability(struct tdb_layout *layout,
- uint64_t type,
- bool write_breaks,
- bool check_breaks,
- bool open_breaks,
- tdb_len_t extra);
-#if 0 /* FIXME: Allow allocation of subtables */
-void tdb_layout_add_hashtable(struct tdb_layout *layout,
- int htable_parent, /* -1 == toplevel */
- unsigned int bucket,
- tdb_len_t extra);
-/* freefn is needed if we're using failtest_free. */
-struct tdb_context *tdb_layout_get(struct tdb_layout *layout,
- void (*freefn)(void *),
- union tdb_attribute *attr);
-void tdb_layout_write(struct tdb_layout *layout, void (*freefn)(void *),
- union tdb_attribute *attr, const char *filename);
-void tdb_layout_free(struct tdb_layout *layout);
-enum layout_type {
-/* Shared by all union members. */
-struct tle_base {
- enum layout_type type;
- tdb_off_t off;
-struct tle_freetable {
- struct tle_base base;
-struct tle_free {
- struct tle_base base;
- tdb_len_t len;
- unsigned ftable_num;
-struct tle_used {
- struct tle_base base;
- TDB_DATA key;
- TDB_DATA data;
- tdb_len_t extra;
-struct tle_hashtable {
- struct tle_base base;
- int parent;
- unsigned int bucket;
- tdb_len_t extra;
-struct tle_capability {
- struct tle_base base;
- uint64_t type;
- tdb_len_t extra;
-union tdb_layout_elem {
- struct tle_base base;
- struct tle_freetable ftable;
- struct tle_free free;
- struct tle_used used;
- struct tle_hashtable hashtable;
- struct tle_capability capability;
-struct tdb_layout {
- unsigned int num_elems;
- union tdb_layout_elem *elem;
-#endif /* TDB2_TEST_LAYOUT_H */
diff --git a/lib/tdb2/test/lock-tracking.c b/lib/tdb2/test/lock-tracking.c
deleted file mode 100644
index c7387ead99..0000000000
--- a/lib/tdb2/test/lock-tracking.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/* We save the locks so we can reaquire them. */
-#include "private.h" /* For TDB_HASH_LOCK_START, etc. */
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include "tap-interface.h"
-#include "lock-tracking.h"
-struct lock {
- struct lock *next;
- unsigned int off;
- unsigned int len;
- int type;
-static struct lock *locks;
-int locking_errors = 0;
-bool suppress_lockcheck = false;
-bool nonblocking_locks;
-int locking_would_block = 0;
-void (*unlock_callback)(int fd);
-int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ )
- va_list ap;
- int ret, arg3;
- struct flock *fl;
- bool may_block = false;
- if (cmd != F_SETLK && cmd != F_SETLKW) {
- /* This may be totally bogus, but we don't know in general. */
- va_start(ap, cmd);
- arg3 = va_arg(ap, int);
- va_end(ap);
- return fcntl(fd, cmd, arg3);
- }
- va_start(ap, cmd);
- fl = va_arg(ap, struct flock *);
- va_end(ap);
- if (cmd == F_SETLKW && nonblocking_locks) {
- cmd = F_SETLK;
- may_block = true;
- }
- ret = fcntl(fd, cmd, fl);
- /* Detect when we failed, but might have been OK if we waited. */
- if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) {
- locking_would_block++;
- }
- if (fl->l_type == F_UNLCK) {
- struct lock **l;
- struct lock *old = NULL;
- for (l = &locks; *l; l = &(*l)->next) {
- if ((*l)->off == fl->l_start
- && (*l)->len == fl->l_len) {
- if (ret == 0) {
- old = *l;
- *l = (*l)->next;
- free(old);
- }
- break;
- }
- }
- if (!old && !suppress_lockcheck) {
- diag("Unknown unlock %u@%u - %i",
- (int)fl->l_len, (int)fl->l_start, ret);
- locking_errors++;
- }
- } else {
- struct lock *new, *i;
- unsigned int fl_end = fl->l_start + fl->l_len;
- if (fl->l_len == 0)
- fl_end = (unsigned int)-1;
- /* Check for overlaps: we shouldn't do this. */
- for (i = locks; i; i = i->next) {
- unsigned int i_end = i->off + i->len;
- if (i->len == 0)
- i_end = (unsigned int)-1;
- if (fl->l_start >= i->off && fl->l_start < i_end)
- break;
- if (fl_end > i->off && fl_end < i_end)
- break;
- /* tdb_allrecord_lock does this, handle adjacent: */
- if (fl->l_start > TDB_HASH_LOCK_START
- && fl->l_start == i_end && fl->l_type == i->type) {
- if (ret == 0) {
- i->len = fl->l_len
- ? i->len + fl->l_len
- : 0;
- }
- goto done;
- }
- }
- if (i) {
- /* Special case: upgrade of allrecord lock. */
- if (i->type == F_RDLCK && fl->l_type == F_WRLCK
- && i->off == TDB_HASH_LOCK_START
- && fl->l_start == TDB_HASH_LOCK_START
- && i->len == 0
- && fl->l_len == 0) {
- if (ret == 0)
- i->type = F_WRLCK;
- goto done;
- }
- if (!suppress_lockcheck) {
- diag("%s lock %u@%u overlaps %u@%u",
- fl->l_type == F_WRLCK ? "write" : "read",
- (int)fl->l_len, (int)fl->l_start,
- i->len, (int)i->off);
- locking_errors++;
- }
- }
- if (ret == 0) {
- new = malloc(sizeof *new);
- new->off = fl->l_start;
- new->len = fl->l_len;
- new->type = fl->l_type;
- new->next = locks;
- locks = new;
- }
- }
- if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback)
- unlock_callback(fd);
- return ret;
-unsigned int forget_locking(void)
- unsigned int num = 0;
- while (locks) {
- struct lock *next = locks->next;
- free(locks);
- locks = next;
- num++;
- }
- return num;
diff --git a/lib/tdb2/test/lock-tracking.h b/lib/tdb2/test/lock-tracking.h
deleted file mode 100644
index f2c9c44653..0000000000
--- a/lib/tdb2/test/lock-tracking.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <stdbool.h>
-/* Set this if you want a callback after fnctl unlock. */
-extern void (*unlock_callback)(int fd);
-/* Replacement fcntl. */
-int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ );
-/* Discard locking info: returns number of locks outstanding. */
-unsigned int forget_locking(void);
-/* Number of errors in locking. */
-extern int locking_errors;
-/* Suppress lock checking. */
-extern bool suppress_lockcheck;
-/* Make all locks non-blocking. */
-extern bool nonblocking_locks;
-/* Number of times we failed a lock because we made it non-blocking. */
-extern int locking_would_block;
-#endif /* LOCK_TRACKING_H */
diff --git a/lib/tdb2/test/logging.c b/lib/tdb2/test/logging.c
deleted file mode 100644
index 86fc152bab..0000000000
--- a/lib/tdb2/test/logging.c
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "tap-interface.h"
-#include "logging.h"
-unsigned tap_log_messages;
-const char *log_prefix = "";
-char *log_last = NULL;
-bool suppress_logging;
-union tdb_attribute tap_log_attr = {
- .log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
- .fn = tap_log_fn }
-void tap_log_fn(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message, void *priv)
- if (suppress_logging)
- return;
- diag("tdb log level %u: %s: %s%s",
- level, tdb_errorstr(ecode), log_prefix, message);
- if (log_last)
- free(log_last);
- log_last = strdup(message);
- tap_log_messages++;
diff --git a/lib/tdb2/test/logging.h b/lib/tdb2/test/logging.h
deleted file mode 100644
index 5f517dc592..0000000000
--- a/lib/tdb2/test/logging.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "tdb2.h"
-#include <stdbool.h>
-#include <string.h>
-extern bool suppress_logging;
-extern const char *log_prefix;
-extern unsigned tap_log_messages;
-extern union tdb_attribute tap_log_attr;
-extern char *log_last;
-void tap_log_fn(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message, void *priv);
-#endif /* TDB2_TEST_LOGGING_H */
diff --git a/lib/tdb2/test/run-001-encode.c b/lib/tdb2/test/run-001-encode.c
deleted file mode 100644
index 9657eb79d0..0000000000
--- a/lib/tdb2/test/run-001-encode.c
+++ /dev/null
@@ -1,41 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_used_record rec;
- struct tdb_context tdb = { .log_fn = tap_log_fn };
- plan_tests(64 + 32 + 48*6 + 1);
- /* We should be able to encode any data value. */
- for (i = 0; i < 64; i++)
- ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
- 1ULL << i, 0) == 0);
- /* And any key and data with < 64 bits between them. */
- for (i = 0; i < 32; i++) {
- tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
- ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
- klen + dlen, 0) == 0);
- }
- /* We should neatly encode all values. */
- for (i = 0; i < 48; i++) {
- uint64_t h = 1ULL << (i < 5 ? i : 4);
- uint64_t klen = 1ULL << (i < 16 ? i : 15);
- uint64_t dlen = 1ULL << i;
- uint64_t xlen = 1ULL << (i < 32 ? i : 31);
- ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
- klen+dlen+xlen, h) == 0);
- ok1(rec_key_length(&rec) == klen);
- ok1(rec_data_length(&rec) == dlen);
- ok1(rec_extra_padding(&rec) == xlen);
- ok1((uint64_t)rec_hash(&rec) == h);
- ok1(rec_magic(&rec) == TDB_USED_MAGIC);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-001-fls.c b/lib/tdb2/test/run-001-fls.c
deleted file mode 100644
index 792adbf655..0000000000
--- a/lib/tdb2/test/run-001-fls.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-static unsigned int dumb_fls(uint64_t num)
- int i;
- for (i = 63; i >= 0; i--) {
- if (num & (1ULL << i))
- break;
- }
- return i + 1;
-int main(int argc, char *argv[])
- unsigned int i, j;
- plan_tests(64 * 64 + 2);
- ok1(fls64(0) == 0);
- ok1(dumb_fls(0) == 0);
- for (i = 0; i < 64; i++) {
- for (j = 0; j < 64; j++) {
- uint64_t val = (1ULL << i) | (1ULL << j);
- ok(fls64(val) == dumb_fls(val),
- "%llu -> %u should be %u", (long long)val,
- fls64(val), dumb_fls(val));
- }
- }
- return exit_status();
diff --git a/lib/tdb2/test/run-01-new_database.c b/lib/tdb2/test/run-01-new_database.c
deleted file mode 100644
index 00c15140df..0000000000
--- a/lib/tdb2/test/run-01-new_database.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-new_database.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- failtest_exit(exit_status());
- failtest_suppress = true;
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- failtest_suppress = false;
- tdb_close(tdb);
- if (!ok1(tap_log_messages == 0))
- break;
- }
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-02-expand.c b/lib/tdb2/test/run-02-expand.c
deleted file mode 100644
index fd1ae4be34..0000000000
--- a/lib/tdb2/test/run-02-expand.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-int main(int argc, char *argv[])
- unsigned int i;
- uint64_t val;
- struct tdb_context *tdb;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1);
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- failtest_suppress = true;
- tdb = tdb_open("run-expand.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- break;
- val = tdb->file->map_size;
- /* Need some hash lock for expand. */
- ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
- failtest_suppress = false;
- if (!ok1(tdb_expand(tdb, 1) == 0)) {
- failtest_suppress = true;
- tdb_close(tdb);
- break;
- }
- failtest_suppress = true;
- ok1(tdb->file->map_size >= val + 1 * TDB_EXTENSION_FACTOR);
- ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- val = tdb->file->map_size;
- ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
- failtest_suppress = false;
- if (!ok1(tdb_expand(tdb, 1024) == 0)) {
- failtest_suppress = true;
- tdb_close(tdb);
- break;
- }
- failtest_suppress = true;
- ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
- ok1(tdb->file->map_size >= val + 1024 * TDB_EXTENSION_FACTOR);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-03-coalesce.c b/lib/tdb2/test/run-03-coalesce.c
deleted file mode 100644
index ecc469fa32..0000000000
--- a/lib/tdb2/test/run-03-coalesce.c
+++ /dev/null
@@ -1,178 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-#include "layout.h"
-static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
- struct tdb_free_record f;
- enum TDB_ERROR ecode;
- ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
- if (ecode != TDB_SUCCESS)
- return ecode;
- if (frec_magic(&f) != TDB_FREE_MAGIC)
- return frec_len(&f);
-int main(int argc, char *argv[])
- tdb_off_t b_off, test;
- struct tdb_context *tdb;
- struct tdb_layout *layout;
- struct tdb_data data, key;
- tdb_len_t len;
- /* FIXME: Test lock order fail. */
- plan_tests(42);
- data = tdb_mkdata("world", 5);
- key = tdb_mkdata("hello", 5);
- /* No coalescing can be done due to EOF */
- layout = new_tdb_layout();
- tdb_layout_add_freetable(layout);
- len = 1024;
- tdb_layout_add_free(layout, len, 0);
- tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
- /* NOMMAP is for lockcheck. */
- tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
- &tap_log_attr);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(free_record_length(tdb, layout->elem[1] == len);
- /* Figure out which bucket free entry is. */
- b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
- /* Lock and fail to coalesce. */
- ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
- test = layout->elem[1];
- ok1(coalesce(tdb, layout->elem[1], b_off, len, &test)
- == 0);
- tdb_unlock_free_bucket(tdb, b_off);
- ok1(free_record_length(tdb, layout->elem[1] == len);
- ok1(test == layout->elem[1];
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- tdb_layout_free(layout);
- /* No coalescing can be done due to used record */
- layout = new_tdb_layout();
- tdb_layout_add_freetable(layout);
- tdb_layout_add_free(layout, 1024, 0);
- tdb_layout_add_used(layout, key, data, 6);
- tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
- /* NOMMAP is for lockcheck. */
- tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
- &tap_log_attr);
- ok1(free_record_length(tdb, layout->elem[1] == 1024);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Figure out which bucket free entry is. */
- b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
- /* Lock and fail to coalesce. */
- ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
- test = layout->elem[1];
- ok1(coalesce(tdb, layout->elem[1], b_off, 1024, &test)
- == 0);
- tdb_unlock_free_bucket(tdb, b_off);
- ok1(free_record_length(tdb, layout->elem[1] == 1024);
- ok1(test == layout->elem[1];
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- tdb_layout_free(layout);
- /* Coalescing can be done due to two free records, then EOF */
- layout = new_tdb_layout();
- tdb_layout_add_freetable(layout);
- tdb_layout_add_free(layout, 1024, 0);
- tdb_layout_add_free(layout, 2048, 0);
- tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
- /* NOMMAP is for lockcheck. */
- tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
- &tap_log_attr);
- ok1(free_record_length(tdb, layout->elem[1] == 1024);
- ok1(free_record_length(tdb, layout->elem[2] == 2048);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Figure out which bucket (first) free entry is. */
- b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
- /* Lock and coalesce. */
- ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
- test = layout->elem[2];
- ok1(coalesce(tdb, layout->elem[1], b_off, 1024, &test)
- == 1024 + sizeof(struct tdb_used_record) + 2048);
- /* Should tell us it's erased this one... */
- ok1(test == TDB_ERR_NOEXIST);
- ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
- ok1(free_record_length(tdb, layout->elem[1]
- == 1024 + sizeof(struct tdb_used_record) + 2048);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- tdb_layout_free(layout);
- /* Coalescing can be done due to two free records, then data */
- layout = new_tdb_layout();
- tdb_layout_add_freetable(layout);
- tdb_layout_add_free(layout, 1024, 0);
- tdb_layout_add_free(layout, 512, 0);
- tdb_layout_add_used(layout, key, data, 6);
- tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
- /* NOMMAP is for lockcheck. */
- tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
- &tap_log_attr);
- ok1(free_record_length(tdb, layout->elem[1] == 1024);
- ok1(free_record_length(tdb, layout->elem[2] == 512);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Figure out which bucket free entry is. */
- b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
- /* Lock and coalesce. */
- ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
- test = layout->elem[2];
- ok1(coalesce(tdb, layout->elem[1], b_off, 1024, &test)
- == 1024 + sizeof(struct tdb_used_record) + 512);
- ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
- ok1(free_record_length(tdb, layout->elem[1]
- == 1024 + sizeof(struct tdb_used_record) + 512);
- ok1(test == TDB_ERR_NOEXIST);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- tdb_layout_free(layout);
- /* Coalescing can be done due to three free records, then EOF */
- layout = new_tdb_layout();
- tdb_layout_add_freetable(layout);
- tdb_layout_add_free(layout, 1024, 0);
- tdb_layout_add_free(layout, 512, 0);
- tdb_layout_add_free(layout, 256, 0);
- tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
- /* NOMMAP is for lockcheck. */
- tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
- &tap_log_attr);
- ok1(free_record_length(tdb, layout->elem[1] == 1024);
- ok1(free_record_length(tdb, layout->elem[2] == 512);
- ok1(free_record_length(tdb, layout->elem[3] == 256);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Figure out which bucket free entry is. */
- b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
- /* Lock and coalesce. */
- ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
- test = layout->elem[2];
- ok1(coalesce(tdb, layout->elem[1], b_off, 1024, &test)
- == 1024 + sizeof(struct tdb_used_record) + 512
- + sizeof(struct tdb_used_record) + 256);
- ok1(tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0);
- ok1(free_record_length(tdb, layout->elem[1]
- == 1024 + sizeof(struct tdb_used_record) + 512
- + sizeof(struct tdb_used_record) + 256);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- tdb_layout_free(layout);
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-04-basichash.c b/lib/tdb2/test/run-04-basichash.c
deleted file mode 100644
index dc75fc72dc..0000000000
--- a/lib/tdb2/test/run-04-basichash.c
+++ /dev/null
@@ -1,260 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-/* We rig the hash so adjacent-numbered records always clash. */
-static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
- return ((uint64_t)*(const unsigned int *)key)
- << (64 - TDB_TOPLEVEL_HASH_BITS - 1);
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- unsigned int v;
- struct tdb_used_record rec;
- struct tdb_data key = { (unsigned char *)&v, sizeof(v) };
- struct tdb_data dbuf = { (unsigned char *)&v, sizeof(v) };
- union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = clash } };
- };
- = &tap_log_attr;
- plan_tests(sizeof(flags) / sizeof(flags[0])
- * (91 + (2 * ((1 << TDB_HASH_GROUP_BITS) - 1))) + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- struct hash_info h;
- tdb_off_t new_off, off, subhash;
- tdb = tdb_open("run-04-basichash.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
- ok1(tdb);
- if (!tdb)
- continue;
- v = 0;
- /* Should not find it. */
- ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in group 0, bucket 0. */
- ok1(h.group_start == offsetof(struct tdb_header, hashtable));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 0);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
- /* Should have lock on bucket 0 */
- ok1(h.hlock_start == 0);
- ok1(h.hlock_range ==
- ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
- ok1((tdb->flags & TDB_NOLOCK)
- || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
- /* FIXME: Check lock length */
- /* Allocate a new record. */
- new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
- TDB_USED_MAGIC, false);
- ok1(!TDB_OFF_IS_ERR(new_off));
- /* We should be able to add it now. */
- ok1(add_to_hash(tdb, &h, new_off) == 0);
- /* Make sure we fill it in for later finding. */
- off = new_off + sizeof(struct tdb_used_record);
- ok1(!tdb->io->twrite(tdb, off, key.dptr, key.dsize));
- off += key.dsize;
- ok1(!tdb->io->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
- /* We should be able to unlock that OK. */
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_WRLCK) == 0);
- /* Database should be consistent. */
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Now, this should give a successful lookup. */
- ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
- == new_off);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in group 0, bucket 0. */
- ok1(h.group_start == offsetof(struct tdb_header, hashtable));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 0);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
- /* Should have lock on bucket 0 */
- ok1(h.hlock_start == 0);
- ok1(h.hlock_range ==
- ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
- ok1((tdb->flags & TDB_NOLOCK)
- || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
- /* FIXME: Check lock length */
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_WRLCK) == 0);
- /* Database should be consistent. */
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Test expansion. */
- v = 1;
- ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in group 0, bucket 1. */
- ok1(h.group_start == offsetof(struct tdb_header, hashtable));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 1);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
- /* Should have lock on bucket 0 */
- ok1(h.hlock_start == 0);
- ok1(h.hlock_range ==
- ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
- ok1((tdb->flags & TDB_NOLOCK)
- || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
- /* FIXME: Check lock length */
- /* Make it expand 0'th bucket. */
- ok1(expand_group(tdb, &h) == 0);
- /* First one should be subhash, next should be empty. */
- ok1(is_subhash([0]));
- subhash = ([0] & TDB_OFF_MASK);
- for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
- ok1([j] == 0);
- ok1(tdb_write_convert(tdb, h.group_start,
-, sizeof( == 0);
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_WRLCK) == 0);
- /* Should be happy with expansion. */
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Should be able to find it. */
- v = 0;
- ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
- == new_off);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in expanded group 0, bucket 0. */
- ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 0);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
- /* Should have lock on bucket 0 */
- ok1(h.hlock_start == 0);
- ok1(h.hlock_range ==
- ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
- ok1((tdb->flags & TDB_NOLOCK)
- || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
- /* FIXME: Check lock length */
- /* Simple delete should work. */
- ok1(delete_from_hash(tdb, &h) == 0);
- ok1(add_free_record(tdb, new_off,
- sizeof(struct tdb_used_record)
- + rec_key_length(&rec)
- + rec_data_length(&rec)
- + rec_extra_padding(&rec),
- TDB_LOCK_NOWAIT, false) == 0);
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_WRLCK) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Test second-level expansion: should expand 0th bucket. */
- v = 0;
- ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in group 0, bucket 0. */
- ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 0);
- /* Should have lock on bucket 0 */
- ok1(h.hlock_start == 0);
- ok1(h.hlock_range ==
- ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
- ok1((tdb->flags & TDB_NOLOCK)
- || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
- /* FIXME: Check lock length */
- ok1(expand_group(tdb, &h) == 0);
- /* First one should be subhash, next should be empty. */
- ok1(is_subhash([0]));
- subhash = ([0] & TDB_OFF_MASK);
- for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
- ok1([j] == 0);
- ok1(tdb_write_convert(tdb, h.group_start,
-, sizeof( == 0);
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_WRLCK) == 0);
- /* Should be happy with expansion. */
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in group 0, bucket 0. */
- ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 0);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
- /* We should be able to add it now. */
- /* Allocate a new record. */
- new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
- TDB_USED_MAGIC, false);
- ok1(!TDB_OFF_IS_ERR(new_off));
- ok1(add_to_hash(tdb, &h, new_off) == 0);
- /* Make sure we fill it in for later finding. */
- off = new_off + sizeof(struct tdb_used_record);
- ok1(!tdb->io->twrite(tdb, off, key.dptr, key.dsize));
- off += key.dsize;
- ok1(!tdb->io->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
- /* We should be able to unlock that OK. */
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_WRLCK) == 0);
- /* Database should be consistent. */
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Should be able to find it. */
- v = 0;
- ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
- == new_off);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in expanded group 0, bucket 0. */
- ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 0);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-05-readonly-open.c b/lib/tdb2/test/run-05-readonly-open.c
deleted file mode 100644
index 1046a8b47e..0000000000
--- a/lib/tdb2/test/run-05-readonly-open.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4), d;
- union tdb_attribute seed_attr;
- unsigned int msgs = 0;
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
- = &tap_log_attr;
- seed_attr.seed.seed = 0;
- failtest_suppress = true;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
- &seed_attr);
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- tdb_close(tdb);
- failtest_suppress = false;
- tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
- O_RDONLY, 0600, &tap_log_attr);
- if (!ok1(tdb))
- break;
- ok1(tap_log_messages == msgs);
- /* Fetch should succeed, stores should fail. */
- if (!ok1(tdb_fetch(tdb, key, &d) == 0))
- goto fail;
- ok1(tdb_deq(d, data));
- free(d.dptr);
- if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
- goto fail;
- ok1(tap_log_messages == ++msgs);
- if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
- goto fail;
- ok1(tap_log_messages == ++msgs);
- failtest_suppress = true;
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- ok1(tap_log_messages == msgs);
- /* SIGH: failtest bug, it doesn't save the tdb file because
- * we have it read-only. If we go around again, it gets
- * changed underneath us and things get screwy. */
- if (failtest_has_failed())
- break;
- }
- failtest_exit(exit_status());
- failtest_suppress = true;
- tdb_close(tdb);
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-10-simple-store.c b/lib/tdb2/test/run-10-simple-store.c
deleted file mode 100644
index 66bf6a6a51..0000000000
--- a/lib/tdb2/test/run-10-simple-store.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- failtest_suppress = true;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-10-simple-store.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- break;
- /* Modify should fail. */
- failtest_suppress = false;
- if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
- goto fail;
- failtest_suppress = true;
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Insert should succeed. */
- failtest_suppress = false;
- if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0))
- goto fail;
- failtest_suppress = true;
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Second insert should fail. */
- failtest_suppress = false;
- if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
- goto fail;
- failtest_suppress = true;
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- failtest_exit(exit_status());
- failtest_suppress = true;
- tdb_close(tdb);
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-11-simple-fetch.c b/lib/tdb2/test/run-11-simple-fetch.c
deleted file mode 100644
index 4c41ceec6d..0000000000
--- a/lib/tdb2/test/run-11-simple-fetch.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- failtest_suppress = true;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-11-simple-fetch.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (tdb) {
- struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
- /* fetch should fail. */
- failtest_suppress = false;
- if (!ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST))
- goto fail;
- failtest_suppress = true;
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Insert should succeed. */
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Fetch should now work. */
- failtest_suppress = false;
- if (!ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
- goto fail;
- failtest_suppress = true;
- ok1(tdb_deq(d, data));
- free(d.dptr);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- }
- }
- ok1(tap_log_messages == 0);
- failtest_exit(exit_status());
- failtest_suppress = true;
- tdb_close(tdb);
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-12-check.c b/lib/tdb2/test/run-12-check.c
deleted file mode 100644
index cc57726f93..0000000000
--- a/lib/tdb2/test/run-12-check.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "private.h"
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_INTERNAL,
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- failtest_suppress = true;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 3 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-12-check.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- /* This is what we really want to test: tdb_check(). */
- failtest_suppress = false;
- if (!ok1(tdb_check(tdb, NULL, NULL) == 0))
- goto fail;
- failtest_suppress = true;
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- failtest_exit(exit_status());
- failtest_suppress = true;
- tdb_close(tdb);
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-15-append.c b/lib/tdb2/test/run-15-append.c
deleted file mode 100644
index 6578b70734..0000000000
--- a/lib/tdb2/test/run-15-append.c
+++ /dev/null
@@ -1,130 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/ilog/ilog.h>
-#include "logging.h"
-#define MAX_SIZE 13100
-#define SIZE_STEP 131
-static tdb_off_t tdb_offset(struct tdb_context *tdb, struct tdb_data key)
- tdb_off_t off;
- struct tdb_used_record urec;
- struct hash_info h;
- off = find_and_lock(tdb, key, F_RDLCK, &h, &urec, NULL);
- if (TDB_OFF_IS_ERR(off))
- return 0;
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
- return off;
-int main(int argc, char *argv[])
- unsigned int i, j, moves;
- struct tdb_context *tdb;
- unsigned char *buffer;
- tdb_off_t oldoff = 0, newoff;
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data;
- buffer = malloc(MAX_SIZE);
- for (i = 0; i < MAX_SIZE; i++)
- buffer[i] = i;
- plan_tests(sizeof(flags) / sizeof(flags[0])
- * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7)
- + 1);
- /* Using tdb_store. */
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-append.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- moves = 0;
- for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
- data.dptr = buffer;
- data.dsize = j;
- ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
- ok1(data.dsize == j);
- ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
- free(data.dptr);
- newoff = tdb_offset(tdb, key);
- if (newoff != oldoff)
- moves++;
- oldoff = newoff;
- }
- ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0));
- /* We should increase by 50% each time... */
- ok(moves <= ilog64(j / SIZE_STEP)*2,
- "Moved %u times", moves);
- tdb_close(tdb);
- }
- /* Using tdb_append. */
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- size_t prev_len = 0;
- tdb = tdb_open("run-append.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- moves = 0;
- for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
- data.dptr = buffer + prev_len;
- data.dsize = j - prev_len;
- ok1(tdb_append(tdb, key, data) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
- ok1(data.dsize == j);
- ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
- free(data.dptr);
- prev_len = data.dsize;
- newoff = tdb_offset(tdb, key);
- if (newoff != oldoff)
- moves++;
- oldoff = newoff;
- }
- ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0));
- /* We should increase by 50% each time... */
- ok(moves <= ilog64(j / SIZE_STEP)*2,
- "Moved %u times", moves);
- tdb_close(tdb);
- }
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-append.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* Huge initial store. */
- data.dptr = buffer;
- data.dsize = MAX_SIZE;
- ok1(tdb_append(tdb, key, data) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
- ok1(data.dsize == MAX_SIZE);
- ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
- free(data.dptr);
- ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
- && tdb->file->num_lockrecs == 0));
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- free(buffer);
- return exit_status();
diff --git a/lib/tdb2/test/run-20-growhash.c b/lib/tdb2/test/run-20-growhash.c
deleted file mode 100644
index 2f634a27c0..0000000000
--- a/lib/tdb2/test/run-20-growhash.c
+++ /dev/null
@@ -1,137 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-static uint64_t myhash(const void *key, size_t len, uint64_t seed, void *priv)
- return *(const uint64_t *)key;
-static void add_bits(uint64_t *val, unsigned new, unsigned new_bits,
- unsigned *done)
- *done += new_bits;
- *val |= ((uint64_t)new << (64 - *done));
-static uint64_t make_key(unsigned topgroup, unsigned topbucket,
- unsigned subgroup1, unsigned subbucket1,
- unsigned subgroup2, unsigned subbucket2)
- uint64_t key = 0;
- unsigned done = 0;
- add_bits(&key, topgroup, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
- &done);
- add_bits(&key, topbucket, TDB_HASH_GROUP_BITS, &done);
- add_bits(&key, subgroup1, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
- &done);
- add_bits(&key, subbucket1, TDB_HASH_GROUP_BITS, &done);
- add_bits(&key, subgroup2, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
- &done);
- add_bits(&key, subbucket2, TDB_HASH_GROUP_BITS, &done);
- return key;
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- uint64_t kdata;
- struct tdb_used_record rec;
- struct tdb_data key = { (unsigned char *)&kdata, sizeof(kdata) };
- struct tdb_data dbuf = { (unsigned char *)&kdata, sizeof(kdata) };
- union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = myhash } };
- };
- = &tap_log_attr;
- plan_tests(sizeof(flags) / sizeof(flags[0])
- * (9 + (20 + 2 * ((1 << TDB_HASH_GROUP_BITS) - 2))
- * (1 << TDB_HASH_GROUP_BITS)) + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- struct hash_info h;
- tdb = tdb_open("run-20-growhash.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* Fill a group. */
- for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
- kdata = make_key(0, j, 0, 0, 0, 0);
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- }
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Check first still exists. */
- kdata = make_key(0, 0, 0, 0, 0, 0);
- ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL) != 0);
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have located space in group 0, bucket 0. */
- ok1(h.group_start == offsetof(struct tdb_header, hashtable));
- ok1(h.home_bucket == 0);
- ok1(h.found_bucket == 0);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
- /* Entire group should be full! */
- for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
- ok1([j] != 0);
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_RDLCK) == 0);
- /* Now, add one more to each should expand (that) bucket. */
- for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
- unsigned int k;
- kdata = make_key(0, j, 0, 1, 0, 0);
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have moved to subhash */
- ok1(h.group_start >= sizeof(struct tdb_header));
- ok1(h.home_bucket == 1);
- ok1(h.found_bucket == 1);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_RDLCK) == 0);
- /* Keep adding, make it expand again. */
- for (k = 2; k < (1 << TDB_HASH_GROUP_BITS); k++) {
- kdata = make_key(0, j, 0, k, 0, 0);
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- }
- /* This should tip it over to sub-sub-hash. */
- kdata = make_key(0, j, 0, 0, 0, 1);
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
- /* Should have created correct hash. */
- ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
- /* Should have moved to subhash */
- ok1(h.group_start >= sizeof(struct tdb_header));
- ok1(h.home_bucket == 1);
- ok1(h.found_bucket == 1);
- ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
- ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
- F_RDLCK) == 0);
- }
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-25-hashoverload.c b/lib/tdb2/test/run-25-hashoverload.c
deleted file mode 100644
index 850321554a..0000000000
--- a/lib/tdb2/test/run-25-hashoverload.c
+++ /dev/null
@@ -1,113 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
- return 0;
-static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
- if (p)
- return tdb_delete(tdb, key);
- return 0;
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
- struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
- union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = badhash } };
- };
- = &tap_log_attr;
- plan_tests(6883);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
- tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* Fill a group. */
- for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- }
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Now store one last value: should form chain. */
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Check we can find them all. */
- for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == sizeof(j));
- ok1(d.dptr != NULL);
- ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
- free(d.dptr);
- }
- /* Now add a *lot* more. */
- for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
- j < (16 << TDB_HASH_GROUP_BITS);
- j++) {
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == sizeof(j));
- ok1(d.dptr != NULL);
- ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
- free(d.dptr);
- }
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Traverse through them. */
- ok1(tdb_traverse(tdb, trav, NULL) == j);
- /* Empty the first chain-worth. */
- for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
- ok1(tdb_delete(tdb, key) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- for (j = (1 << TDB_HASH_GROUP_BITS);
- j < (16 << TDB_HASH_GROUP_BITS);
- j++) {
- ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
- ok1(d.dsize == sizeof(j));
- ok1(d.dptr != NULL);
- ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
- free(d.dptr);
- }
- /* Traverse through them. */
- ok1(tdb_traverse(tdb, trav, NULL)
- == (15 << TDB_HASH_GROUP_BITS));
- /* Re-add */
- for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
- ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
- }
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Now try deleting as we go. */
- ok1(tdb_traverse(tdb, trav, trav)
- == (16 << TDB_HASH_GROUP_BITS));
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb_traverse(tdb, trav, NULL) == 0);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-30-exhaust-before-expand.c b/lib/tdb2/test/run-30-exhaust-before-expand.c
deleted file mode 100644
index 13bb9461d4..0000000000
--- a/lib/tdb2/test/run-30-exhaust-before-expand.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-static bool empty_freetable(struct tdb_context *tdb)
- struct tdb_freetable ftab;
- unsigned int i;
- /* Now, free table should be completely exhausted in zone 0 */
- if (tdb_read_convert(tdb, tdb->ftable_off, &ftab, sizeof(ftab)) != 0)
- abort();
- for (i = 0; i < sizeof(ftab.buckets)/sizeof(ftab.buckets[0]); i++) {
- if (ftab.buckets[i])
- return false;
- }
- return true;
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- uint64_t size;
- bool was_empty = false;
- k.dptr = (void *)&j;
- k.dsize = sizeof(j);
- tdb = tdb_open("run-30-exhaust-before-expand.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(empty_freetable(tdb));
- /* Need some hash lock for expand. */
- ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
- /* Create some free space. */
- ok1(tdb_expand(tdb, 1) == 0);
- ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(!empty_freetable(tdb));
- size = tdb->file->map_size;
- /* Insert minimal-length records until we expand. */
- for (j = 0; tdb->file->map_size == size; j++) {
- was_empty = empty_freetable(tdb);
- if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
- err(1, "Failed to store record %i", j);
- }
- /* Would have been empty before expansion, but no longer. */
- ok1(was_empty);
- ok1(!empty_freetable(tdb));
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-35-convert.c b/lib/tdb2/test/run-35-convert.c
deleted file mode 100644
index ac7939591b..0000000000
--- a/lib/tdb2/test/run-35-convert.c
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "private.h"
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-int main(int argc, char *argv[])
- unsigned int i, messages = 0;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 4);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-35-convert.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- if (!ok1(tdb))
- failtest_exit(exit_status());
- tdb_close(tdb);
- /* If we say TDB_CONVERT, it must be converted */
- tdb = tdb_open("run-35-convert.tdb",
- flags[i]|TDB_CONVERT,
- O_RDWR, 0600, &tap_log_attr);
- if (flags[i] & TDB_CONVERT) {
- if (!tdb)
- failtest_exit(exit_status());
- ok1(tdb_get_flags(tdb) & TDB_CONVERT);
- tdb_close(tdb);
- } else {
- if (!ok1(!tdb && errno == EIO))
- failtest_exit(exit_status());
- ok1(tap_log_messages == ++messages);
- if (!ok1(log_last && strstr(log_last, "TDB_CONVERT")))
- failtest_exit(exit_status());
- }
- /* If don't say TDB_CONVERT, it *may* be converted */
- tdb = tdb_open("run-35-convert.tdb",
- flags[i] & ~TDB_CONVERT,
- O_RDWR, 0600, &tap_log_attr);
- if (!tdb)
- failtest_exit(exit_status());
- ok1(tdb_get_flags(tdb) == flags[i]);
- tdb_close(tdb);
- }
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-50-multiple-freelists.c b/lib/tdb2/test/run-50-multiple-freelists.c
deleted file mode 100644
index b102876c8d..0000000000
--- a/lib/tdb2/test/run-50-multiple-freelists.c
+++ /dev/null
@@ -1,70 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-#include "layout.h"
-int main(int argc, char *argv[])
- tdb_off_t off;
- struct tdb_context *tdb;
- struct tdb_layout *layout;
- TDB_DATA key, data;
- union tdb_attribute seed;
- /* This seed value previously tickled a layout.c bug. */
- seed.base.attr = TDB_ATTRIBUTE_SEED;
- seed.seed.seed = 0xb1142bc054d035b4ULL;
- = &tap_log_attr;
- plan_tests(11);
- key = tdb_mkdata("Hello", 5);
- data = tdb_mkdata("world", 5);
- /* Create a TDB with three free tables. */
- layout = new_tdb_layout();
- tdb_layout_add_freetable(layout);
- tdb_layout_add_freetable(layout);
- tdb_layout_add_freetable(layout);
- tdb_layout_add_free(layout, 80, 0);
- /* Used record prevent coalescing. */
- tdb_layout_add_used(layout, key, data, 6);
- tdb_layout_add_free(layout, 160, 1);
- key.dsize--;
- tdb_layout_add_used(layout, key, data, 7);
- tdb_layout_add_free(layout, 320, 2);
- key.dsize--;
- tdb_layout_add_used(layout, key, data, 8);
- tdb_layout_add_free(layout, 40, 0);
- tdb = tdb_layout_get(layout, free, &seed);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
- ok1(off == layout->elem[3];
- ok1(tdb->ftable_off == layout->elem[0];
- off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
- ok1(off == layout->elem[5];
- ok1(tdb->ftable_off == layout->elem[1];
- off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
- ok1(off == layout->elem[7];
- ok1(tdb->ftable_off == layout->elem[2];
- off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
- ok1(off == layout->elem[9];
- ok1(tdb->ftable_off == layout->elem[0];
- /* Now we fail. */
- off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
- ok1(off == 0);
- tdb_close(tdb);
- tdb_layout_free(layout);
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-56-open-during-transaction.c b/lib/tdb2/test/run-56-open-during-transaction.c
deleted file mode 100644
index c514caa92b..0000000000
--- a/lib/tdb2/test/run-56-open-during-transaction.c
+++ /dev/null
@@ -1,165 +0,0 @@
-#include "private.h"
-#include <unistd.h>
-#include "lock-tracking.h"
-static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
-static ssize_t write_check(int fd, const void *buf, size_t count);
-static int ftruncate_check(int fd, off_t length);
-#define pwrite pwrite_check
-#define write write_check
-#define fcntl fcntl_with_lockcheck
-#define ftruncate ftruncate_check
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdarg.h>
-#include "external-agent.h"
-#include "logging.h"
-static struct agent *agent;
-static bool opened;
-static int errors = 0;
-#define TEST_DBNAME "run-56-open-during-transaction.tdb"
-#undef write
-#undef pwrite
-#undef fcntl
-#undef ftruncate
-static bool is_same(const char *snapshot, const char *latest, off_t len)
- unsigned i;
- for (i = 0; i < len; i++) {
- if (snapshot[i] != latest[i])
- return false;
- }
- return true;
-static bool compare_file(int fd, const char *snapshot, off_t snapshot_len)
- char *contents;
- bool ret;
- /* over-length read serves as length check. */
- contents = malloc(snapshot_len+1);
- ret = pread(fd, contents, snapshot_len+1, 0) == snapshot_len
- && is_same(snapshot, contents, snapshot_len);
- free(contents);
- return ret;
-static void check_file_intact(int fd)
- enum agent_return ret;
- struct stat st;
- char *contents;
- fstat(fd, &st);
- contents = malloc(st.st_size);
- if (pread(fd, contents, st.st_size, 0) != st.st_size) {
- diag("Read fail");
- errors++;
- return;
- }
- /* Ask agent to open file. */
- ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
- /* It's OK to open it, but it must not have changed! */
- if (!compare_file(fd, contents, st.st_size)) {
- diag("Agent changed file after opening %s",
- agent_return_name(ret));
- errors++;
- }
- if (ret == SUCCESS) {
- ret = external_agent_operation(agent, CLOSE, NULL);
- if (ret != SUCCESS) {
- diag("Agent failed to close tdb: %s",
- agent_return_name(ret));
- errors++;
- }
- } else if (ret != WOULD_HAVE_BLOCKED) {
- diag("Agent opening file gave %s",
- agent_return_name(ret));
- errors++;
- }
- free(contents);
-static void after_unlock(int fd)
- if (opened)
- check_file_intact(fd);
-static ssize_t pwrite_check(int fd,
- const void *buf, size_t count, off_t offset)
- if (opened)
- check_file_intact(fd);
- return pwrite(fd, buf, count, offset);
-static ssize_t write_check(int fd, const void *buf, size_t count)
- if (opened)
- check_file_intact(fd);
- return write(fd, buf, count);
-static int ftruncate_check(int fd, off_t length)
- if (opened)
- check_file_intact(fd);
- return ftruncate(fd, length);
-int main(int argc, char *argv[])
- const int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- int i;
- struct tdb_context *tdb;
- TDB_DATA key, data;
- plan_tests(sizeof(flags)/sizeof(flags[0]) * 5);
- agent = prepare_external_agent();
- if (!agent)
- err(1, "preparing agent");
- unlock_callback = after_unlock;
- for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
- diag("Test with %s and %s\n",
- (flags[i] & TDB_CONVERT) ? "CONVERT" : "DEFAULT",
- (flags[i] & TDB_NOMMAP) ? "no mmap" : "mmap");
- unlink(TEST_DBNAME);
- tdb = tdb_open(TEST_DBNAME, flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- opened = true;
- ok1(tdb_transaction_start(tdb) == 0);
- key = tdb_mkdata("hi", strlen("hi"));
- data = tdb_mkdata("world", strlen("world"));
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb_transaction_commit(tdb) == 0);
- ok(!errors, "We had %u open errors", errors);
- opened = false;
- tdb_close(tdb);
- }
- return exit_status();
diff --git a/lib/tdb2/test/run-57-die-during-transaction.c b/lib/tdb2/test/run-57-die-during-transaction.c
deleted file mode 100644
index ee33a896ff..0000000000
--- a/lib/tdb2/test/run-57-die-during-transaction.c
+++ /dev/null
@@ -1,293 +0,0 @@
-#include "private.h"
-#include <unistd.h>
-#include "lock-tracking.h"
-#include "tap-interface.h"
-#include <stdlib.h>
-#include <assert.h>
-static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
-static ssize_t write_check(int fd, const void *buf, size_t count);
-static int ftruncate_check(int fd, off_t length);
-#define pwrite pwrite_check
-#define write write_check
-#define fcntl fcntl_with_lockcheck
-#define ftruncate ftruncate_check
-/* There's a malloc inside transaction_setup_recovery, and valgrind complains
- * when we longjmp and leak it. */
-static void *allocated[MAX_ALLOCATIONS];
-static unsigned max_alloc = 0;
-static void *malloc_noleak(size_t len)
- unsigned int i;
- for (i = 0; i < MAX_ALLOCATIONS; i++)
- if (!allocated[i]) {
- allocated[i] = malloc(len);
- if (i > max_alloc) {
- max_alloc = i;
- diag("max_alloc: %i", max_alloc);
- }
- return allocated[i];
- }
- diag("Too many allocations!");
- abort();
-static void *realloc_noleak(void *p, size_t size)
- unsigned int i;
- for (i = 0; i < MAX_ALLOCATIONS; i++) {
- if (allocated[i] == p) {
- if (i > max_alloc) {
- max_alloc = i;
- diag("max_alloc: %i", max_alloc);
- }
- return allocated[i] = realloc(p, size);
- }
- }
- diag("Untracked realloc!");
- abort();
-static void free_noleak(void *p)
- unsigned int i;
- /* We don't catch asprintf, so don't complain if we miss one. */
- for (i = 0; i < MAX_ALLOCATIONS; i++) {
- if (allocated[i] == p) {
- allocated[i] = NULL;
- break;
- }
- }
- free(p);
-static void free_all(void)
- unsigned int i;
- for (i = 0; i < MAX_ALLOCATIONS; i++) {
- free(allocated[i]);
- allocated[i] = NULL;
- }
-#define malloc malloc_noleak
-#define free free_noleak
-#define realloc realloc_noleak
-#include "tdb2-source.h"
-#undef malloc
-#undef free
-#undef realloc
-#undef write
-#undef pwrite
-#undef fcntl
-#undef ftruncate
-#include <stdbool.h>
-#include <stdarg.h>
-#include <setjmp.h>
-#include "external-agent.h"
-#include "logging.h"
-static bool in_transaction;
-static int target, current;
-static jmp_buf jmpbuf;
-#define TEST_DBNAME "run-57-die-during-transaction.tdb"
-#define KEY_STRING "helloworld"
-static void maybe_die(int fd)
- if (in_transaction && current++ == target) {
- longjmp(jmpbuf, 1);
- }
-static ssize_t pwrite_check(int fd,
- const void *buf, size_t count, off_t offset)
- ssize_t ret;
- maybe_die(fd);
- ret = pwrite(fd, buf, count, offset);
- if (ret != count)
- return ret;
- maybe_die(fd);
- return ret;
-static ssize_t write_check(int fd, const void *buf, size_t count)
- ssize_t ret;
- maybe_die(fd);
- ret = write(fd, buf, count);
- if (ret != count)
- return ret;
- maybe_die(fd);
- return ret;
-static int ftruncate_check(int fd, off_t length)
- int ret;
- maybe_die(fd);
- ret = ftruncate(fd, length);
- maybe_die(fd);
- return ret;
-static bool test_death(enum operation op, struct agent *agent)
- struct tdb_context *tdb = NULL;
- TDB_DATA key;
- enum agent_return ret;
- int needed_recovery = 0;
- current = target = 0;
- unlink(TEST_DBNAME);
- tdb = tdb_open(TEST_DBNAME, TDB_NOMMAP,
- O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr);
- if (!tdb) {
- diag("Failed opening TDB: %s", strerror(errno));
- return false;
- }
- if (setjmp(jmpbuf) != 0) {
- /* We're partway through. Simulate our death. */
- close(tdb->file->fd);
- forget_locking();
- in_transaction = false;
- ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
- if (ret == SUCCESS)
- needed_recovery++;
- else if (ret != FAILED) {
- diag("Step %u agent NEEDS_RECOVERY = %s", current,
- agent_return_name(ret));
- return false;
- }
- ret = external_agent_operation(agent, op, KEY_STRING);
- if (ret != SUCCESS) {
- diag("Step %u op %s failed = %s", current,
- operation_name(op),
- agent_return_name(ret));
- return false;
- }
- ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
- if (ret != FAILED) {
- diag("Still needs recovery after step %u = %s",
- current, agent_return_name(ret));
- return false;
- }
- ret = external_agent_operation(agent, CHECK, "");
- if (ret != SUCCESS) {
- diag("Step %u check failed = %s", current,
- agent_return_name(ret));
- return false;
- }
- ret = external_agent_operation(agent, CLOSE, "");
- if (ret != SUCCESS) {
- diag("Step %u close failed = %s", current,
- agent_return_name(ret));
- return false;
- }
- /* Suppress logging as this tries to use closed fd. */
- suppress_logging = true;
- suppress_lockcheck = true;
- tdb_close(tdb);
- suppress_logging = false;
- suppress_lockcheck = false;
- target++;
- current = 0;
- free_all();
- goto reset;
- }
- /* Put key for agent to fetch. */
- key = tdb_mkdata(KEY_STRING, strlen(KEY_STRING));
- if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
- return false;
- /* This is the key we insert in transaction. */
- key.dsize--;
- ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
- if (ret != SUCCESS)
- errx(1, "Agent failed to open: %s", agent_return_name(ret));
- ret = external_agent_operation(agent, FETCH, KEY_STRING);
- if (ret != SUCCESS)
- errx(1, "Agent failed find key: %s", agent_return_name(ret));
- in_transaction = true;
- if (tdb_transaction_start(tdb) != 0)
- return false;
- if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
- return false;
- if (tdb_transaction_commit(tdb) != 0)
- return false;
- in_transaction = false;
- /* We made it! */
- diag("Completed %u runs", current);
- tdb_close(tdb);
- ret = external_agent_operation(agent, CLOSE, "");
- if (ret != SUCCESS) {
- diag("Step %u close failed = %s", current,
- agent_return_name(ret));
- return false;
- }
- ok1(needed_recovery);
- ok1(locking_errors == 0);
- ok1(forget_locking() == 0);
- locking_errors = 0;
- return true;
-int main(int argc, char *argv[])
- enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
- struct agent *agent;
- int i;
- plan_tests(12);
- unlock_callback = maybe_die;
- external_agent_free = free_noleak;
- agent = prepare_external_agent();
- if (!agent)
- err(1, "preparing agent");
- for (i = 0; i < sizeof(ops)/sizeof(ops[0]); i++) {
- diag("Testing %s after death", operation_name(ops[i]));
- ok1(test_death(ops[i], agent));
- }
- free_external_agent(agent);
- return exit_status();
diff --git a/lib/tdb2/test/run-64-bit-tdb.c b/lib/tdb2/test/run-64-bit-tdb.c
deleted file mode 100644
index ef6e243a05..0000000000
--- a/lib/tdb2/test/run-64-bit-tdb.c
+++ /dev/null
@@ -1,72 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- if (sizeof(off_t) <= 4) {
- plan_tests(1);
- pass("No 64 bit off_t");
- return exit_status();
- }
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- off_t old_size;
- TDB_DATA k, d;
- struct hash_info h;
- struct tdb_used_record rec;
- tdb_off_t off;
- tdb = tdb_open("run-64-bit-tdb.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- old_size = tdb->file->map_size;
- /* This makes a sparse file */
- ok1(ftruncate(tdb->file->fd, 0xFFFFFFF0) == 0);
- ok1(add_free_record(tdb, old_size, 0xFFFFFFF0 - old_size,
- /* Now add a little record past the 4G barrier. */
- ok1(tdb_expand_file(tdb, 100) == TDB_SUCCESS);
- ok1(add_free_record(tdb, 0xFFFFFFF0, 100, TDB_LOCK_WAIT, false)
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- /* Test allocation path. */
- k = tdb_mkdata("key", 4);
- d = tdb_mkdata("data", 5);
- ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- /* Make sure it put it at end as we expected. */
- off = find_and_lock(tdb, k, F_RDLCK, &h, &rec, NULL);
- ok1(off >= 0xFFFFFFF0);
- tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
- ok1(tdb_fetch(tdb, k, &d) == 0);
- ok1(d.dsize == 5);
- ok1(strcmp((char *)d.dptr, "data") == 0);
- free(d.dptr);
- ok1(tdb_delete(tdb, k) == 0);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- tdb_close(tdb);
- }
- /* We might get messages about mmap failing, so don't test
- * tap_log_messages */
- return exit_status();
diff --git a/lib/tdb2/test/run-90-get-set-attributes.c b/lib/tdb2/test/run-90-get-set-attributes.c
deleted file mode 100644
index edf0735013..0000000000
--- a/lib/tdb2/test/run-90-get-set-attributes.c
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
- void *unused)
- return 0;
-static int myunlock(int fd, int rw, off_t off, off_t len, void *unused)
- return 0;
-static uint64_t hash_fn(const void *key, size_t len, uint64_t seed,
- void *priv)
- return 0;
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- union tdb_attribute seed_attr;
- union tdb_attribute hash_attr;
- union tdb_attribute lock_attr;
- seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
- = &hash_attr;
- seed_attr.seed.seed = 100;
- hash_attr.base.attr = TDB_ATTRIBUTE_HASH;
- = &lock_attr;
- hash_attr.hash.fn = hash_fn;
- = &hash_attr;
- lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
- = &tap_log_attr;
- lock_attr.flock.lock = mylock;
- lock_attr.flock.unlock = myunlock;
- = &lock_attr;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 50);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- union tdb_attribute attr;
- /* First open with no attributes. */
- tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
- ok1(tdb);
- /* Get log on no attributes will fail */
- attr.base.attr = TDB_ATTRIBUTE_LOG;
- ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_NOEXIST);
- /* These always work. */
- attr.base.attr = TDB_ATTRIBUTE_HASH;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
- ok1(attr.hash.fn == tdb_jenkins_hash);
- attr.base.attr = TDB_ATTRIBUTE_FLOCK;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
- ok1(attr.flock.lock == tdb_fcntl_lock);
- ok1(attr.flock.unlock == tdb_fcntl_unlock);
- attr.base.attr = TDB_ATTRIBUTE_SEED;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
- /* This is possible, just astronomically unlikely. */
- ok1(attr.seed.seed != 0);
- /* Unset attributes. */
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
- /* Set them. */
- ok1(tdb_set_attribute(tdb, &tap_log_attr) == 0);
- ok1(tdb_set_attribute(tdb, &lock_attr) == 0);
- /* These should fail. */
- ok1(tdb_set_attribute(tdb, &seed_attr) == TDB_ERR_EINVAL);
- ok1(tap_log_messages == 1);
- ok1(tdb_set_attribute(tdb, &hash_attr) == TDB_ERR_EINVAL);
- ok1(tap_log_messages == 2);
- tap_log_messages = 0;
- /* Getting them should work as expected. */
- attr.base.attr = TDB_ATTRIBUTE_LOG;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
- ok1(attr.log.fn == tap_log_attr.log.fn);
- ok1( ==;
- attr.base.attr = TDB_ATTRIBUTE_FLOCK;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
- ok1(attr.flock.lock == mylock);
- ok1(attr.flock.unlock == myunlock);
- ok1( == &lock_attr);
- /* Unset them again. */
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
- ok1(tap_log_messages == 0);
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
- ok1(tap_log_messages == 0);
- tdb_close(tdb);
- ok1(tap_log_messages == 0);
- /* Now open with all attributes. */
- tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
- &seed_attr);
- ok1(tdb);
- /* Get will succeed */
- attr.base.attr = TDB_ATTRIBUTE_LOG;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
- ok1(attr.log.fn == tap_log_attr.log.fn);
- ok1( ==;
- attr.base.attr = TDB_ATTRIBUTE_HASH;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
- ok1(attr.hash.fn == hash_fn);
- ok1( == &hash_attr);
- attr.base.attr = TDB_ATTRIBUTE_FLOCK;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
- ok1(attr.flock.lock == mylock);
- ok1(attr.flock.unlock == myunlock);
- ok1( == &lock_attr);
- attr.base.attr = TDB_ATTRIBUTE_SEED;
- ok1(tdb_get_attribute(tdb, &attr) == 0);
- ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
- ok1(attr.seed.seed == seed_attr.seed.seed);
- /* Unset attributes. */
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_HASH);
- ok1(tap_log_messages == 1);
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_SEED);
- ok1(tap_log_messages == 2);
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
- tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
- ok1(tap_log_messages == 2);
- tap_log_messages = 0;
- tdb_close(tdb);
- }
- return exit_status();
diff --git a/lib/tdb2/test/run-capabilities.c b/lib/tdb2/test/run-capabilities.c
deleted file mode 100644
index 1501abbe5c..0000000000
--- a/lib/tdb2/test/run-capabilities.c
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-#include "layout.h"
-#include "failtest_helper.h"
-#include <stdarg.h>
-static size_t len_of(bool breaks_check, bool breaks_write, bool breaks_open)
- size_t len = 0;
- if (breaks_check)
- len += 8;
- if (breaks_write)
- len += 16;
- if (breaks_open)
- len += 32;
- return len;
-/* Creates a TDB with various capabilities. */
-static void create_tdb(const char *name,
- unsigned int cap,
- bool breaks_check,
- bool breaks_write,
- bool breaks_open, ...)
- TDB_DATA key, data;
- va_list ap;
- struct tdb_layout *layout;
- struct tdb_context *tdb;
- int fd;
- key = tdb_mkdata("Hello", 5);
- data = tdb_mkdata("world", 5);
- /* Create a TDB with some data, and some capabilities */
- layout = new_tdb_layout();
- tdb_layout_add_freetable(layout);
- tdb_layout_add_used(layout, key, data, 6);
- tdb_layout_add_free(layout, 80, 0);
- tdb_layout_add_capability(layout, cap,
- breaks_write, breaks_check, breaks_open,
- len_of(breaks_check, breaks_write, breaks_open));
- va_start(ap, breaks_open);
- while ((cap = va_arg(ap, int)) != 0) {
- breaks_check = va_arg(ap, int);
- breaks_write = va_arg(ap, int);
- breaks_open = va_arg(ap, int);
- key.dsize--;
- tdb_layout_add_used(layout, key, data, 11 - key.dsize);
- tdb_layout_add_free(layout, 80, 0);
- tdb_layout_add_capability(layout, cap,
- breaks_write, breaks_check,
- breaks_open,
- len_of(breaks_check, breaks_write,
- breaks_open));
- }
- va_end(ap);
- /* We open-code this, because we need to use the failtest write. */
- tdb = tdb_layout_get(layout, failtest_free, &tap_log_attr);
- fd = open(name, O_RDWR|O_TRUNC|O_CREAT, 0600);
- if (fd < 0)
- err(1, "opening %s for writing", name);
- if (write(fd, tdb->file->map_ptr, tdb->file->map_size)
- != tdb->file->map_size)
- err(1, "writing %s", name);
- close(fd);
- tdb_close(tdb);
- tdb_layout_free(layout);
-/* Note all the "goto out" early exits: they're to shorten failtest time. */
-int main(int argc, char *argv[])
- struct tdb_context *tdb;
- char *summary;
- failtest_init(argc, argv);
- failtest_hook = block_repeat_failures;
- failtest_exit_check = exit_check_log;
- plan_tests(60);
- failtest_suppress = true;
- /* Capability says you can ignore it? */
- create_tdb("run-capabilities.tdb", 1, false, false, false, 0);
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
- &tap_log_attr);
- failtest_suppress = true;
- if (!ok1(tdb))
- goto out;
- ok1(tap_log_messages == 0);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- ok1(tap_log_messages == 0);
- tdb_close(tdb);
- /* Two capabilitues say you can ignore them? */
- create_tdb("run-capabilities.tdb",
- 1, false, false, false,
- 2, false, false, false, 0);
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
- &tap_log_attr);
- failtest_suppress = true;
- if (!ok1(tdb))
- goto out;
- ok1(tap_log_messages == 0);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- ok1(tap_log_messages == 0);
- ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
- ok1(strstr(summary, "Capability 1\n"));
- free(summary);
- tdb_close(tdb);
- /* Capability says you can't check. */
- create_tdb("run-capabilities.tdb",
- 1, false, false, false,
- 2, true, false, false, 0);
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
- &tap_log_attr);
- failtest_suppress = true;
- if (!ok1(tdb))
- goto out;
- ok1(tap_log_messages == 0);
- ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- /* We expect a warning! */
- ok1(tap_log_messages == 1);
- ok1(strstr(log_last, "capabilit"));
- ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
- ok1(strstr(summary, "Capability 1\n"));
- ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
- free(summary);
- tdb_close(tdb);
- /* Capability says you can't write. */
- create_tdb("run-capabilities.tdb",
- 1, false, false, false,
- 2, false, true, false, 0);
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
- &tap_log_attr);
- failtest_suppress = true;
- /* We expect a message. */
- ok1(!tdb);
- if (!ok1(tap_log_messages == 2))
- goto out;
- if (!ok1(strstr(log_last, "unknown")))
- goto out;
- ok1(strstr(log_last, "write"));
- /* We can open it read-only though! */
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0,
- &tap_log_attr);
- failtest_suppress = true;
- if (!ok1(tdb))
- goto out;
- ok1(tap_log_messages == 2);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- ok1(tap_log_messages == 2);
- ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
- ok1(strstr(summary, "Capability 1\n"));
- ok1(strstr(summary, "Capability 2 (read-only)\n"));
- free(summary);
- tdb_close(tdb);
- /* Capability says you can't open. */
- create_tdb("run-capabilities.tdb",
- 1, false, false, false,
- 2, false, false, true, 0);
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
- &tap_log_attr);
- failtest_suppress = true;
- /* We expect a message. */
- ok1(!tdb);
- if (!ok1(tap_log_messages == 3))
- goto out;
- if (!ok1(strstr(log_last, "unknown")))
- goto out;
- /* Combine capabilities correctly. */
- create_tdb("run-capabilities.tdb",
- 1, false, false, false,
- 2, true, false, false,
- 3, false, true, false, 0);
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
- &tap_log_attr);
- failtest_suppress = true;
- /* We expect a message. */
- ok1(!tdb);
- if (!ok1(tap_log_messages == 4))
- goto out;
- if (!ok1(strstr(log_last, "unknown")))
- goto out;
- ok1(strstr(log_last, "write"));
- /* We can open it read-only though! */
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0,
- &tap_log_attr);
- failtest_suppress = true;
- if (!ok1(tdb))
- goto out;
- ok1(tap_log_messages == 4);
- ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- /* We expect a warning! */
- ok1(tap_log_messages == 5);
- ok1(strstr(log_last, "unknown"));
- ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
- ok1(strstr(summary, "Capability 1\n"));
- ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
- ok1(strstr(summary, "Capability 3 (read-only)\n"));
- free(summary);
- tdb_close(tdb);
- /* Two capability flags in one. */
- create_tdb("run-capabilities.tdb",
- 1, false, false, false,
- 2, true, true, false,
- 0);
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
- &tap_log_attr);
- failtest_suppress = true;
- /* We expect a message. */
- ok1(!tdb);
- if (!ok1(tap_log_messages == 6))
- goto out;
- if (!ok1(strstr(log_last, "unknown")))
- goto out;
- ok1(strstr(log_last, "write"));
- /* We can open it read-only though! */
- failtest_suppress = false;
- tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0,
- &tap_log_attr);
- failtest_suppress = true;
- if (!ok1(tdb))
- goto out;
- ok1(tap_log_messages == 6);
- ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK);
- ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
- /* We expect a warning! */
- ok1(tap_log_messages == 7);
- ok1(strstr(log_last, "unknown"));
- ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
- ok1(strstr(summary, "Capability 1\n"));
- ok1(strstr(summary, "Capability 2 (uncheckable,read-only)\n"));
- free(summary);
- tdb_close(tdb);
- failtest_exit(exit_status());
diff --git a/lib/tdb2/test/run-expand-in-transaction.c b/lib/tdb2/test/run-expand-in-transaction.c
deleted file mode 100644
index 6b22d2ef46..0000000000
--- a/lib/tdb2/test/run-expand-in-transaction.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- struct tdb_data key = tdb_mkdata("key", 3);
- struct tdb_data data = tdb_mkdata("data", 4);
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- size_t size;
- tdb = tdb_open("run-expand-in-transaction.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- size = tdb->file->map_size;
- ok1(tdb_transaction_start(tdb) == 0);
- ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
- ok1(tdb->file->map_size > size);
- ok1(tdb_transaction_commit(tdb) == 0);
- ok1(tdb->file->map_size > size);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-features.c b/lib/tdb2/test/run-features.c
deleted file mode 100644
index f552fcfb58..0000000000
--- a/lib/tdb2/test/run-features.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-int main(int argc, char *argv[])
- unsigned int i, j;
- struct tdb_context *tdb;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
- struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- uint64_t features;
- tdb = tdb_open("run-features.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* Put some stuff in there. */
- for (j = 0; j < 100; j++) {
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- fail("Storing in tdb");
- }
- /* Mess with features fields in hdr. */
- features = (~TDB_FEATURE_MASK ^ 1);
- ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
- features_used),
- &features, sizeof(features)) == 0);
- ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
- features_offered),
- &features, sizeof(features)) == 0);
- tdb_close(tdb);
- tdb = tdb_open("run-features.tdb", flags[i], O_RDWR, 0,
- &tap_log_attr);
- ok1(tdb);
- if (!tdb)
- continue;
- /* Should not have changed features offered. */
- ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
- features_offered),
- &features, sizeof(features)) == 0);
- ok1(features == (~TDB_FEATURE_MASK ^ 1));
- /* Should have cleared unknown bits in features_used. */
- ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
- features_used),
- &features, sizeof(features)) == 0);
- ok1(features == (1 & TDB_FEATURE_MASK));
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-lockall.c b/lib/tdb2/test/run-lockall.c
deleted file mode 100644
index 3ae0d14f65..0000000000
--- a/lib/tdb2/test/run-lockall.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "private.h"
-#include <unistd.h>
-#include "lock-tracking.h"
-#define fcntl fcntl_with_lockcheck
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdarg.h>
-#include <ccan/err/err.h>
-#include "external-agent.h"
-#include "logging.h"
-#define TEST_DBNAME "run-lockall.tdb"
-#undef fcntl
-int main(int argc, char *argv[])
- struct agent *agent;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- int i;
- plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1);
- agent = prepare_external_agent();
- if (!agent)
- err(1, "preparing agent");
- for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
- enum agent_return ret;
- struct tdb_context *tdb;
- tdb = tdb_open(TEST_DBNAME, flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(tdb);
- ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
- ok1(ret == SUCCESS);
- ok1(tdb_lockall(tdb) == TDB_SUCCESS);
- ok1(external_agent_operation(agent, STORE, "key")
- ok1(external_agent_operation(agent, FETCH, "key")
- /* Test nesting. */
- ok1(tdb_lockall(tdb) == TDB_SUCCESS);
- tdb_unlockall(tdb);
- tdb_unlockall(tdb);
- ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
- ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
- ok1(external_agent_operation(agent, STORE, "key")
- ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
- ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
- tdb_unlockall_read(tdb);
- tdb_unlockall_read(tdb);
- ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
- ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS);
- tdb_close(tdb);
- }
- free_external_agent(agent);
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/run-remap-in-read_traverse.c b/lib/tdb2/test/run-remap-in-read_traverse.c
deleted file mode 100644
index 16a1baab46..0000000000
--- a/lib/tdb2/test/run-remap-in-read_traverse.c
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "tdb2-source.h"
-/* We had a bug where we marked the tdb read-only for a tdb_traverse_read.
- * If we then expanded the tdb, we would remap read-only, and later SEGV. */
-#include "tap-interface.h"
-#include "external-agent.h"
-#include "logging.h"
-static bool file_larger(int fd, tdb_len_t size)
- struct stat st;
- fstat(fd, &st);
- return st.st_size != size;
-static unsigned add_records_to_grow(struct agent *agent, int fd, tdb_len_t size)
- unsigned int i;
- for (i = 0; !file_larger(fd, size); i++) {
- char data[20];
- sprintf(data, "%i", i);
- if (external_agent_operation(agent, STORE, data) != SUCCESS)
- return 0;
- }
- diag("Added %u records to grow file", i);
- return i;
-int main(int argc, char *argv[])
- unsigned int i;
- struct agent *agent;
- struct tdb_context *tdb;
- struct tdb_data d = tdb_mkdata("hello", 5);
- const char filename[] = "run-remap-in-read_traverse.tdb";
- plan_tests(4);
- agent = prepare_external_agent();
- tdb = tdb_open(filename, TDB_DEFAULT,
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS);
- i = add_records_to_grow(agent, tdb->file->fd, tdb->file->map_size);
- /* Do a traverse. */
- ok1(tdb_traverse(tdb, NULL, NULL) == i);
- /* Now store something! */
- ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);
- ok1(tap_log_messages == 0);
- tdb_close(tdb);
- free_external_agent(agent);
- return exit_status();
diff --git a/lib/tdb2/test/run-seed.c b/lib/tdb2/test/run-seed.c
deleted file mode 100644
index 9c90833001..0000000000
--- a/lib/tdb2/test/run-seed.c
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-static int log_count = 0;
-/* Normally we get a log when setting random seed. */
-static void my_log_fn(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message, void *priv)
- log_count++;
-static union tdb_attribute log_attr = {
- .log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
- .fn = my_log_fn }
-int main(int argc, char *argv[])
- unsigned int i;
- struct tdb_context *tdb;
- union tdb_attribute attr;
- attr.seed.base.attr = TDB_ATTRIBUTE_SEED;
- = &log_attr;
- attr.seed.seed = 42;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- struct tdb_header hdr;
- int fd;
- tdb = tdb_open("run-seed.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &attr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tdb->hash_seed == 42);
- ok1(log_count == 0);
- tdb_close(tdb);
- if (flags[i] & TDB_INTERNAL)
- continue;
- fd = open("run-seed.tdb", O_RDONLY);
- ok1(fd >= 0);
- ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
- if (flags[i] & TDB_CONVERT)
- ok1(bswap_64(hdr.hash_seed) == 42);
- else
- ok1(hdr.hash_seed == 42);
- close(fd);
- }
- return exit_status();
diff --git a/lib/tdb2/test/run-tdb_errorstr.c b/lib/tdb2/test/run-tdb_errorstr.c
deleted file mode 100644
index 7a2da251aa..0000000000
--- a/lib/tdb2/test/run-tdb_errorstr.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-int main(int argc, char *argv[])
- enum TDB_ERROR e;
- plan_tests(TDB_ERR_RDONLY*-1 + 2);
- for (e = TDB_SUCCESS; e >= TDB_ERR_RDONLY; e--) {
- switch (e) {
- ok1(!strcmp(tdb_errorstr(e),
- "Success"));
- break;
- case TDB_ERR_IO:
- ok1(!strcmp(tdb_errorstr(e),
- "IO Error"));
- break;
- case TDB_ERR_LOCK:
- ok1(!strcmp(tdb_errorstr(e),
- "Locking error"));
- break;
- case TDB_ERR_OOM:
- ok1(!strcmp(tdb_errorstr(e),
- "Out of memory"));
- break;
- ok1(!strcmp(tdb_errorstr(e),
- "Record exists"));
- break;
- ok1(!strcmp(tdb_errorstr(e),
- "Invalid parameter"));
- break;
- ok1(!strcmp(tdb_errorstr(e),
- "Record does not exist"));
- break;
- ok1(!strcmp(tdb_errorstr(e),
- "write not permitted"));
- break;
- ok1(!strcmp(tdb_errorstr(e),
- "Corrupt database"));
- break;
- }
- }
- ok1(!strcmp(tdb_errorstr(e), "Invalid error code"));
- return exit_status();
diff --git a/lib/tdb2/test/run-tdb_foreach.c b/lib/tdb2/test/run-tdb_foreach.c
deleted file mode 100644
index b1eb2de217..0000000000
--- a/lib/tdb2/test/run-tdb_foreach.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-static int drop_count(struct tdb_context *tdb, unsigned int *count)
- if (--(*count) == 0)
- return 1;
- return 0;
-static int set_found(struct tdb_context *tdb, bool found[3])
- unsigned int idx;
- if (strcmp(tdb_name(tdb), "run-tdb_foreach0.tdb") == 0)
- idx = 0;
- else if (strcmp(tdb_name(tdb), "run-tdb_foreach1.tdb") == 0)
- idx = 1;
- else if (strcmp(tdb_name(tdb), "run-tdb_foreach2.tdb") == 0)
- idx = 2;
- else
- abort();
- if (found[idx])
- abort();
- found[idx] = true;
- return 0;
-int main(int argc, char *argv[])
- unsigned int i, count;
- bool found[3];
- struct tdb_context *tdb0, *tdb1, *tdb2;
- int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 8);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb0 = tdb_open("run-tdb_foreach0.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- tdb1 = tdb_open("run-tdb_foreach1.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- tdb2 = tdb_open("run-tdb_foreach2.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
- memset(found, 0, sizeof(found));
- tdb_foreach(set_found, found);
- ok1(found[0] && found[1] && found[2]);
- /* Test premature iteration termination */
- count = 1;
- tdb_foreach(drop_count, &count);
- ok1(count == 0);
- tdb_close(tdb1);
- memset(found, 0, sizeof(found));
- tdb_foreach(set_found, found);
- ok1(found[0] && !found[1] && found[2]);
- tdb_close(tdb2);
- memset(found, 0, sizeof(found));
- tdb_foreach(set_found, found);
- ok1(found[0] && !found[1] && !found[2]);
- tdb1 = tdb_open("run-tdb_foreach1.tdb", flags[i],
- O_RDWR, 0600, &tap_log_attr);
- memset(found, 0, sizeof(found));
- tdb_foreach(set_found, found);
- ok1(found[0] && found[1] && !found[2]);
- tdb_close(tdb0);
- memset(found, 0, sizeof(found));
- tdb_foreach(set_found, found);
- ok1(!found[0] && found[1] && !found[2]);
- tdb_close(tdb1);
- memset(found, 0, sizeof(found));
- tdb_foreach(set_found, found);
- ok1(!found[0] && !found[1] && !found[2]);
- ok1(tap_log_messages == 0);
- }
- return exit_status();
diff --git a/lib/tdb2/test/run-traverse.c b/lib/tdb2/test/run-traverse.c
deleted file mode 100644
index 20d610fe66..0000000000
--- a/lib/tdb2/test/run-traverse.c
+++ /dev/null
@@ -1,203 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-#define NUM_RECORDS 1000
-/* We use the same seed which we saw a failure on. */
-static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
- return hash64_stable((const unsigned char *)key, len,
- *(uint64_t *)p);
-static bool store_records(struct tdb_context *tdb)
- int i;
- struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
- struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
- for (i = 0; i < NUM_RECORDS; i++)
- if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
- return false;
- return true;
-struct trav_data {
- unsigned int calls, call_limit;
- int low, high;
- bool mismatch;
- bool delete;
- enum TDB_ERROR delete_error;
-static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
- struct trav_data *td)
- int val;
- td->calls++;
- if (key.dsize != sizeof(val) || dbuf.dsize != sizeof(val)
- || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
- td->mismatch = true;
- return -1;
- }
- memcpy(&val, dbuf.dptr, dbuf.dsize);
- if (val < td->low)
- td->low = val;
- if (val > td->high)
- td->high = val;
- if (td->delete) {
- td->delete_error = tdb_delete(tdb, key);
- if (td->delete_error != TDB_SUCCESS) {
- return -1;
- }
- }
- if (td->calls == td->call_limit)
- return 1;
- return 0;
-struct trav_grow_data {
- unsigned int calls;
- unsigned int num_large;
- bool mismatch;
- enum TDB_ERROR error;
-static int trav_grow(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
- struct trav_grow_data *tgd)
- int val;
- unsigned char buffer[128] = { 0 };
- tgd->calls++;
- if (key.dsize != sizeof(val) || dbuf.dsize < sizeof(val)
- || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
- tgd->mismatch = true;
- return -1;
- }
- if (dbuf.dsize > sizeof(val))
- /* We must have seen this before! */
- tgd->num_large++;
- /* Make a big difference to the database. */
- dbuf.dptr = buffer;
- dbuf.dsize = sizeof(buffer);
- tgd->error = tdb_append(tdb, key, dbuf);
- if (tgd->error != TDB_SUCCESS) {
- return -1;
- }
- return 0;
-int main(int argc, char *argv[])
- unsigned int i;
- int num;
- struct trav_data td;
- struct trav_grow_data tgd;
- struct tdb_context *tdb;
- uint64_t seed = 16014841315512641303ULL;
- union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
- .fn = fixedhash,
- .data = &seed } };
- = &tap_log_attr;
- plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1);
- for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
- tdb = tdb_open("run-traverse.tdb", flags[i],
- O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
- ok1(tdb);
- if (!tdb)
- continue;
- ok1(tdb_traverse(tdb, NULL, NULL) == 0);
- ok1(store_records(tdb));
- num = tdb_traverse(tdb, NULL, NULL);
- ok1(num == NUM_RECORDS);
- /* Full traverse. */
- td.calls = 0;
- td.call_limit = UINT_MAX;
- td.low = INT_MAX;
- td.high = INT_MIN;
- td.mismatch = false;
- td.delete = false;
- num = tdb_traverse(tdb, trav, &td);
- ok1(num == NUM_RECORDS);
- ok1(!td.mismatch);
- ok1(td.calls == NUM_RECORDS);
- ok1(td.low == 0);
- ok1(td.high == NUM_RECORDS-1);
- /* Short traverse. */
- td.calls = 0;
- td.call_limit = NUM_RECORDS / 2;
- td.low = INT_MAX;
- td.high = INT_MIN;
- td.mismatch = false;
- td.delete = false;
- num = tdb_traverse(tdb, trav, &td);
- ok1(num == NUM_RECORDS / 2);
- ok1(!td.mismatch);
- ok1(td.calls == NUM_RECORDS / 2);
- ok1(td.low <= NUM_RECORDS / 2);
- ok1(td.high > NUM_RECORDS / 2);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tap_log_messages == 0);
- /* Deleting traverse (delete everything). */
- td.calls = 0;
- td.call_limit = UINT_MAX;
- td.low = INT_MAX;
- td.high = INT_MIN;
- td.mismatch = false;
- td.delete = true;
- td.delete_error = TDB_SUCCESS;
- num = tdb_traverse(tdb, trav, &td);
- ok1(num == NUM_RECORDS);
- ok1(td.delete_error == TDB_SUCCESS);
- ok1(!td.mismatch);
- ok1(td.calls == NUM_RECORDS);
- ok1(td.low == 0);
- ok1(td.high == NUM_RECORDS - 1);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Now it's empty! */
- ok1(tdb_traverse(tdb, NULL, NULL) == 0);
- /* Re-add. */
- ok1(store_records(tdb));
- ok1(tdb_traverse(tdb, NULL, NULL) == NUM_RECORDS);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- /* Grow. This will cause us to be reshuffled. */
- tgd.calls = 0;
- tgd.num_large = 0;
- tgd.mismatch = false;
- tgd.error = TDB_SUCCESS;
- ok1(tdb_traverse(tdb, trav_grow, &tgd) > 1);
- ok1(tgd.error == 0);
- ok1(!tgd.mismatch);
- ok1(tdb_check(tdb, NULL, NULL) == 0);
- ok1(tgd.num_large < tgd.calls);
- diag("growing db: %u calls, %u repeats",
- tgd.calls, tgd.num_large);
- tdb_close(tdb);
- }
- ok1(tap_log_messages == 0);
- return exit_status();
diff --git a/lib/tdb2/test/tap-interface.c b/lib/tdb2/test/tap-interface.c
deleted file mode 100644
index 077ec2cd9a..0000000000
--- a/lib/tdb2/test/tap-interface.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "tap-interface.h"
-unsigned tap_ok_count, tap_ok_target = -1U;
diff --git a/lib/tdb2/test/tap-interface.h b/lib/tdb2/test/tap-interface.h
deleted file mode 100644
index f3d4ec2545..0000000000
--- a/lib/tdb2/test/tap-interface.h
+++ /dev/null
@@ -1,41 +0,0 @@
- Unix SMB/CIFS implementation.
- Simplistic implementation of tap interface.
- Copyright (C) Rusty Russell 2012
- ** NOTE! The following LGPL license applies to the talloc
- ** library. This does NOT imply that all of Samba is released
- ** under the LGPL
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <>.
-#include <stdio.h>
-#include <ccan/err/err.h>
-#ifndef __location__
-#define __TAP_STRING_LINE1__(s) #s
-#define __TAP_STRING_LINE2__(s) __TAP_STRING_LINE1__(s)
-#define __TAP_STRING_LINE3__ __TAP_STRING_LINE2__(__LINE__)
-#define __location__ __FILE__ ":" __TAP_STRING_LINE3__
-extern unsigned tap_ok_count, tap_ok_target;
-#define plan_tests(num) do { tap_ok_target = (num); } while(0)
-#define ok(e, ...) ((e) ? (printf("."), tap_ok_count++, true) : (warnx(__VA_ARGS__), false))
-#define ok1(e) ok((e), "%s:%s", __location__, #e)
-#define pass(...) (printf("."), tap_ok_count++)
-#define fail(...) warnx(__VA_ARGS__)
-#define diag printf
-#define exit_status() (tap_ok_count == tap_ok_target ? 0 : 1)
diff --git a/lib/tdb2/test/tdb2-source.h b/lib/tdb2/test/tdb2-source.h
deleted file mode 100644
index d13d8b868c..0000000000
--- a/lib/tdb2/test/tdb2-source.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "config.h"
-#include "check.c"
-#include "free.c"
-#include "hash.c"
-#include "io.c"
-#include "lock.c"
-#include "open.c"
-#include "summary.c"
-#include "tdb.c"
-#include "transaction.c"
-#include "traverse.c"
diff --git a/lib/tdb2/tools/Makefile b/lib/tdb2/tools/Makefile
deleted file mode 100644
index 11188c3baf..0000000000
--- a/lib/tdb2/tools/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-OBJS:=../../tdb2.o ../../hash.o ../../tally.o
-CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg
-default: tdb2torture tdb2tool tdb2dump tdb2restore mktdb2 speed growtdb-bench
-tdb2dump: tdb2dump.c $(OBJS)
-tdb2restore: tdb2restore.c $(OBJS)
-tdb2torture: tdb2torture.c $(OBJS)
-tdb2tool: tdb2tool.c $(OBJS)
-mktdb2: mktdb2.c $(OBJS)
-speed: speed.c $(OBJS)
-growtdb-bench: growtdb-bench.c $(OBJS)
- rm -f tdb2torture tdb2dump tdb2restore tdb2tool mktdb2 speed growtdb-bench
diff --git a/lib/tdb2/tools/growtdb-bench.c b/lib/tdb2/tools/growtdb-bench.c
deleted file mode 100644
index 476e8be5da..0000000000
--- a/lib/tdb2/tools/growtdb-bench.c
+++ /dev/null
@@ -1,114 +0,0 @@
-#include "tdb2.h"
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <ccan/err/err.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-static void logfn(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message,
- void *data)
- fprintf(stderr, "tdb:%s:%s:%s\n",
- tdb_name(tdb), tdb_errorstr(ecode), message);
-int main(int argc, char *argv[])
- unsigned int i, j, users, groups;
- TDB_DATA idxkey, idxdata;
- TDB_DATA k, d, gk;
- char cmd[100];
- struct tdb_context *tdb;
- enum TDB_ERROR ecode;
- union tdb_attribute log;
- if (argc != 3) {
- printf("Usage: growtdb-bench <users> <groups>\n");
- exit(1);
- }
- users = atoi(argv[1]);
- groups = atoi(argv[2]);
- sprintf(cmd, "cat /proc/%i/statm", getpid());
- log.base.attr = TDB_ATTRIBUTE_LOG;
- = NULL;
- log.log.fn = logfn;
- tdb = tdb_open("/tmp/growtdb.tdb", TDB_DEFAULT,
- O_RDWR|O_CREAT|O_TRUNC, 0600, &log);
- idxkey.dptr = (unsigned char *)"User index";
- idxkey.dsize = strlen("User index");
- idxdata.dsize = 51;
- idxdata.dptr = calloc(idxdata.dsize, 1);
- /* Create users. */
- k.dsize = 48;
- k.dptr = calloc(k.dsize, 1);
- d.dsize = 64;
- d.dptr = calloc(d.dsize, 1);
- tdb_transaction_start(tdb);
- for (i = 0; i < users; i++) {
- memcpy(k.dptr, &i, sizeof(i));
- ecode = tdb_store(tdb, k, d, TDB_INSERT);
- if (ecode != TDB_SUCCESS)
- errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
- /* This simulates a growing index record. */
- ecode = tdb_append(tdb, idxkey, idxdata);
- if (ecode != TDB_SUCCESS)
- errx(1, "tdb append failed: %s", tdb_errorstr(ecode));
- }
- if ((ecode = tdb_transaction_commit(tdb)) != 0)
- errx(1, "tdb commit1 failed: %s", tdb_errorstr(ecode));
- if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
- errx(1, "tdb_check failed after initial insert!");
- system(cmd);
- /* Now put them all in groups: add 32 bytes to each record for
- * a group. */
- gk.dsize = 48;
- gk.dptr = calloc(k.dsize, 1);
- gk.dptr[gk.dsize-1] = 1;
- d.dsize = 32;
- for (i = 0; i < groups; i++) {
- tdb_transaction_start(tdb);
- /* Create the "group". */
- memcpy(gk.dptr, &i, sizeof(i));
- ecode = tdb_store(tdb, gk, d, TDB_INSERT);
- if (ecode != TDB_SUCCESS)
- errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
- /* Now populate it. */
- for (j = 0; j < users; j++) {
- /* Append to the user. */
- memcpy(k.dptr, &j, sizeof(j));
- if ((ecode = tdb_append(tdb, k, d)) != 0)
- errx(1, "tdb append failed: %s",
- tdb_errorstr(ecode));
- /* Append to the group. */
- if ((ecode = tdb_append(tdb, gk, d)) != 0)
- errx(1, "tdb append failed: %s",
- tdb_errorstr(ecode));
- }
- if ((ecode = tdb_transaction_commit(tdb)) != 0)
- errx(1, "tdb commit2 failed: %s", tdb_errorstr(ecode));
- if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
- errx(1, "tdb_check failed after iteration %i!", i);
- system(cmd);
- }
- return 0;
diff --git a/lib/tdb2/tools/mktdb2.c b/lib/tdb2/tools/mktdb2.c
deleted file mode 100644
index 35d7a07d0b..0000000000
--- a/lib/tdb2/tools/mktdb2.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "tdb2.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <ccan/err/err.h>
-int main(int argc, char *argv[])
- unsigned int i, num_recs;
- struct tdb_context *tdb;
- if (argc != 3 || (num_recs = atoi(argv[2])) == 0)
- errx(1, "Usage: mktdb <tdbfile> <numrecords>");
- tdb = tdb_open(argv[1], TDB_DEFAULT, O_CREAT|O_TRUNC|O_RDWR, 0600,NULL);
- if (!tdb)
- err(1, "Opening %s", argv[1]);
- for (i = 0; i < num_recs; i++) {
- d.dptr = (void *)&i;
- d.dsize = sizeof(i);
- if (tdb_store(tdb, d, d, TDB_INSERT) != 0)
- err(1, "Failed to store record %i", i);
- }
- printf("Done\n");
- return 0;
diff --git a/lib/tdb2/tools/speed.c b/lib/tdb2/tools/speed.c
deleted file mode 100644
index 259d53f6c8..0000000000
--- a/lib/tdb2/tools/speed.c
+++ /dev/null
@@ -1,443 +0,0 @@
-/* Simple speed test for TDB */
-#include <ccan/err/err.h>
-#include <time.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include "tdb2.h"
-/* Nanoseconds per operation */
-static size_t normalize(const struct timeval *start,
- const struct timeval *stop,
- unsigned int num)
- struct timeval diff;
- timersub(stop, start, &diff);
- /* Floating point is more accurate here. */
- return (double)(diff.tv_sec * 1000000 + diff.tv_usec)
- / num * 1000;
-static size_t file_size(void)
- struct stat st;
- if (stat("/tmp/speed.tdb", &st) != 0)
- return -1;
- return st.st_size;
-static int count_record(struct tdb_context *tdb,
- TDB_DATA key, TDB_DATA data, void *p)
- int *total = p;
- *total += *(int *)data.dptr;
- return 0;
-static void dump_and_clear_stats(struct tdb_context **tdb,
- int flags,
- union tdb_attribute *attr)
- union tdb_attribute stats;
- enum TDB_ERROR ecode;
- stats.base.attr = TDB_ATTRIBUTE_STATS;
- stats.stats.size = sizeof(stats.stats);
- ecode = tdb_get_attribute(*tdb, &stats);
- if (ecode != TDB_SUCCESS)
- errx(1, "Getting stats: %s", tdb_errorstr(ecode));
- printf("allocs = %llu\n",
- (unsigned long long)stats.stats.allocs);
- printf(" alloc_subhash = %llu\n",
- (unsigned long long)stats.stats.alloc_subhash);
- printf(" alloc_chain = %llu\n",
- (unsigned long long)stats.stats.alloc_chain);
- printf(" alloc_bucket_exact = %llu\n",
- (unsigned long long)stats.stats.alloc_bucket_exact);
- printf(" alloc_bucket_max = %llu\n",
- (unsigned long long)stats.stats.alloc_bucket_max);
- printf(" alloc_leftover = %llu\n",
- (unsigned long long)stats.stats.alloc_leftover);
- printf(" alloc_coalesce_tried = %llu\n",
- (unsigned long long)stats.stats.alloc_coalesce_tried);
- printf(" alloc_coalesce_iterate_clash = %llu\n",
- (unsigned long long)stats.stats.alloc_coalesce_iterate_clash);
- printf(" alloc_coalesce_lockfail = %llu\n",
- (unsigned long long)stats.stats.alloc_coalesce_lockfail);
- printf(" alloc_coalesce_race = %llu\n",
- (unsigned long long)stats.stats.alloc_coalesce_race);
- printf(" alloc_coalesce_succeeded = %llu\n",
- (unsigned long long)stats.stats.alloc_coalesce_succeeded);
- printf(" alloc_coalesce_num_merged = %llu\n",
- (unsigned long long)stats.stats.alloc_coalesce_num_merged);
- printf("compares = %llu\n",
- (unsigned long long)stats.stats.compares);
- printf(" compare_wrong_bucket = %llu\n",
- (unsigned long long)stats.stats.compare_wrong_bucket);
- printf(" compare_wrong_offsetbits = %llu\n",
- (unsigned long long)stats.stats.compare_wrong_offsetbits);
- printf(" compare_wrong_keylen = %llu\n",
- (unsigned long long)stats.stats.compare_wrong_keylen);
- printf(" compare_wrong_rechash = %llu\n",
- (unsigned long long)stats.stats.compare_wrong_rechash);
- printf(" compare_wrong_keycmp = %llu\n",
- (unsigned long long)stats.stats.compare_wrong_keycmp);
- printf("transactions = %llu\n",
- (unsigned long long)stats.stats.transactions);
- printf(" transaction_cancel = %llu\n",
- (unsigned long long)stats.stats.transaction_cancel);
- printf(" transaction_nest = %llu\n",
- (unsigned long long)stats.stats.transaction_nest);
- printf(" transaction_expand_file = %llu\n",
- (unsigned long long)stats.stats.transaction_expand_file);
- printf(" transaction_read_direct = %llu\n",
- (unsigned long long)stats.stats.transaction_read_direct);
- printf(" transaction_read_direct_fail = %llu\n",
- (unsigned long long)stats.stats.transaction_read_direct_fail);
- printf(" transaction_write_direct = %llu\n",
- (unsigned long long)stats.stats.transaction_write_direct);
- printf(" transaction_write_direct_fail = %llu\n",
- (unsigned long long)stats.stats.transaction_write_direct_fail);
- printf("expands = %llu\n",
- (unsigned long long)stats.stats.expands);
- printf("frees = %llu\n",
- (unsigned long long)stats.stats.frees);
- printf("locks = %llu\n",
- (unsigned long long)stats.stats.locks);
- printf(" lock_lowlevel = %llu\n",
- (unsigned long long)stats.stats.lock_lowlevel);
- printf(" lock_nonblock = %llu\n",
- (unsigned long long)stats.stats.lock_nonblock);
- printf(" lock_nonblock_fail = %llu\n",
- (unsigned long long)stats.stats.lock_nonblock_fail);
- /* Now clear. */
- tdb_close(*tdb);
- *tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR, 0, attr);
-static void tdb_log(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message,
- void *data)
- fprintf(stderr, "tdb:%s:%s:%s\n",
- tdb_name(tdb), tdb_errorstr(ecode), message);
-int main(int argc, char *argv[])
- unsigned int i, j, num = 1000, stage = 0, stopat = -1;
- int flags = TDB_DEFAULT;
- bool transaction = false, summary = false;
- TDB_DATA key, data;
- struct tdb_context *tdb;
- struct timeval start, stop;
- union tdb_attribute seed, log;
- bool do_stats = false;
- enum TDB_ERROR ecode;
- /* Try to keep benchmarks even. */
- seed.base.attr = TDB_ATTRIBUTE_SEED;
- = NULL;
- seed.seed.seed = 0;
- log.base.attr = TDB_ATTRIBUTE_LOG;
- = &seed;
- log.log.fn = tdb_log;
- if (argv[1] && strcmp(argv[1], "--internal") == 0) {
- flags = TDB_INTERNAL;
- argc--;
- argv++;
- }
- if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
- transaction = true;
- argc--;
- argv++;
- }
- if (argv[1] && strcmp(argv[1], "--no-sync") == 0) {
- flags |= TDB_NOSYNC;
- argc--;
- argv++;
- }
- if (argv[1] && strcmp(argv[1], "--summary") == 0) {
- summary = true;
- argc--;
- argv++;
- }
- if (argv[1] && strcmp(argv[1], "--stats") == 0) {
- do_stats = true;
- argc--;
- argv++;
- }
- tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
- 0600, &log);
- if (!tdb)
- err(1, "Opening /tmp/speed.tdb");
- key.dptr = (void *)&i;
- key.dsize = sizeof(i);
- data = key;
- if (argv[1]) {
- num = atoi(argv[1]);
- argv++;
- argc--;
- }
- if (argv[1]) {
- stopat = atoi(argv[1]);
- argv++;
- argc--;
- }
- /* Add 1000 records. */
- printf("Adding %u records: ", num); fflush(stdout);
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- gettimeofday(&start, NULL);
- for (i = 0; i < num; i++)
- if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
- errx(1, "Inserting key %u in tdb: %s",
- i, tdb_errorstr(ecode));
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (do_stats)
- dump_and_clear_stats(&tdb, flags, &log);
- if (++stage == stopat)
- exit(0);
- /* Finding 1000 records. */
- printf("Finding %u records: ", num); fflush(stdout);
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- gettimeofday(&start, NULL);
- for (i = 0; i < num; i++) {
- struct tdb_data dbuf;
- if ((ecode = tdb_fetch(tdb, key, &dbuf)) != TDB_SUCCESS
- || *(int *)dbuf.dptr != i) {
- errx(1, "Fetching key %u in tdb gave %u",
- i, ecode ? ecode : *(int *)dbuf.dptr);
- }
- }
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (do_stats)
- dump_and_clear_stats(&tdb, flags, &log);
- if (++stage == stopat)
- exit(0);
- /* Missing 1000 records. */
- printf("Missing %u records: ", num); fflush(stdout);
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- gettimeofday(&start, NULL);
- for (i = num; i < num*2; i++) {
- struct tdb_data dbuf;
- ecode = tdb_fetch(tdb, key, &dbuf);
- if (ecode != TDB_ERR_NOEXIST)
- errx(1, "Fetching key %u in tdb gave %s",
- i, tdb_errorstr(ecode));
- }
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (do_stats)
- dump_and_clear_stats(&tdb, flags, &log);
- if (++stage == stopat)
- exit(0);
- /* Traverse 1000 records. */
- printf("Traversing %u records: ", num); fflush(stdout);
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- i = 0;
- gettimeofday(&start, NULL);
- if (tdb_traverse(tdb, count_record, &i) != num)
- errx(1, "Traverse returned wrong number of records");
- if (i != (num - 1) * (num / 2))
- errx(1, "Traverse tallied to %u", i);
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (do_stats)
- dump_and_clear_stats(&tdb, flags, &log);
- if (++stage == stopat)
- exit(0);
- /* Delete 1000 records (not in order). */
- printf("Deleting %u records: ", num); fflush(stdout);
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- gettimeofday(&start, NULL);
- for (j = 0; j < num; j++) {
- i = (j + 100003) % num;
- if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
- errx(1, "Deleting key %u in tdb: %s",
- i, tdb_errorstr(ecode));
- }
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (do_stats)
- dump_and_clear_stats(&tdb, flags, &log);
- if (++stage == stopat)
- exit(0);
- /* Re-add 1000 records (not in order). */
- printf("Re-adding %u records: ", num); fflush(stdout);
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- gettimeofday(&start, NULL);
- for (j = 0; j < num; j++) {
- i = (j + 100003) % num;
- if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
- errx(1, "Inserting key %u in tdb: %s",
- i, tdb_errorstr(ecode));
- }
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (do_stats)
- dump_and_clear_stats(&tdb, flags, &log);
- if (++stage == stopat)
- exit(0);
- /* Append 1000 records. */
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- printf("Appending %u records: ", num); fflush(stdout);
- gettimeofday(&start, NULL);
- for (i = 0; i < num; i++)
- if ((ecode = tdb_append(tdb, key, data)) != TDB_SUCCESS)
- errx(1, "Appending key %u in tdb: %s",
- i, tdb_errorstr(ecode));
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (++stage == stopat)
- exit(0);
- /* Churn 1000 records: not in order! */
- if (transaction && (ecode = tdb_transaction_start(tdb)))
- errx(1, "starting transaction: %s", tdb_errorstr(ecode));
- printf("Churning %u records: ", num); fflush(stdout);
- gettimeofday(&start, NULL);
- for (j = 0; j < num; j++) {
- i = (j + 1000019) % num;
- if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
- errx(1, "Deleting key %u in tdb: %s",
- i, tdb_errorstr(ecode));
- i += num;
- if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
- errx(1, "Inserting key %u in tdb: %s",
- i, tdb_errorstr(ecode));
- }
- gettimeofday(&stop, NULL);
- if (transaction && (ecode = tdb_transaction_commit(tdb)))
- errx(1, "committing transaction: %s", tdb_errorstr(ecode));
- printf(" %zu ns (%zu bytes)\n",
- normalize(&start, &stop, num), file_size());
- if (tdb_check(tdb, NULL, NULL))
- errx(1, "tdb_check failed!");
- if (summary) {
- char *sumstr = NULL;
- tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
- printf("%s\n", sumstr);
- free(sumstr);
- }
- if (do_stats)
- dump_and_clear_stats(&tdb, flags, &log);
- if (++stage == stopat)
- exit(0);
- return 0;
diff --git a/lib/tdb2/tools/tdb2backup.c b/lib/tdb2/tools/tdb2backup.c
deleted file mode 100644
index 37b301c548..0000000000
--- a/lib/tdb2/tools/tdb2backup.c
+++ /dev/null
@@ -1,340 +0,0 @@
- Unix SMB/CIFS implementation.
- low level tdb backup and restore utility
- Copyright (C) Andrew Tridgell 2002
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- This program is meant for backup/restore of tdb databases. Typical usage would be:
- tdbbackup *.tdb
- when Samba shuts down cleanly, which will make a backup of all the local databases
- to *.bak files. Then on Samba startup you would use:
- tdbbackup -v *.tdb
- and this will check the databases for corruption and if corruption is detected then
- the backup will be restored.
- You may also like to do a backup on a regular basis while Samba is
- running, perhaps using cron.
- The reason this program is needed is to cope with power failures
- while Samba is running. A power failure could lead to database
- corruption and Samba will then not start correctly.
- Note that many of the databases in Samba are transient and thus
- don't need to be backed up, so you can optimise the above a little
- by only running the backup on the critical databases.
- */
-#include "config.h"
-#include "tdb2.h"
-#include "system/filesys.h"
-#include <getopt.h>
-static int failed;
-static void tdb_log(struct tdb_context *tdb,
- enum tdb_log_level level,
- enum TDB_ERROR ecode,
- const char *message,
- void *data)
- fprintf(stderr, "%s:%s\n", tdb_errorstr(ecode), message);
-static char *add_suffix(const char *name, const char *suffix)
- char *ret;
- int len = strlen(name) + strlen(suffix) + 1;
- ret = (char *)malloc(len);
- if (!ret) {
- fprintf(stderr,"Out of memory!\n");
- exit(1);
- }
- snprintf(ret, len, "%s%s", name, suffix);
- return ret;
-static int copy_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
- struct tdb_context *tdb_new = (struct tdb_context *)state;
- enum TDB_ERROR err;
- err = tdb_store(tdb_new, key, dbuf, TDB_INSERT);
- if (err) {
- fprintf(stderr,"Failed to insert into %s: %s\n",
- tdb_name(tdb_new), tdb_errorstr(err));
- failed = 1;
- return 1;
- }
- return 0;
-static int test_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
- return 0;
- carefully backup a tdb, validating the contents and
- only doing the backup if its OK
- this function is also used for restore
-static int backup_tdb(const char *old_name, const char *new_name)
- struct tdb_context *tdb;
- struct tdb_context *tdb_new;
- char *tmp_name;
- struct stat st;
