diff options
84 files changed, 26504 insertions, 0 deletions
diff --git a/lib/tdb2/LICENSE b/lib/tdb2/LICENSE new file mode 100644 index 0000000000..cca7fc278f --- /dev/null +++ b/lib/tdb2/LICENSE @@ -0,0 +1,165 @@ +		   GNU LESSER GENERAL PUBLIC LICENSE +                       Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + +  This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + +  0. Additional Definitions. + +  As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + +  "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + +  An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + +  A "Combined Work" is a work produced by combining or linking an +Application with the Library.  The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + +  The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ +  The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + +  1. Exception to Section 3 of the GNU GPL. + +  You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + +  2. Conveying Modified Versions. + +  If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + +   a) under this License, provided that you make a good faith effort to +   ensure that, in the event an Application does not supply the +   function or data, the facility still operates, and performs +   whatever part of its purpose remains meaningful, or + +   b) under the GNU GPL, with none of the additional permissions of +   this License applicable to that copy. + +  3. Object Code Incorporating Material from Library Header Files. + +  The object code form of an Application may incorporate material from +a header file that is part of the Library.  You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + +   a) Give prominent notice with each copy of the object code that the +   Library is used in it and that the Library and its use are +   covered by this License. + +   b) Accompany the object code with a copy of the GNU GPL and this license +   document. + +  4. Combined Works. 
+ +  You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + +   a) Give prominent notice with each copy of the Combined Work that +   the Library is used in it and that the Library and its use are +   covered by this License. + +   b) Accompany the Combined Work with a copy of the GNU GPL and this license +   document. + +   c) For a Combined Work that displays copyright notices during +   execution, include the copyright notice for the Library among +   these notices, as well as a reference directing the user to the +   copies of the GNU GPL and this license document. + +   d) Do one of the following: + +       0) Convey the Minimal Corresponding Source under the terms of this +       License, and the Corresponding Application Code in a form +       suitable for, and under terms that permit, the user to +       recombine or relink the Application with a modified version of +       the Linked Version to produce a modified Combined Work, in the +       manner specified by section 6 of the GNU GPL for conveying +       Corresponding Source. + +       1) Use a suitable shared library mechanism for linking with the +       Library.  A suitable mechanism is one that (a) uses at run time +       a copy of the Library already present on the user's computer +       system, and (b) will operate properly with a modified version +       of the Library that is interface-compatible with the Linked +       Version. 
+ +   e) Provide Installation Information, but only if you would otherwise +   be required to provide such information under section 6 of the +   GNU GPL, and only to the extent that such information is +   necessary to install and execute a modified version of the +   Combined Work produced by recombining or relinking the +   Application with a modified version of the Linked Version. (If +   you use option 4d0, the Installation Information must accompany +   the Minimal Corresponding Source and Corresponding Application +   Code. If you use option 4d1, you must provide the Installation +   Information in the manner specified by section 6 of the GNU GPL +   for conveying Corresponding Source.) + +  5. Combined Libraries. + +  You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + +   a) Accompany the combined library with a copy of the same work based +   on the Library, uncombined with any other library facilities, +   conveyed under the terms of this License. + +   b) Give prominent notice with the combined library that part of it +   is a work based on the Library, and explaining where to find the +   accompanying uncombined form of the same work. + +  6. Revised Versions of the GNU Lesser General Public License. + +  The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + +  Each version is given a distinguishing version number. 
If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + +  If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/lib/tdb2/_info b/lib/tdb2/_info new file mode 100644 index 0000000000..7213d67a22 --- /dev/null +++ b/lib/tdb2/_info @@ -0,0 +1,91 @@ +#include <string.h> +#include <stdio.h> + +/** + * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database + * + * The tdb2 module provides an efficient keyword data mapping (usually + * within a file).  It supports transactions, so the contents of the + * database is reliable even across crashes. 
/*
 * ccan _info entry point.
 *
 * Invoked with exactly one argument.  If that argument is "depends", the
 * ccan modules this module depends on are written to stdout, one per line,
 * and the exit status is 0.  Any other invocation exits with status 1.
 */
int main(int argc, char *argv[])
{
	/* Each entry already carries its trailing newline. */
	static const char *const deps[] = {
		"ccan/asprintf\n",
		"ccan/hash\n",
		"ccan/likely\n",
		"ccan/asearch\n",
		"ccan/compiler\n",
		"ccan/build_assert\n",
		"ccan/ilog\n",
		"ccan/failtest\n",
		"ccan/tally\n",
		"ccan/typesafe_cb\n",
		"ccan/cast\n",
		"ccan/endian\n",
	};
	size_t i;

	/* Anything other than a lone "depends" argument is unsupported. */
	if (argc != 2 || strcmp(argv[1], "depends") != 0)
		return 1;

	for (i = 0; i < sizeof(deps) / sizeof(deps[0]); i++)
		fputs(deps[i], stdout);

	return 0;
}
100644 index 0000000000..52fb188764 --- /dev/null +++ b/lib/tdb2/check.c @@ -0,0 +1,835 @@ + /* +   Trivial Database 2: free list/block handling +   Copyright (C) Rusty Russell 2010 + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. + +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ +#include "private.h" +#include <ccan/likely/likely.h> +#include <ccan/asearch/asearch.h> + +/* We keep an ordered array of offsets. */ +static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off) +{ +	tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t)); +	if (!new) +		return false; +	new[(*num)++] = off; +	*arr = new; +	return true; +} + +static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery, +				   uint64_t *features) +{ +	uint64_t hash_test; +	struct tdb_header hdr; +	enum TDB_ERROR ecode; + +	ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} +	/* magic food should not be converted, so convert back. 
*/ +	tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food)); + +	hash_test = TDB_HASH_MAGIC; +	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); +	if (hdr.hash_test != hash_test) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "check: hash test %llu should be %llu", +				  (long long)hdr.hash_test, +				  (long long)hash_test); +	} + +	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "check: bad magic '%.*s'", +				  (unsigned)sizeof(hdr.magic_food), +				  hdr.magic_food); +	} + +	/* Features which are used must be a subset of features offered. */ +	if (hdr.features_used & ~hdr.features_offered) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "check: features used (0x%llx) which" +				  " are not offered (0x%llx)", +				  (long long)hdr.features_used, +				  (long long)hdr.features_offered); +	} + +	*features = hdr.features_offered; +	*recovery = hdr.recovery; +	if (*recovery) { +		if (*recovery < sizeof(hdr) +		    || *recovery > tdb->file->map_size) { +			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +					  "tdb_check:" +					  " invalid recovery offset %zu", +					  (size_t)*recovery); +		} +	} + +	/* Don't check reserved: they *can* be used later. 
*/ +	return TDB_SUCCESS; +} + +static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb, +				      tdb_off_t off, unsigned int group_bits, +				      uint64_t hprefix, +				      unsigned hprefix_bits, +				      tdb_off_t used[], +				      size_t num_used, +				      size_t *num_found, +				      enum TDB_ERROR (*check)(TDB_DATA, +							      TDB_DATA, void *), +				      void *data); + +static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb, +				       tdb_off_t off, +				       uint64_t hash, +				       tdb_off_t used[], +				       size_t num_used, +				       size_t *num_found, +				       enum TDB_ERROR (*check)(TDB_DATA, +							       TDB_DATA, +							       void *), +				       void *data) +{ +	struct tdb_used_record rec; +	enum TDB_ERROR ecode; + +	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (rec_magic(&rec) != TDB_CHAIN_MAGIC) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: Bad hash chain magic %llu", +				  (long long)rec_magic(&rec)); +	} + +	if (rec_data_length(&rec) != sizeof(struct tdb_chain)) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check:" +				  " Bad hash chain length %llu vs %zu", +				  (long long)rec_data_length(&rec), +				  sizeof(struct tdb_chain)); +	} +	if (rec_key_length(&rec) != 0) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: Bad hash chain key length %llu", +				  (long long)rec_key_length(&rec)); +	} +	if (rec_hash(&rec) != 0) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: Bad hash chain hash value %llu", +				  (long long)rec_hash(&rec)); +	} + +	off += sizeof(rec); +	ecode = check_hash_tree(tdb, off, 0, hash, 64, +				used, num_used, num_found, check, data); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next)); +	if (TDB_OFF_IS_ERR(off)) { +		return off; 
+	} +	if (off == 0) +		return TDB_SUCCESS; +	(*num_found)++; +	return check_hash_chain(tdb, off, hash, used, num_used, num_found, +				check, data); +} + +static enum TDB_ERROR check_hash_record(struct tdb_context *tdb, +					tdb_off_t off, +					uint64_t hprefix, +					unsigned hprefix_bits, +					tdb_off_t used[], +					size_t num_used, +					size_t *num_found, +					enum TDB_ERROR (*check)(TDB_DATA, +								TDB_DATA, +								void *), +					void *data) +{ +	struct tdb_used_record rec; +	enum TDB_ERROR ecode; + +	if (hprefix_bits >= 64) +		return check_hash_chain(tdb, off, hprefix, used, num_used, +					num_found, check, data); + +	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (rec_magic(&rec) != TDB_HTABLE_MAGIC) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: Bad hash table magic %llu", +				  (long long)rec_magic(&rec)); +	} +	if (rec_data_length(&rec) +	    != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check:" +				  " Bad hash table length %llu vs %llu", +				  (long long)rec_data_length(&rec), +				  (long long)sizeof(tdb_off_t) +				  << TDB_SUBLEVEL_HASH_BITS); +	} +	if (rec_key_length(&rec) != 0) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: Bad hash table key length %llu", +				  (long long)rec_key_length(&rec)); +	} +	if (rec_hash(&rec) != 0) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: Bad hash table hash value %llu", +				  (long long)rec_hash(&rec)); +	} + +	off += sizeof(rec); +	return check_hash_tree(tdb, off, +			       TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, +			       hprefix, hprefix_bits, +			       used, num_used, num_found, check, data); +} + +static int off_cmp(const tdb_off_t *a, const tdb_off_t *b) +{ +	/* Can overflow an int. */ +	return *a > *b ? 1 +		: *a < *b ? 
/*
 * Extract the next @num bits of the 64-bit hash @h, working down from the
 * most significant bit.
 *
 * @used counts how many leading bits earlier calls have consumed; it is
 * advanced by @num.  The caller must keep *used at or below 64 after the
 * update.
 *
 * Returns the extracted bits right-aligned; 0 when @num is 0.
 */
static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
{
	uint64_t mask;

	*used += num;

	/* Nothing requested.  Returning early also avoids shifting by the
	 * full width (64 - 0) when no bits have been consumed yet, which
	 * is undefined behaviour. */
	if (num == 0)
		return 0;

	/* Build the mask in 64 bits: a 32-bit "1U << num" is undefined for
	 * num >= 32, and prefixes deeper in the tree can be that long. */
	mask = num >= 64 ? UINT64_MAX : (((uint64_t)1 << num) - 1);

	return (h >> (64 - *used)) & mask;
}
*/ +				if (is_subhash(group[b])) { +					ecode = TDB_ERR_CORRUPT; +					tdb_logerr(tdb, ecode, +						   TDB_LOG_ERROR, +						   "tdb_check: Invalid chain" +						   " entry subhash"); +					goto fail; +				} +				h = hash_record(tdb, off); +				if (h != hprefix) { +					ecode = TDB_ERR_CORRUPT; +					tdb_logerr(tdb, ecode, +						   TDB_LOG_ERROR, +						   "check: bad hash chain" +						   " placement" +						   " 0x%llx vs 0x%llx", +						   (long long)h, +						   (long long)hprefix); +					goto fail; +				} +				ecode = tdb_read_convert(tdb, off, &rec, +							 sizeof(rec)); +				if (ecode != TDB_SUCCESS) { +					goto fail; +				} +				goto check; +			} + +			if (is_subhash(group[b])) { +				uint64_t subprefix; +				subprefix = (hprefix +				     << (group_bits + TDB_HASH_GROUP_BITS)) +					+ g * (1 << TDB_HASH_GROUP_BITS) + b; + +				ecode = check_hash_record(tdb, +					       group[b] & TDB_OFF_MASK, +					       subprefix, +					       hprefix_bits +						       + group_bits +						       + TDB_HASH_GROUP_BITS, +					       used, num_used, num_found, +					       check, data); +				if (ecode != TDB_SUCCESS) { +					goto fail; +				} +				continue; +			} +			/* A normal entry */ + +			/* Does it belong here at all? */ +			h = hash_record(tdb, off); +			used_bits = 0; +			if (get_bits(h, hprefix_bits, &used_bits) != hprefix +			    && hprefix_bits) { +				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, +						   TDB_LOG_ERROR, +						   "check: bad hash placement" +						   " 0x%llx vs 0x%llx", +						   (long long)h, +						   (long long)hprefix); +				goto fail; +			} + +			/* Does it belong in this group? */ +			if (get_bits(h, group_bits, &used_bits) != g) { +				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, +						   TDB_LOG_ERROR, +						   "check: bad group %llu" +						   " vs %u", +						   (long long)h, g); +				goto fail; +			} + +			/* Are bucket bits correct? 
*/ +			bucket = group[b] & TDB_OFF_HASH_GROUP_MASK; +			if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits) +			    != bucket) { +				used_bits -= TDB_HASH_GROUP_BITS; +				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, +						   TDB_LOG_ERROR, +						   "check: bad bucket %u vs %u", +						   (unsigned)get_bits(h, +							TDB_HASH_GROUP_BITS, +							&used_bits), +						   bucket); +				goto fail; +			} + +			/* There must not be any zero entries between +			 * the bucket it belongs in and this one! */ +			for (i = bucket; +			     i != b; +			     i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) { +				if (group[i] == 0) { +					ecode = TDB_ERR_CORRUPT; +					tdb_logerr(tdb, ecode, +						   TDB_LOG_ERROR, +						   "check: bad group placement" +						   " %u vs %u", +						   b, bucket); +					goto fail; +				} +			} + +			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); +			if (ecode != TDB_SUCCESS) { +				goto fail; +			} + +			/* Bottom bits must match header. */ +			if ((h & ((1 << 11)-1)) != rec_hash(&rec)) { +				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, +						   TDB_LOG_ERROR, +						   "tdb_check: Bad hash magic" +						   " at offset %llu" +						   " (0x%llx vs 0x%llx)", +						   (long long)off, +						   (long long)h, +						   (long long)rec_hash(&rec)); +				goto fail; +			} + +		check: +			if (check) { +				TDB_DATA k, d; +				const unsigned char *kptr; + +				kptr = tdb_access_read(tdb, +						       off + sizeof(rec), +						       rec_key_length(&rec) +						       + rec_data_length(&rec), +						       false); +				if (TDB_PTR_IS_ERR(kptr)) { +					ecode = TDB_PTR_ERR(kptr); +					goto fail; +				} + +				k = tdb_mkdata(kptr, rec_key_length(&rec)); +				d = tdb_mkdata(kptr + k.dsize, +					       rec_data_length(&rec)); +				ecode = check(k, d, data); +				tdb_access_release(tdb, kptr); +				if (ecode != TDB_SUCCESS) { +					goto fail; +				} +			} +		} +	} +	tdb_access_release(tdb, hash); +	return TDB_SUCCESS; + +fail: +	tdb_access_release(tdb, hash); 
+	return ecode; +} + +static enum TDB_ERROR check_hash(struct tdb_context *tdb, +				 tdb_off_t used[], +				 size_t num_used, size_t num_ftables, +				 int (*check)(TDB_DATA, TDB_DATA, void *), +				 void *data) +{ +	/* Free tables also show up as used. */ +	size_t num_found = num_ftables; +	enum TDB_ERROR ecode; + +	ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable), +				TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, +				0, 0, used, num_used, &num_found, +				check, data); +	if (ecode == TDB_SUCCESS) { +		if (num_found != num_used) { +			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +					   "tdb_check: Not all entries" +					   " are in hash"); +		} +	} +	return ecode; +} + +static enum TDB_ERROR check_free(struct tdb_context *tdb, +				 tdb_off_t off, +				 const struct tdb_free_record *frec, +				 tdb_off_t prev, unsigned int ftable, +				 unsigned int bucket) +{ +	enum TDB_ERROR ecode; + +	if (frec_magic(frec) != TDB_FREE_MAGIC) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: offset %llu bad magic 0x%llx", +				  (long long)off, +				  (long long)frec->magic_and_prev); +	} +	if (frec_ftable(frec) != ftable) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: offset %llu bad freetable %u", +				  (long long)off, frec_ftable(frec)); + +	} + +	ecode = tdb->methods->oob(tdb, off +				  + frec_len(frec) +				  + sizeof(struct tdb_used_record), +				  false); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} +	if (size_to_bucket(frec_len(frec)) != bucket) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: offset %llu in wrong bucket" +				  " (%u vs %u)", +				  (long long)off, +				  bucket, size_to_bucket(frec_len(frec))); +	} +	if (prev && prev != frec_prev(frec)) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: offset %llu bad prev" +				  " (%llu vs %llu)", +				  (long long)off, +				  (long long)prev, (long 
long)frec_len(frec)); +	} +	return TDB_SUCCESS; +} + +static enum TDB_ERROR check_free_table(struct tdb_context *tdb, +				       tdb_off_t ftable_off, +				       unsigned ftable_num, +				       tdb_off_t fr[], +				       size_t num_free, +				       size_t *num_found) +{ +	struct tdb_freetable ft; +	tdb_off_t h; +	unsigned int i; +	enum TDB_ERROR ecode; + +	ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft)); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC +	    || rec_key_length(&ft.hdr) != 0 +	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr) +	    || rec_hash(&ft.hdr) != 0) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: Invalid header on free table"); +	} + +	for (i = 0; i < TDB_FREE_BUCKETS; i++) { +		tdb_off_t off, prev = 0, *p, first = 0; +		struct tdb_free_record f; + +		h = bucket_off(ftable_off, i); +		for (off = tdb_read_off(tdb, h); off; off = f.next) { +			if (TDB_OFF_IS_ERR(off)) { +				return off; +			} +			if (!first) { +				off &= TDB_OFF_MASK; +				first = off; +			} +			ecode = tdb_read_convert(tdb, off, &f, sizeof(f)); +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} +			ecode = check_free(tdb, off, &f, prev, ftable_num, i); +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} + +			/* FIXME: Check hash bits */ +			p = asearch(&off, fr, num_free, off_cmp); +			if (!p) { +				return tdb_logerr(tdb, TDB_ERR_CORRUPT, +						  TDB_LOG_ERROR, +						  "tdb_check: Invalid offset" +						  " %llu in free table", +						  (long long)off); +			} +			/* Mark it invalid. */ +			*p ^= 1; +			(*num_found)++; +			prev = off; +		} + +		if (first) { +			/* Now we can check first back pointer. 
*/ +			ecode = tdb_read_convert(tdb, first, &f, sizeof(f)); +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} +			ecode = check_free(tdb, first, &f, prev, ftable_num, i); +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} +		} +	} +	return TDB_SUCCESS; +} + +/* Slow, but should be very rare. */ +tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off) +{ +	size_t len; +	enum TDB_ERROR ecode; + +	for (len = 0; off + len < tdb->file->map_size; len++) { +		char c; +		ecode = tdb->methods->tread(tdb, off, &c, 1); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} +		if (c != 0 && c != 0x43) +			break; +	} +	return len; +} + +static enum TDB_ERROR check_linear(struct tdb_context *tdb, +				   tdb_off_t **used, size_t *num_used, +				   tdb_off_t **fr, size_t *num_free, +				   uint64_t features, tdb_off_t recovery) +{ +	tdb_off_t off; +	tdb_len_t len; +	enum TDB_ERROR ecode; +	bool found_recovery = false; + +	for (off = sizeof(struct tdb_header); +	     off < tdb->file->map_size; +	     off += len) { +		union { +			struct tdb_used_record u; +			struct tdb_free_record f; +			struct tdb_recovery_record r; +		} rec; +		/* r is larger: only get that if we need to. */ +		ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f)); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} + +		/* If we crash after ftruncate, we can get zeroes or fill. 
*/ +		if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC +		    || rec.r.magic ==  0x4343434343434343ULL) { +			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r)); +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} +			if (recovery == off) { +				found_recovery = true; +				len = sizeof(rec.r) + rec.r.max_len; +			} else { +				len = dead_space(tdb, off); +				if (TDB_OFF_IS_ERR(len)) { +					return len; +				} +				if (len < sizeof(rec.r)) { +					return tdb_logerr(tdb, TDB_ERR_CORRUPT, +							  TDB_LOG_ERROR, +							  "tdb_check: invalid" +							  " dead space at %zu", +							  (size_t)off); +				} + +				tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, +					   "Dead space at %zu-%zu (of %zu)", +					   (size_t)off, (size_t)(off + len), +					   (size_t)tdb->file->map_size); +			} +		} else if (rec.r.magic == TDB_RECOVERY_MAGIC) { +			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r)); +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} +			if (recovery != off) { +				return tdb_logerr(tdb, TDB_ERR_CORRUPT, +						  TDB_LOG_ERROR, +						  "tdb_check: unexpected" +						  " recovery record at offset" +						  " %zu", +						  (size_t)off); +			} +			if (rec.r.len > rec.r.max_len) { +				return tdb_logerr(tdb, TDB_ERR_CORRUPT, +						  TDB_LOG_ERROR, +						  "tdb_check: invalid recovery" +						  " length %zu", +						  (size_t)rec.r.len); +			} +			if (rec.r.eof > tdb->file->map_size) { +				return tdb_logerr(tdb, TDB_ERR_CORRUPT, +						  TDB_LOG_ERROR, +						  "tdb_check: invalid old EOF" +						  " %zu", (size_t)rec.r.eof); +			} +			found_recovery = true; +			len = sizeof(rec.r) + rec.r.max_len; +		} else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) { +			len = sizeof(rec.u) + frec_len(&rec.f); +			if (off + len > tdb->file->map_size) { +				return tdb_logerr(tdb, TDB_ERR_CORRUPT, +						  TDB_LOG_ERROR, +						  "tdb_check: free overlength" +						  " %llu at offset %llu", +						  (long long)len, +						  (long long)off); +			} +			/* This 
record should be in free lists. */ +			if (frec_ftable(&rec.f) != TDB_FTABLE_NONE +			    && !append(fr, num_free, off)) { +				return tdb_logerr(tdb, TDB_ERR_OOM, +						  TDB_LOG_ERROR, +						  "tdb_check: tracking %zu'th" +						  " free record.", *num_free); +			} +		} else if (rec_magic(&rec.u) == TDB_USED_MAGIC +			   || rec_magic(&rec.u) == TDB_CHAIN_MAGIC +			   || rec_magic(&rec.u) == TDB_HTABLE_MAGIC +			   || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) { +			uint64_t klen, dlen, extra; + +			/* This record is used! */ +			if (!append(used, num_used, off)) { +				return tdb_logerr(tdb, TDB_ERR_OOM, +						  TDB_LOG_ERROR, +						  "tdb_check: tracking %zu'th" +						  " used record.", *num_used); +			} + +			klen = rec_key_length(&rec.u); +			dlen = rec_data_length(&rec.u); +			extra = rec_extra_padding(&rec.u); + +			len = sizeof(rec.u) + klen + dlen + extra; +			if (off + len > tdb->file->map_size) { +				return tdb_logerr(tdb, TDB_ERR_CORRUPT, +						  TDB_LOG_ERROR, +						  "tdb_check: used overlength" +						  " %llu at offset %llu", +						  (long long)len, +						  (long long)off); +			} + +			if (len < sizeof(rec.f)) { +				return tdb_logerr(tdb, TDB_ERR_CORRUPT, +						  TDB_LOG_ERROR, +						  "tdb_check: too short record" +						  " %llu at %llu", +						  (long long)len, +						  (long long)off); +			} + +			/* Check that records have correct 0 at end (but may +			 * not in future). 
*/ +			if (extra && !features) { +				const char *p; +				char c; +				p = tdb_access_read(tdb, off + sizeof(rec.u) +						    + klen + dlen, 1, false); +				if (TDB_PTR_IS_ERR(p)) +					return TDB_PTR_ERR(p); +				c = *p; +				tdb_access_release(tdb, p); + +				if (c != '\0') { +					return tdb_logerr(tdb, TDB_ERR_CORRUPT, +							  TDB_LOG_ERROR, +							  "tdb_check:" +							  " non-zero extra" +							  " at %llu", +							  (long long)off); +				} +			} +		} else { +			return tdb_logerr(tdb, TDB_ERR_CORRUPT, +					  TDB_LOG_ERROR, +					  "tdb_check: Bad magic 0x%llx" +					  " at offset %zu", +					  (long long)rec_magic(&rec.u), +					  (size_t)off); +		} +	} + +	/* We must have found recovery area if there was one. */ +	if (recovery != 0 && !found_recovery) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_check: expected a recovery area at %zu", +				  (size_t)recovery); +	} + +	return TDB_SUCCESS; +} + +enum TDB_ERROR tdb_check_(struct tdb_context *tdb, +			  enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *), +			  void *data) +{ +	tdb_off_t *fr = NULL, *used = NULL, ft, recovery; +	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0; +	uint64_t features; +	enum TDB_ERROR ecode; + +	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false); +	if (ecode != TDB_SUCCESS) { +		return tdb->last_error = ecode; +	} + +	ecode = tdb_lock_expand(tdb, F_RDLCK); +	if (ecode != TDB_SUCCESS) { +		tdb_allrecord_unlock(tdb, F_RDLCK); +		return tdb->last_error = ecode; +	} + +	ecode = check_header(tdb, &recovery, &features); +	if (ecode != TDB_SUCCESS) +		goto out; + +	/* First we do a linear scan, checking all records. 
*/ +	ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features, +			     recovery); +	if (ecode != TDB_SUCCESS) +		goto out; + +	for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) { +		if (TDB_OFF_IS_ERR(ft)) { +			ecode = ft; +			goto out; +		} +		ecode = check_free_table(tdb, ft, num_ftables, fr, num_free, +					 &num_found); +		if (ecode != TDB_SUCCESS) +			goto out; +		num_ftables++; +	} + +	/* FIXME: Check key uniqueness? */ +	ecode = check_hash(tdb, used, num_used, num_ftables, check, data); +	if (ecode != TDB_SUCCESS) +		goto out; + +	if (num_found != num_free) { +		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				   "tdb_check: Not all entries are in" +				   " free table"); +	} + +out: +	tdb_allrecord_unlock(tdb, F_RDLCK); +	tdb_unlock_expand(tdb, F_RDLCK); +	free(fr); +	free(used); +	return tdb->last_error = ecode; +} diff --git a/lib/tdb2/doc/TDB1_porting.txt b/lib/tdb2/doc/TDB1_porting.txt new file mode 100644 index 0000000000..90ba249738 --- /dev/null +++ b/lib/tdb2/doc/TDB1_porting.txt @@ -0,0 +1,44 @@ +Interface differences between TDB1 and TDB2. + +- tdb2 uses 'struct tdb_data', tdb1 uses 'struct TDB_DATA'.  Use the +  TDB_DATA typedef if you want portability between the two. + +- tdb2 functions return 0 on success, and a negative error on failure, +  whereas tdb1 functions returned 0 on success, and -1 on failure. +  tdb1 then used tdb_error() to determine the error; this is also +  supported in tdb2 to ease backwards compatibility, though the other +  form is preferred. + +- tdb2's tdb_fetch() returns an error, tdb1's returned the data directly +  (or tdb_null, and you were supposed to check tdb_error() to find out why). + +- tdb2's tdb_nextkey() frees the old key's dptr, in tdb1 you needed to do +  this manually. + +- tdb1's tdb_open/tdb_open_ex took an explicit hash size.  tdb2's hash table +  resizes as required. + +- tdb2 uses a linked list of attribute structures to implement logging and +  alternate hashes.  
tdb1 used tdb_open_ex, which was not extensible. + +- tdb2 does locking on read-only databases (ie. O_RDONLY passed to tdb_open). +  tdb1 did not: use the TDB_NOLOCK flag if you want to suppress locking. + +- tdb2's log function is simpler than tdb1's log function.  The string is +  already formatted, and it takes an enum tdb_log_level not a tdb_debug_level, +  and which has only three values: TDB_LOG_ERROR, TDB_LOG_USE_ERROR and +  TDB_LOG_WARNING. + +- tdb2 provides tdb_deq() for comparing two struct tdb_data. + +- tdb2's tdb_name() returns a copy of the name even for TDB_INTERNAL dbs. + +- tdb2 does not need tdb_reopen() or tdb_reopen_all().  If you call +  fork() during certain operations the child should close the +  tdb, or complete the operations before continuing to use the tdb: + +	tdb_transaction_start(): child must tdb_transaction_cancel() +	tdb_lockall(): child must call tdb_unlockall() +	tdb_lockall_read(): child must call tdb_unlockall_read() +	tdb_chainlock(): child must call tdb_chainunlock() +	tdb_parse() callback: child must return from tdb_parse() diff --git a/lib/tdb2/doc/design-1.3.txt b/lib/tdb2/doc/design-1.3.txt new file mode 100644 index 0000000000..f81ecf7885 --- /dev/null +++ b/lib/tdb2/doc/design-1.3.txt @@ -0,0 +1,1049 @@ +TDB2: Redesigning The Trivial DataBase + +Rusty Russell, IBM Corporation + +27-April-2010 + +Abstract + +The Trivial DataBase on-disk format is 32 bits; with usage cases +heading towards the 4G limit, that must change. This required +breakage provides an opportunity to revisit TDB's other design +decisions and reassess them. + +1 Introduction + +The Trivial DataBase was originally written by Andrew Tridgell as +a simple key/data pair storage system with the same API as dbm, +but allowing multiple readers and writers while being small +enough (< 1000 lines of C) to include in SAMBA. 
The simple design +created in 1999 has proven surprisingly robust and performant, +used in Samba versions 3 and 4 as well as numerous other +projects. Its useful life was greatly increased by the +(backwards-compatible!) addition of transaction support in 2005. + +The wider variety and greater demands of TDB-using code has lead +to some organic growth of the API, as well as some compromises on +the implementation. None of these, by themselves, are seen as +show-stoppers, but the cumulative effect is to a loss of elegance +over the initial, simple TDB implementation. Here is a table of +the approximate number of lines of implementation code and number +of API functions at the end of each year: + + ++-----------+----------------+--------------------------------+ +| Year End  | API Functions  | Lines of C Code Implementation | ++-----------+----------------+--------------------------------+ ++-----------+----------------+--------------------------------+ +|   1999    |      13        |              1195              | ++-----------+----------------+--------------------------------+ +|   2000    |      24        |              1725              | ++-----------+----------------+--------------------------------+ +|   2001    |      32        |              2228              | ++-----------+----------------+--------------------------------+ +|   2002    |      35        |              2481              | ++-----------+----------------+--------------------------------+ +|   2003    |      35        |              2552              | ++-----------+----------------+--------------------------------+ +|   2004    |      40        |              2584              | ++-----------+----------------+--------------------------------+ +|   2005    |      38        |              2647              | ++-----------+----------------+--------------------------------+ +|   2006    |      52        |              3754              | 
++-----------+----------------+--------------------------------+ +|   2007    |      66        |              4398              | ++-----------+----------------+--------------------------------+ +|   2008    |      71        |              4768              | ++-----------+----------------+--------------------------------+ +|   2009    |      73        |              5715              | ++-----------+----------------+--------------------------------+ + + +This review is an attempt to catalog and address all the known +issues with TDB and create solutions which address the problems +without significantly increasing complexity; all involved are far +too aware of the dangers of second system syndrome in rewriting a +successful project like this. + +2 API Issues + +2.1 tdb_open_ex Is Not Expandable + +The tdb_open() call was expanded to tdb_open_ex(), which added an +optional hashing function and an optional logging function +argument. Additional arguments to open would require the +introduction of a tdb_open_ex2 call etc. + +2.1.1 Proposed Solution + +tdb_open() will take a linked-list of attributes: + +enum tdb_attribute { + +    TDB_ATTRIBUTE_LOG = 0, + +    TDB_ATTRIBUTE_HASH = 1 + +}; + +struct tdb_attribute_base { + +    enum tdb_attribute attr; + +    union tdb_attribute *next; + +}; + +struct tdb_attribute_log { + +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG +*/ + +    tdb_log_func log_fn; + +    void *log_private; + +}; + +struct tdb_attribute_hash { + +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH +*/ + +    tdb_hash_func hash_fn; + +    void *hash_private; + +}; + +union tdb_attribute { + +    struct tdb_attribute_base base; + +    struct tdb_attribute_log log; + +    struct tdb_attribute_hash hash; + +}; + +This allows future attributes to be added, even if this expands +the size of the union. 
+ +2.2 tdb_traverse Makes Impossible Guarantees + +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, +and it was thought that it was important to guarantee that all +records which exist at the start and end of the traversal would +be included, and no record would be included twice. + +This adds complexity (see[Reliable-Traversal-Adds]) and does not +work anyway for records which are altered (in particular, those +which are expanded may be effectively deleted and re-added behind +the traversal). + +2.2.1 <traverse-Proposed-Solution>Proposed Solution + +Abandon the guarantee. You will see every record if no changes +occur during your traversal, otherwise you will see some subset. +You can prevent changes by using a transaction or the locking +API. + +2.3 Nesting of Transactions Is Fraught + +TDB has alternated between allowing nested transactions and not +allowing them. Various paths in the Samba codebase assume that +transactions will nest, and in a sense they can: the operation is +only committed to disk when the outer transaction is committed. +There are two problems, however: + +1. Canceling the inner transaction will cause the outer +  transaction commit to fail, and will not undo any operations +  since the inner transaction began. This problem is soluble with +  some additional internal code. + +2. An inner transaction commit can be cancelled by the outer +  transaction. This is desirable in the way which Samba's +  database initialization code uses transactions, but could be a +  surprise to any users expecting a successful transaction commit +  to expose changes to others. + +The current solution is to specify the behavior at tdb_open(), +with the default currently that nested transactions are allowed. +This flag can also be changed at runtime. + +2.3.1 Proposed Solution + +Given the usage patterns, it seems that the “least-surprise” +behavior of disallowing nested transactions should become the +default. 
Additionally, it seems the outer transaction is the only +code which knows whether inner transactions should be allowed, so +a flag to indicate this could be added to tdb_transaction_start. +However, this behavior can be simulated with a wrapper which uses +tdb_add_flags() and tdb_remove_flags(), so the API should not be +expanded for this relatively-obscure case. + +2.4 Incorrect Hash Function is Not Detected + +tdb_open_ex() allows the calling code to specify a different hash +function to use, but does not check that all other processes +accessing this tdb are using the same hash function. The result +is that records are missing from tdb_fetch(). + +2.4.1 Proposed Solution + +The header should contain an example hash result (eg. the hash of +0xdeadbeef), and tdb_open_ex() should check that the given hash +function produces the same answer, or fail the tdb_open call. + +2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation + +In response to scalability issues with the free list ([TDB-Freelist-Is] +) two API workarounds have been incorporated in TDB: +tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The +latter actually calls the former with an argument of “5”. + +This code allows deleted records to accumulate without putting +them in the free list. On delete we iterate through each chain +and free them in a batch if there are more than max_dead entries. +These are never otherwise recycled except as a side-effect of a +tdb_repack. + +2.5.1 Proposed Solution + +With the scalability problems of the freelist solved, this API +can be removed. The TDB_VOLATILE flag may still be useful as a +hint that store and delete of records will be at least as common +as fetch in order to allow some internal tuning, but initially +will become a no-op. + +2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times +  In The Same Process + +No process can open the same TDB twice; we check and disallow it. 
+This is an unfortunate side-effect of fcntl locks, which operate +on a per-file rather than per-file-descriptor basis, and do not +nest. Thus, closing any file descriptor on a file clears all the +locks obtained by this process, even if they were placed using a +different file descriptor! + +Note that even if this were solved, deadlock could occur if +operations were nested: this is a more manageable programming +error in most cases. + +2.6.1 Proposed Solution + +We could lobby POSIX to fix the perverse rules, or at least lobby +Linux to violate them so that the most common implementation does +not have this restriction. This would be a generally good idea +for other fcntl lock users. + +Samba uses a wrapper which hands out the same tdb_context to +multiple callers if this happens, and does simple reference +counting. We should do this inside the tdb library, which already +emulates lock nesting internally; it would need to recognize when +deadlock occurs within a single process. This would create a new +failure mode for tdb operations (while we currently handle +locking failures, they are impossible in normal use and a process +encountering them can do little but give up). + +I do not see benefit in an additional tdb_open flag to indicate +whether re-opening is allowed, though there may be some +benefit to adding a call to detect when a tdb_context is shared, +to allow others to create such an API. + +2.7 TDB API Is Not POSIX Thread-safe + +The TDB API uses an error code which can be queried after an +operation to determine what went wrong. This programming model +does not work with threads, unless specific additional guarantees +are given by the implementation. In addition, even +otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot] +). 
+ +2.7.1 Proposed Solution + +Rearchitecting the API to include a tdb_errcode pointer would be a +great deal of churn; we are better to guarantee that the +tdb_errcode is per-thread so the current programming model can be +maintained. + +This requires dynamic per-thread allocations, which is awkward +with POSIX threads (pthread_key_create space is limited and we +cannot simply allocate a key for every TDB). + +Internal locking is required to make sure that fcntl locks do not +overlap between threads, and also that the global list of tdbs is +maintained. + +The aim is that building tdb with -DTDB_PTHREAD will result in a +pthread-safe version of the library, and otherwise no overhead +will exist. + +2.8 *_nonblock Functions And *_mark Functions Expose +  Implementation + +CTDB[footnote: +Clustered TDB, see http://ctdb.samba.org +] wishes to operate on TDB in a non-blocking manner. This is +currently done as follows: + +1. Call the _nonblock variant of an API function (eg. +  tdb_lockall_nonblock). If this fails: + +2. Fork a child process, and wait for it to call the normal +  variant (eg. tdb_lockall). + +3. If the child succeeds, call the _mark variant to indicate we +  already have the locks (eg. tdb_lockall_mark). + +4. Upon completion, tell the child to release the locks (eg. +  tdb_unlockall). + +5. Indicate to tdb that it should consider the locks removed (eg. +  tdb_unlockall_mark). + +There are several issues with this approach. Firstly, adding two +new variants of each function clutters the API for an obscure +use, and so not all functions have three variants. Secondly, it +assumes that all paths of the functions ask for the same locks, +otherwise the parent process will have to get a lock which the +child doesn't have under some circumstances. I don't believe this +is currently the case, but it constrains the implementation. 
+ +2.8.1 <Proposed-Solution-locking-hook>Proposed Solution + +Implement a hook for locking methods, so that the caller can +control the calls to create and remove fcntl locks. In this +scenario, ctdbd would operate as follows: + +1. Call the normal API function, eg tdb_lockall(). + +2. When the lock callback comes in, check if the child has the +  lock. Initially, this is always false. If so, return 0. +  Otherwise, try to obtain it in non-blocking mode. If that +  fails, return EWOULDBLOCK. + +3. Release locks in the unlock callback as normal. + +4. If tdb_lockall() fails, see if we recorded a lock failure; if +  so, call the child to repeat the operation. + +5. The child records what locks it obtains, and returns that +  information to the parent. + +6. When the child has succeeded, goto 1. + +This is flexible enough to handle any potential locking scenario, +even when lock requirements change. It can be optimized so that +the parent does not release locks, just tells the child which +locks it doesn't need to obtain. + +It also keeps the complexity out of the API, and in ctdbd where +it is needed. + +2.9 tdb_chainlock Functions Expose Implementation + +tdb_chainlock locks some number of records, including the record +indicated by the given key. This gave atomicity guarantees; +no-one can start a transaction, alter, read or delete that key +while the lock is held. + +It also makes the same guarantee for any other key in the chain, +which is an internal implementation detail and potentially a +cause for deadlock. + +2.9.1 Proposed Solution + +None. It would be nice to have an explicit single entry lock +which effected no other keys. Unfortunately, this won't work for +an entry which doesn't exist. Thus while chainlock may be +implemented more efficiently for the existing case, it will still +have overlap issues with the non-existing case. 
So it is best to +keep the current (lack of) guarantee about which records will be +effected to avoid constraining our implementation. + +2.10 Signal Handling is Not Race-Free + +The tdb_setalarm_sigptr() call allows the caller's signal handler +to indicate that the tdb locking code should return with a +failure, rather than trying again when a signal is received (and +errno == EAGAIN). This is usually used to implement timeouts. + +Unfortunately, this does not work in the case where the signal is +received before the tdb code enters the fcntl() call to place the +lock: the code will sleep within the fcntl() code, unaware that +the signal wants it to exit. In the case of long timeouts, this +does not happen in practice. + +2.10.1 Proposed Solution + +The locking hooks proposed in[Proposed-Solution-locking-hook] +would allow the user to decide on whether to fail the lock +acquisition on a signal. This allows the caller to choose their +own compromise: they could narrow the race by checking +immediately before the fcntl call.[footnote: +It may be possible to make this race-free in some implementations +by having the signal handler alter the struct flock to make it +invalid. This will cause the fcntl() lock call to fail with +EINVAL if the signal occurs before the kernel is entered, +otherwise EAGAIN. +] + +2.11 The API Uses Gratuitous Typedefs, Capitals + +typedefs are useful for providing source compatibility when types +can differ across implementations, or arguably in the case of +function pointer definitions which are hard for humans to parse. +Otherwise it is simply obfuscation and pollutes the namespace. + +Capitalization is usually reserved for compile-time constants and +macros. + +  TDB_CONTEXT There is no reason to use this over 'struct +  tdb_context'; the definition isn't visible to the API user +  anyway. + +  TDB_DATA There is no reason to use this over struct TDB_DATA; +  the struct needs to be understood by the API user. 
+ +  struct TDB_DATA This would normally be called 'struct +  tdb_data'. + +  enum TDB_ERROR Similarly, this would normally be enum +  tdb_error. + +2.11.1 Proposed Solution + +None. Introducing lower case variants would please pedants like +myself, but if it were done the existing ones should be kept. +There is little point forcing a purely cosmetic change upon tdb +users. + +2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The +  Private Pointer + +For API compatibility reasons, the logging function needs to call +tdb_get_logging_private() to retrieve the pointer registered by +the tdb_open_ex for logging. + +2.12.1 Proposed Solution + +It should simply take an extra argument, since we are prepared to +break the API/ABI. + +2.13 Various Callback Functions Are Not Typesafe + +The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] + is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read +and tdb_check all take void * and must internally convert it to +the argument type they were expecting. + +If this type changes, the compiler will not produce warnings on +the callers, since it only sees void *. + +2.13.1 Proposed Solution + +With careful use of macros, we can create callback functions +which give a warning when used on gcc and the types of the +callback and its private argument differ. Unsupported compilers +will not give a warning, which is no worse than now. In addition, +the callbacks become clearer, as they need not use void * for +their parameter. + +See CCAN's typesafe_cb module at +http://ccan.ozlabs.org/info/typesafe_cb.html + +2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, +  tdb_reopen_all Problematic + +The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB +file should be cleared if the caller discovers it is the only +process with the TDB open. 
However, if any caller does not +specify TDB_CLEAR_IF_FIRST it will not be detected, so will have +the TDB erased underneath them (usually resulting in a crash). + +There is a similar issue on fork(); if the parent exits (or +otherwise closes the tdb) before the child calls tdb_reopen_all() +to establish the lock used to indicate the TDB is opened by +someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe +it alone has opened the TDB and will erase it. + +2.14.1 Proposed Solution + +Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but +see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. + +3 Performance And Scalability Issues + +3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST +  Imposes Performance Penalty + +When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is +placed at offset 4 (aka. the ACTIVE_LOCK). While these locks +never conflict in normal tdb usage, they do add substantial +overhead for most fcntl lock implementations when the kernel +scans to detect if a lock conflict exists. This is often a single +linked list, making the time to acquire and release a fcntl lock +O(N) where N is the number of processes with the TDB open, not +the number actually doing work. + +In a Samba server it is common to have huge numbers of clients +sitting idle, and thus they have weaned themselves off the +TDB_CLEAR_IF_FIRST flag.[footnote: +There is a flag to tdb_reopen_all() which is used for this +optimization: if the parent process will outlive the child, the +child does not need the ACTIVE_LOCK. This is a workaround for +this very performance issue. +] + +3.1.1 Proposed Solution + +Remove the flag. It was a neat idea, but even trivial servers +tend to know when they are initializing for the first time and +can simply unlink the old tdb at that point. + +3.2 TDB Files Have a 4G Limit + +This seems to be becoming an issue (so much for “trivial”!), +particularly for ldb. 
+ +3.2.1 Proposed Solution + +A new, incompatible TDB format which uses 64 bit offsets +internally rather than 32 bit as now. For simplicity of endian +conversion (which TDB does on the fly if required), all values +will be 64 bit on disk. In practice, some upper bits may be used +for other purposes, but at least 56 bits will be available for +file offsets. + +tdb_open() will automatically detect the old version, and even +create them if TDB_VERSION6 is specified to tdb_open. + +32 bit processes will still be able to access TDBs larger than 4G +(assuming that their off_t allows them to seek to 64 bits), they +will gracefully fall back as they fail to mmap. This can happen +already with large TDBs. + +Old versions of tdb will fail to open the new TDB files (since 28 +August 2009, commit 398d0c29290: prior to that any unrecognized +file format would be erased and initialized as a fresh tdb!) + +3.3 TDB Records Have a 4G Limit + +This has not been a reported problem, and the API uses size_t +which can be 64 bit on 64 bit platforms. However, other limits +may have made such an issue moot. + +3.3.1 Proposed Solution + +Record sizes will be 64 bit, with an error returned on 32 bit +platforms which try to access such records (the current +implementation would return TDB_ERR_OOM in a similar case). It +seems unlikely that 32 bit keys will be a limitation, so the +implementation may not support this (see [sub:Records-Incur-A]). + +3.4 Hash Size Is Determined At TDB Creation Time + +TDB contains a number of hash chains in the header; the number is +specified at creation time, and defaults to 131. This is such a +bottleneck on large databases (as each hash chain gets quite +long), that LDB uses 10,000 for this hash. In general it is +impossible to know what the 'right' answer is at database +creation time. 
+ +3.4.1 Proposed Solution + +After comprehensive performance testing on various scalable hash +variants[footnote: +http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 +This was annoying because I was previously convinced that an +expanding tree of hashes would be very close to optimal. +], it became clear that it is hard to beat a straight linear hash +table which doubles in size when it reaches saturation. There are +three details which become important: + +1. On encountering a full bucket, we use the next bucket. + +2. Extra hash bits are stored with the offset, to reduce +  comparisons. + +3. A marker entry is used on deleting an entry. + +The doubling of the table must be done under a transaction; we +will not reduce it on deletion, so it will be an unusual case. It +will either be placed at the head (other entries will be moved +out the way so we can expand). We could have a pointer in the +header to the current hashtable location, but that pointer would +have to be read frequently to check for hashtable moves. + +The locking for this is slightly more complex than the chained +case; we currently have one lock per bucket, and that means we +would need to expand the lock if we overflow to the next bucket. +The frequency of such collisions will affect our locking +heuristics: we can always lock more buckets than we need. + +One possible optimization is to only re-check the hash size on an +insert or a lookup miss. + +3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended + +TDB uses a single linked list for the free list. Allocation +occurs as follows, using heuristics which have evolved over time: + +1. Get the free list lock for this whole operation. + +2. Multiply length by 1.25, so we always over-allocate by 25%. + +3. Set the slack multiplier to 1. + +4. Examine the current freelist entry: if it is > length but < +  the current best case, remember it as the best case. + +5. Multiply the slack multiplier by 1.05. + +6. 
If our best fit so far is less than length * slack multiplier, +  return it. The slack will be turned into a new free record if +  it's large enough. + +7. Otherwise, go onto the next freelist entry. + +Deleting a record occurs as follows: + +1. Lock the hash chain for this whole operation. + +2. Walk the chain to find the record, keeping the prev pointer +  offset. + +3. If max_dead is non-zero: + +  (a) Walk the hash chain again and count the dead records. + +  (b) If it's more than max_dead, bulk free all the dead ones +    (similar to steps 4 and below, but the lock is only obtained +    once). + +  (c) Simply mark this record as dead and return. + +4. Get the free list lock for the remainder of this operation. + +5. <right-merging>Examine the following block to see if it is +  free; if so, enlarge the current block and remove that block +  from the free list. This was disabled, as removal from the free +  list was O(entries-in-free-list). + +6. Examine the preceding block to see if it is free: for this +  reason, each block has a 32-bit tailer which indicates its +  length. If it is free, expand it to cover our new block and +  return. + +7. Otherwise, prepend ourselves to the free list. + +Disabling right-merging (step [right-merging]) causes +fragmentation; the other heuristics proved insufficient to +address this, so the final answer to this was that when we expand +the TDB file inside a transaction commit, we repack the entire +tdb. + +The single list lock limits our allocation rate; due to the other +issues this is not currently seen as a bottleneck. + +3.5.1 Proposed Solution + +The first step is to remove all the current heuristics, as they +obviously interact, then examine them once the lock contention is +addressed. + +The free list must be split to reduce contention. Assuming +perfect free merging, we can at most have 1 free list entry for +each entry. 
This implies that the number of free lists is related +to the size of the hash table, but as it is rare to walk a large +number of free list entries we can use far fewer, say 1/32 of the +number of hash buckets. + +There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented] +) but it's not clear this would reduce contention in the common +case where all processes are allocating/freeing the same size. +Thus we almost certainly need to divide in other ways: the most +obvious is to divide the file into zones, and using a free list +(or set of free lists) for each. This approximates address +ordering. + +Note that this means we need to split the free lists when we +expand the file; this is probably acceptable when we double the +hash table size, since that is such an expensive operation +already. In the case of increasing the file size, there is an +optimization we can use: if we use M in the formula above as the +file size rounded up to the next power of 2, we only need +reshuffle free lists when the file size crosses a power of 2 +boundary, and reshuffling the free lists is trivial: we simply +merge every consecutive pair of free lists. + +The basic algorithm is as follows. Freeing is simple: + +1. Identify the correct zone. + +2. Lock the corresponding list. + +3. Re-check the zone (we didn't have a lock, sizes could have +  changed): relock if necessary. + +4. Place the freed entry in the list for that zone. + +Allocation is a little more complicated, as we perform delayed +coalescing at this point: + +1. Pick a zone either the zone we last freed into, or based on a “ +  random” number. + +2. Lock the corresponding list. + +3. Re-check the zone: relock if necessary. + +4. If the top entry is large enough, remove it from the list and +  return it. + +5. Otherwise, coalesce entries in the list. + +  (a) + +  (b) + +  (c) + +  (d) + +6. If there was no entry large enough, unlock the list and try +  the next zone. + +7. + +8. + +9. 
If no zone satisfies, expand the file. + +This optimizes rapid insert/delete of free list entries by not +coalescing them all the time. First-fit address ordering +seems to be fairly good for keeping fragmentation low +(see [sub:TDB-Becomes-Fragmented]). Note that address ordering +does not need a tailer to coalesce, though if we needed one we +could have one cheaply: see [sub:Records-Incur-A]. + + + +I anticipate that the number of entries in each free zone would +be small, but it might be worth using one free entry to hold +pointers to the others for cache efficiency. + +3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented + +Much of this is a result of allocation strategy[footnote: +The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 +ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps +] and deliberate hobbling of coalescing; internal fragmentation +(aka overallocation) is deliberately set at 25%, and external +fragmentation is only cured by the decision to repack the entire +db when a transaction commit needs to enlarge the file. + +3.6.1 Proposed Solution + +The 25% overhead on allocation works in practice for ldb because +indexes tend to expand by one record at a time. This internal +fragmentation can be resolved by having an “expanded” bit in the +header to note entries that have previously expanded, and +allocating more space for them. + +There is a spectrum of possible solutions for external +fragmentation: one is to use a fragmentation-avoiding allocation +strategy such as best-fit address-order allocator. The other end +of the spectrum would be to use a bump allocator (very fast and +simple) and simply repack the file when we reach the end. + +There are three problems with efficient fragmentation-avoiding +allocators: they are non-trivial, they tend to use a single free +list for each size, and there's no evidence that tdb allocation +patterns will match those recorded for general allocators (though +it seems likely). 
+ +Thus we don't spend too much effort on external fragmentation; we +will be no worse than the current code if we need to repack on +occasion. More effort is spent on reducing freelist contention, +and reducing overhead. + +3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead + +Each TDB record has a header as follows: + +struct tdb_record { + +        tdb_off_t next; /* offset of the next record in the list +*/ + +        tdb_len_t rec_len; /* total byte length of record */ + +        tdb_len_t key_len; /* byte length of key */ + +        tdb_len_t data_len; /* byte length of data */ + +        uint32_t full_hash; /* the full 32 bit hash of the key */ + +        uint32_t magic;   /* try to catch errors */ + +        /* the following union is implied: + +                union { + +                        char record[rec_len]; + +                        struct { + +                                char key[key_len]; + +                                char data[data_len]; + +                        } + +                        uint32_t totalsize; (tailer) + +                } + +        */ + +}; + +Naively, this would double to a 56-byte overhead on a 64 bit +implementation. + +3.7.1 Proposed Solution + +We can use various techniques to reduce this for an allocated +block: + +1. The 'next' pointer is not required, as we are using a flat +  hash table. + +2. 'rec_len' can instead be expressed as an addition to key_len +  and data_len (it accounts for wasted or overallocated length in +  the record). Since the record length is always a multiple of 8, +  we can conveniently fit it in 32 bits (representing up to 35 +  bits). + +3. 'key_len' and 'data_len' can be reduced. I'm unwilling to +  restrict 'data_len' to 32 bits, but instead we can combine the +  two into one 64-bit field and using a 5 bit value which +  indicates at what bit to divide the two. Keys are unlikely to +  scale as fast as data, so I'm assuming a maximum key size of 32 +  bits. + +4. 
'full_hash' is used to avoid a memcmp on the “miss” case, but +  this is diminishing returns after a handful of bits (at 10 +  bits, it reduces 99.9% of false memcmp). As an aside, as the +  lower bits are already incorporated in the hash table +  resolution, the upper bits should be used here. + +5. 'magic' does not need to be enlarged: it currently reflects +  one of 5 values (used, free, dead, recovery, and +  unused_recovery). It is useful for quick sanity checking +  however, and should not be eliminated. + +6. 'tailer' is only used to coalesce free blocks (so a block to +  the right can find the header to check if this block is free). +  This can be replaced by a single 'free' bit in the header of +  the following block (and the tailer only exists in free +  blocks).[footnote: +This technique from Thomas Standish. Data Structure Techniques. +Addison-Wesley, Reading, Massachusetts, 1980. +] The current proposed coalescing algorithm doesn't need this, +  however. + +This produces a 16 byte used header like this: + +struct tdb_used_record { + +        uint32_t magic : 16, + +                 prev_is_free: 1, + +                 key_data_divide: 5, + +                 top_hash: 10; + +        uint32_t extra_octets; + +        uint64_t key_and_data_len; + +}; + +And a free record like this: + +struct tdb_free_record { + +        uint32_t free_magic; + +        uint64_t total_length; + +        ... + +        uint64_t tailer; + +}; + + + +3.8 Transaction Commit Requires 4 fdatasync + +The current transaction algorithm is: + +1. write_recovery_data(); + +2. sync(); + +3. write_recovery_header(); + +4. sync(); + +5. overwrite_with_new_data(); + +6. sync(); + +7. remove_recovery_header(); + +8. sync(); + +On current ext3, each sync flushes all data to disk, so the next +3 syncs are relatively expensive. But this could become a +performance bottleneck on other filesystems such as ext4. 
+ +3.8.1 Proposed Solution + + + + + + + + + +Neil Brown points out that this is overzealous, and only one sync +is needed: + +1. Bundle the recovery data, a transaction counter and a strong +  checksum of the new data. + +2. Strong checksum that whole bundle. + +3. Store the bundle in the database. + +4. Overwrite the oldest of the two recovery pointers in the +  header (identified using the transaction counter) with the +  offset of this bundle. + +5. sync. + +6. Write the new data to the file. + +Checking for recovery means identifying the latest bundle with a +valid checksum and using the new data checksum to ensure that it +has been applied. This is more expensive than the current check, +but need only be done at open. For running databases, a separate +header field can be used to indicate a transaction in progress; +we need only check for recovery if this is set. + +3.9 TDB Does Not Have Snapshot Support + +3.9.1 Proposed Solution + +None. At some point you say “use a real database”. + +But as a thought experiment, if we implemented transactions to +only overwrite free entries (this is tricky: there must not be a +header in each entry which indicates whether it is free, but use +of presence in metadata elsewhere), and a pointer to the hash +table, we could create an entirely new commit without destroying +existing data. Then it would be easy to implement snapshots in a +similar way. + +This would not allow arbitrary changes to the database, such as +tdb_repack does, and would require more space (since we have to +preserve the current and future entries at once). If we used hash +trees rather than one big hash table, we might only have to +rewrite some sections of the hash, too. + +We could then implement snapshots using a similar method, using +multiple different hash tables/free tables. + +3.10 Transactions Cannot Operate in Parallel + +This would be useless for ldb, as it hits the index records with +just about every update. 
It would add significant complexity in +resolving clashes, and cause all transaction callers to write +their code to loop in the case where the transactions spuriously +failed. + +3.10.1 Proposed Solution + +We could solve a small part of the problem by providing read-only +transactions. These would allow one write transaction to begin, +but it could not commit until all r/o transactions are done. This +would require a new RO_TRANSACTION_LOCK, which would be upgraded +on commit. + +3.11 Default Hash Function Is Suboptimal + +The Knuth-inspired multiplicative hash used by tdb is fairly slow +(especially if we expand it to 64 bits), and works best when the +hash bucket size is a prime number (which also means a slow +modulus). In addition, it is highly predictable, which could +potentially lead to a Denial of Service attack in some TDB uses. + +3.11.1 Proposed Solution + +The Jenkins lookup3 hash[footnote: +http://burtleburtle.net/bob/c/lookup3.c +] is a fast and superbly-mixing hash. It's used by the Linux +kernel and almost everything else. This has the particular +properties that it takes an initial seed, and produces two 32 bit +hash numbers, which we can combine into a 64-bit hash. + +The seed should be created at tdb-creation time from some random +source, and placed in the header. This is far from foolproof, but +adds a little bit of protection against hash bombing. + +3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity + +We lock a record during traversal iteration, and try to grab that +lock in the delete code. If that grab on delete fails, we simply +mark it deleted and continue onwards; traversal checks for this +condition and does the delete when it moves off the record. + +If traversal terminates, the dead record may be left +indefinitely. + +3.12.1 Proposed Solution + +Remove reliability guarantees; see [traverse-Proposed-Solution]. + +3.13 Fcntl Locking Adds Overhead + +Placing a fcntl lock means a system call, as does removing one. 
+This is actually one reason why transactions can be faster +(everything is locked once at transaction start). In the +uncontended case, this overhead can theoretically be eliminated. + +3.13.1 Proposed Solution + +None. + +We tried this before with spinlock support, in the early days of +TDB, and it didn't make much difference except in manufactured +benchmarks. + +We could use spinlocks (with futex kernel support under Linux), +but it means that we lose automatic cleanup when a process dies +with a lock. There is a method of auto-cleanup under Linux, but +it's not supported by other operating systems. We could +reintroduce a clear-if-first-style lock and sweep for dead +futexes on open, but that wouldn't help the normal case of one +concurrent opener dying. Increasingly elaborate repair schemes +could be considered, but they require an ABI change (everyone +must use them) anyway, so there's no need to do this at the same +time as everything else. diff --git a/lib/tdb2/doc/design.lyx b/lib/tdb2/doc/design.lyx new file mode 100644 index 0000000000..0a1d6a14bc --- /dev/null +++ b/lib/tdb2/doc/design.lyx @@ -0,0 +1,2689 @@ +#LyX 1.6.7 created this file. 
For more info see http://www.lyx.org/ +\lyxformat 345 +\begin_document +\begin_header +\textclass article +\use_default_options true +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 + +\graphics default +\paperfontsize default +\use_hyperref false +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes true +\output_changes true +\author "" +\author "" +\end_header + +\begin_body + +\begin_layout Title +TDB2: Redesigning The Trivial DataBase +\end_layout + +\begin_layout Author +Rusty Russell, IBM Corporation +\end_layout + +\begin_layout Date +17-March-2011 +\end_layout + +\begin_layout Abstract +The Trivial DataBase on-disk format is 32 bits; with use cases heading + towards the 4G limit, that must change. + This required breakage provides an opportunity to revisit TDB's other design + decisions and reassess them. +\end_layout + +\begin_layout Section +Introduction +\end_layout + +\begin_layout Standard +The Trivial DataBase was originally written by Andrew Tridgell as a simple + key/data pair storage system with the same API as dbm, but allowing multiple + readers and writers while being small enough (< 1000 lines of C) to include + in SAMBA. + The simple design created in 1999 has proven surprisingly robust and performant +, used in Samba versions 3 and 4 as well as numerous other projects. + Its useful life was greatly increased by the (backwards-compatible!) addition + of transaction support in 2005. 
+\end_layout + +\begin_layout Standard +The wider variety and greater demands of TDB-using code has led to some + organic growth of the API, as well as some compromises on the implementation. + None of these, by themselves, are seen as show-stoppers, but the cumulative + effect is a loss of elegance over the initial, simple TDB implementation. + Here is a table of the approximate number of lines of implementation code + and number of API functions at the end of each year: +\end_layout + +\begin_layout Standard +\begin_inset Tabular +<lyxtabular version="3" rows="12" columns="3"> +<features> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<row> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Year End +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +API Functions +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Lines of C Code Implementation +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +1999 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +13 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +1195 +\end_layout + +\end_inset +</cell> 
+</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2000 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +24 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +1725 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2001 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +32 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2228 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2002 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2481 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2003 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> 
+\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2552 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2004 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +40 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2584 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2005 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +38 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2647 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2006 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +52 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +3754 +\end_layout + +\end_inset +</cell> 
+</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2007 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +66 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +4398 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2008 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +71 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +4768 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2009 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +73 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +5715 +\end_layout + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_layout + +\begin_layout Standard +This review is an attempt to catalog and address all the known issues with + TDB and create solutions which address the problems without significantly + 
increasing complexity; all involved are far too aware of the dangers of + second system syndrome in rewriting a successful project like this. +\end_layout + +\begin_layout Section +API Issues +\end_layout + +\begin_layout Subsection +tdb_open_ex Is Not Expandable +\end_layout + +\begin_layout Standard +The tdb_open() call was expanded to tdb_open_ex(), which added an optional + hashing function and an optional logging function argument. + Additional arguments to open would require the introduction of a tdb_open_ex2 + call etc. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "attributes" + +\end_inset + + +\end_layout + +\begin_layout Standard +tdb_open() will take a linked-list of attributes: +\end_layout + +\begin_layout LyX-Code +enum tdb_attribute { +\end_layout + +\begin_layout LyX-Code +    TDB_ATTRIBUTE_LOG = 0, +\end_layout + +\begin_layout LyX-Code +    TDB_ATTRIBUTE_HASH = 1 +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_base { +\end_layout + +\begin_layout LyX-Code +    enum tdb_attribute attr; +\end_layout + +\begin_layout LyX-Code +    union tdb_attribute *next; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_log { +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ +\end_layout + +\begin_layout LyX-Code +    tdb_log_func log_fn; +\end_layout + +\begin_layout LyX-Code +    void *log_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_hash { +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ +\end_layout + +\begin_layout LyX-Code +    tdb_hash_func hash_fn; +\end_layout + +\begin_layout LyX-Code +    void *hash_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout 
LyX-Code +union tdb_attribute { +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_base base; +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_log log; +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_hash hash; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +This allows future attributes to be added, even if this expands the size + of the union. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +tdb_traverse Makes Impossible Guarantees +\end_layout + +\begin_layout Standard +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it + was thought that it was important to guarantee that all records which exist + at the start and end of the traversal would be included, and no record + would be included twice. +\end_layout + +\begin_layout Standard +This adds complexity (see +\begin_inset CommandInset ref +LatexCommand ref +reference "Reliable-Traversal-Adds" + +\end_inset + +) and does not work anyway for records which are altered (in particular, + those which are expanded may be effectively deleted and re-added behind + the traversal). +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "traverse-Proposed-Solution" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Abandon the guarantee. + You will see every record if no changes occur during your traversal, otherwise + you will see some subset. + You can prevent changes by using a transaction or the locking API. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. + Delete-during-traverse will still delete every record, too (assuming no + other changes). 
+\end_layout + +\begin_layout Subsection +Nesting of Transactions Is Fraught +\end_layout + +\begin_layout Standard +TDB has alternated between allowing nested transactions and not allowing + them. + Various paths in the Samba codebase assume that transactions will nest, + and in a sense they can: the operation is only committed to disk when the + outer transaction is committed. + There are two problems, however: +\end_layout + +\begin_layout Enumerate +Canceling the inner transaction will cause the outer transaction commit + to fail, and will not undo any operations since the inner transaction began. + This problem is soluble with some additional internal code. +\end_layout + +\begin_layout Enumerate +An inner transaction commit can be cancelled by the outer transaction. + This is desirable in the way which Samba's database initialization code + uses transactions, but could be a surprise to any users expecting a successful + transaction commit to expose changes to others. +\end_layout + +\begin_layout Standard +The current solution is to specify the behavior at tdb_open(), with the + default currently that nested transactions are allowed. + This flag can also be changed at runtime. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Given the usage patterns, it seems that the +\begin_inset Quotes eld +\end_inset + +least-surprise +\begin_inset Quotes erd +\end_inset + + behavior of disallowing nested transactions should become the default. + Additionally, it seems the outer transaction is the only code which knows + whether inner transactions should be allowed, so a flag to indicate this + could be added to tdb_transaction_start. + However, this behavior can be simulated with a wrapper which uses tdb_add_flags +() and tdb_remove_flags(), so the API should not be expanded for this relatively +-obscure case. 
+\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete; the nesting flag has been removed. +\end_layout + +\begin_layout Subsection +Incorrect Hash Function is Not Detected +\end_layout + +\begin_layout Standard +tdb_open_ex() allows the calling code to specify a different hash function + to use, but does not check that all other processes accessing this tdb + are using the same hash function. + The result is that records are missing from tdb_fetch(). +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain an example hash result (eg. + the hash of 0xdeadbeef), and tdb_open_ex() should check that the given + hash function produces the same answer, or fail the tdb_open call. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +tdb_set_max_dead/TDB_VOLATILE Expose Implementation +\end_layout + +\begin_layout Standard +In response to scalability issues with the free list ( +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB-Freelist-Is" + +\end_inset + +) two API workarounds have been incorporated in TDB: tdb_set_max_dead() + and the TDB_VOLATILE flag to tdb_open. + The latter actually calls the former with an argument of +\begin_inset Quotes eld +\end_inset + +5 +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +This code allows deleted records to accumulate without putting them in the + free list. + On delete we iterate through each chain and free them in a batch if there + are more than max_dead entries. + These are never otherwise recycled except as a side-effect of a tdb_repack. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +With the scalability problems of the freelist solved, this API can be removed. 
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete + of records will be at least as common as fetch in order to allow some internal + tuning, but initially will become a no-op. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. + Unknown flags cause tdb_open() to fail as well, so they can be detected + at runtime. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB-Files-Cannot" + +\end_inset + +TDB Files Cannot Be Opened Multiple Times In The Same Process +\end_layout + +\begin_layout Standard +No process can open the same TDB twice; we check and disallow it. + This is an unfortunate side-effect of fcntl locks, which operate on a per-file + rather than per-file-descriptor basis, and do not nest. + Thus, closing any file descriptor on a file clears all the locks obtained + by this process, even if they were placed using a different file descriptor! +\end_layout + +\begin_layout Standard +Note that even if this were solved, deadlock could occur if operations were + nested: this is a more manageable programming error in most cases. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We could lobby POSIX to fix the perverse rules, or at least lobby Linux + to violate them so that the most common implementation does not have this + restriction. + This would be a generally good idea for other fcntl lock users. +\end_layout + +\begin_layout Standard +Samba uses a wrapper which hands out the same tdb_context to multiple callers + if this happens, and does simple reference counting. + We should do this inside the tdb library, which already emulates lock nesting + internally; it would need to recognize when deadlock occurs within a single + process. 
+ This would create a new failure mode for tdb operations (while we currently + handle locking failures, they are impossible in normal use and a process + encountering them can do little but give up). +\end_layout + +\begin_layout Standard +I do not see benefit in an additional tdb_open flag to indicate whether + re-opening is allowed, though there may be some benefit to adding a + call to detect when a tdb_context is shared, to allow others to create such + an API. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +TDB API Is Not POSIX Thread-safe +\end_layout + +\begin_layout Standard +The TDB API uses an error code which can be queried after an operation to + determine what went wrong. + This programming model does not work with threads, unless specific additional + guarantees are given by the implementation. + In addition, even otherwise-independent threads cannot open the same TDB + (as in +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB-Files-Cannot" + +\end_inset + +). +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Rearchitecting the API to include a tdb_errcode pointer would be a great + deal of churn, but fortunately most functions return 0 on success and -1 + on error: we can change these to return 0 on success and a negative error + code on error, and the API remains similar to previous. + The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA + pointer and return an error code. + It is also simpler to have tdb_nextkey replace its key argument in place, + freeing up any old .dptr. +\end_layout + +\begin_layout Standard +Internal locking is required to make sure that fcntl locks do not overlap + between threads, and also that the global list of tdbs is maintained. 
+\end_layout + +\begin_layout Standard +The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe + version of the library, and otherwise no overhead will exist. + Alternatively, a hooking mechanism similar to that proposed for +\begin_inset CommandInset ref +LatexCommand ref +reference "Proposed-Solution-locking-hook" + +\end_inset + + could be used to enable pthread locking at runtime. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete; API has been changed but thread safety has not been implemented. +\end_layout + +\begin_layout Subsection +*_nonblock Functions And *_mark Functions Expose Implementation +\end_layout + +\begin_layout Standard +CTDB +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +Clustered TDB, see http://ctdb.samba.org +\end_layout + +\end_inset + + wishes to operate on TDB in a non-blocking manner. + This is currently done as follows: +\end_layout + +\begin_layout Enumerate +Call the _nonblock variant of an API function (eg. + tdb_lockall_nonblock). + If this fails: +\end_layout + +\begin_layout Enumerate +Fork a child process, and wait for it to call the normal variant (eg. + tdb_lockall). +\end_layout + +\begin_layout Enumerate +If the child succeeds, call the _mark variant to indicate we already have + the locks (eg. + tdb_lockall_mark). +\end_layout + +\begin_layout Enumerate +Upon completion, tell the child to release the locks (eg. + tdb_unlockall). +\end_layout + +\begin_layout Enumerate +Indicate to tdb that it should consider the locks removed (eg. + tdb_unlockall_mark). +\end_layout + +\begin_layout Standard +There are several issues with this approach. + Firstly, adding two new variants of each function clutters the API for + an obscure use, and so not all functions have three variants. 
+ Secondly, it assumes that all paths of the functions ask for the same locks, + otherwise the parent process will have to get a lock which the child doesn't + have under some circumstances. + I don't believe this is currently the case, but it constrains the implementatio +n. + +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "Proposed-Solution-locking-hook" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Implement a hook for locking methods, so that the caller can control the + calls to create and remove fcntl locks. + In this scenario, ctdbd would operate as follows: +\end_layout + +\begin_layout Enumerate +Call the normal API function, eg tdb_lockall(). +\end_layout + +\begin_layout Enumerate +When the lock callback comes in, check if the child has the lock. + Initially, this is always false. + If so, return 0. + Otherwise, try to obtain it in non-blocking mode. + If that fails, return EWOULDBLOCK. +\end_layout + +\begin_layout Enumerate +Release locks in the unlock callback as normal. +\end_layout + +\begin_layout Enumerate +If tdb_lockall() fails, see if we recorded a lock failure; if so, call the + child to repeat the operation. +\end_layout + +\begin_layout Enumerate +The child records what locks it obtains, and returns that information to + the parent. +\end_layout + +\begin_layout Enumerate +When the child has succeeded, goto 1. +\end_layout + +\begin_layout Standard +This is flexible enough to handle any potential locking scenario, even when + lock requirements change. + It can be optimized so that the parent does not release locks, just tells + the child which locks it doesn't need to obtain. +\end_layout + +\begin_layout Standard +It also keeps the complexity out of the API, and in ctdbd where it is needed. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. 
+\end_layout + +\begin_layout Subsection +tdb_chainlock Functions Expose Implementation +\end_layout + +\begin_layout Standard +tdb_chainlock locks some number of records, including the record indicated + by the given key. + This gives atomicity guarantees; no-one can start a transaction, alter, + read or delete that key while the lock is held. +\end_layout + +\begin_layout Standard +It also makes the same guarantee for any other key in the chain, which is + an internal implementation detail and potentially a cause for deadlock. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. + It would be nice to have an explicit single entry lock which affected no + other keys. + Unfortunately, this won't work for an entry which doesn't exist. + Thus while chainlock may be implemented more efficiently for the existing + case, it will still have overlap issues with the non-existing case. + So it is best to keep the current (lack of) guarantee about which records + will be affected to avoid constraining our implementation. +\end_layout + +\begin_layout Subsection +Signal Handling is Not Race-Free +\end_layout + +\begin_layout Standard +The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate + that the tdb locking code should return with a failure, rather than trying + again when a signal is received (and errno == EAGAIN). + This is usually used to implement timeouts. +\end_layout + +\begin_layout Standard +Unfortunately, this does not work in the case where the signal is received + before the tdb code enters the fcntl() call to place the lock: the code + will sleep within the fcntl() code, unaware that the signal wants it to + exit. + In the case of long timeouts, this does not happen in practice.
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The locking hooks proposed in +\begin_inset CommandInset ref +LatexCommand ref +reference "Proposed-Solution-locking-hook" + +\end_inset + + would allow the user to decide on whether to fail the lock acquisition + on a signal. + This allows the caller to choose their own compromise: they could narrow + the race by checking immediately before the fcntl call. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +It may be possible to make this race-free in some implementations by having + the signal handler alter the struct flock to make it invalid. + This will cause the fcntl() lock call to fail with EINVAL if the signal + occurs before the kernel is entered, otherwise EAGAIN. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +The API Uses Gratuitous Typedefs, Capitals +\end_layout + +\begin_layout Standard +typedefs are useful for providing source compatibility when types can differ + across implementations, or arguably in the case of function pointer definitions + which are hard for humans to parse. + Otherwise it is simply obfuscation and pollutes the namespace. +\end_layout + +\begin_layout Standard +Capitalization is usually reserved for compile-time constants and macros. +\end_layout + +\begin_layout Description +TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the + definition isn't visible to the API user anyway. +\end_layout + +\begin_layout Description +TDB_DATA There is no reason to use this over struct TDB_DATA; the struct + needs to be understood by the API user. +\end_layout + +\begin_layout Description +struct +\begin_inset space ~ +\end_inset + +TDB_DATA This would normally be called 'struct tdb_data'. 
+\end_layout + +\begin_layout Description +enum +\begin_inset space ~ +\end_inset + +TDB_ERROR Similarly, this would normally be enum tdb_error. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. + Introducing lower case variants would please pedants like myself, but if + it were done the existing ones should be kept. + There is little point forcing a purely cosmetic change upon tdb users. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "tdb_log_func-Doesnt-Take" + +\end_inset + +tdb_log_func Doesn't Take The Private Pointer +\end_layout + +\begin_layout Standard +For API compatibility reasons, the logging function needs to call tdb_get_loggin +g_private() to retrieve the pointer registered by the tdb_open_ex for logging. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +It should simply take an extra argument, since we are prepared to break + the API/ABI. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +Various Callback Functions Are Not Typesafe +\end_layout + +\begin_layout Standard +The callback functions in tdb_set_logging_function (after +\begin_inset CommandInset ref +LatexCommand ref +reference "tdb_log_func-Doesnt-Take" + +\end_inset + + is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check + all take void * and must internally convert it to the argument type they + were expecting. +\end_layout + +\begin_layout Standard +If this type changes, the compiler will not produce warnings on the callers, + since it only sees void *. 
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +With careful use of macros, we can create callback functions which give + a warning when used on gcc and the types of the callback and its private + argument differ. + Unsupported compilers will not give a warning, which is no worse than now. + In addition, the callbacks become clearer, as they need not use void * + for their parameter. +\end_layout + +\begin_layout Standard +See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic +\end_layout + +\begin_layout Standard +The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should + be cleared if the caller discovers it is the only process with the TDB + open. + However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not + be detected, so will have the TDB erased underneath them (usually resulting + in a crash). +\end_layout + +\begin_layout Standard +There is a similar issue on fork(); if the parent exits (or otherwise closes + the tdb) before the child calls tdb_reopen_all() to establish the lock + used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener + at that moment will believe it alone has opened the TDB and will erase + it. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove TDB_CLEAR_IF_FIRST. + Other workarounds are possible, but see +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" + +\end_inset + +. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. 
+\end_layout + +\begin_layout Subsection +Extending The Header Is Difficult +\end_layout + +\begin_layout Standard +We have reserved (zeroed) words in the TDB header, which can be used for + future features. + If the future features are compulsory, the version number must be updated + to prevent old code from accessing the database. + But if the future feature is optional, we have no way of telling if older + code is accessing the database or not. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain a +\begin_inset Quotes eld +\end_inset + +format variant +\begin_inset Quotes erd +\end_inset + + value (64-bit). + This is divided into two 32-bit parts: +\end_layout + +\begin_layout Enumerate +The lower part reflects the format variant understood by code accessing + the database. +\end_layout + +\begin_layout Enumerate +The upper part reflects the format variant you must understand to write + to the database (otherwise you can only open for reading). +\end_layout + +\begin_layout Standard +The latter field can only be written at creation time, the former should + be written under the OPEN_LOCK when opening the database for writing, if + the variant of the code is lower than the current lowest variant. +\end_layout + +\begin_layout Standard +This should allow backwards-compatible features to be added, and detection + if older code (which doesn't understand the feature) writes to the database. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +Record Headers Are Not Expandable +\end_layout + +\begin_layout Standard +If we later want to add (say) checksums on keys and data, it would require + another format change, which we'd like to avoid. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will + have a way for future changes to detect code which doesn't understand a + new format: the new code would write (say) a 1 at the tail, and thus if + there is no tail or the first byte is 0, we would know the extension is + not present on that record. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +TDB Does Not Use Talloc +\end_layout + +\begin_layout Standard +Many users of TDB (particularly Samba) use the talloc allocator, and thus + have to wrap TDB in a talloc context to use it conveniently. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The allocation within TDB is not complicated enough to justify the use of + talloc, and I am reluctant to force another (excellent) library on TDB + users. + Nonetheless a compromise is possible. + An attribute (see +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) can be added later to tdb_open() to provide an alternate allocation mechanism, + specifically for talloc but usable by any other allocator (which would + ignore the +\begin_inset Quotes eld +\end_inset + +context +\begin_inset Quotes erd +\end_inset + + argument). +\end_layout + +\begin_layout Standard +This would form a talloc hierarchy as expected, but the caller would still + have to attach a destructor to the tdb context returned from tdb_open to + close it. + All TDB_DATA fields would be children of the tdb_context, and the caller + would still have to manage them (using talloc_free() or talloc_steal()). +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred.
+\end_layout + +\begin_layout Section +Performance And Scalability Issues +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB_CLEAR_IF_FIRST-Imposes-Performance" + +\end_inset + +TDB_CLEAR_IF_FIRST Imposes Performance Penalty +\end_layout + +\begin_layout Standard +When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset + 4 (aka. + the ACTIVE_LOCK). + While these locks never conflict in normal tdb usage, they do add substantial + overhead for most fcntl lock implementations when the kernel scans to detect + if a lock conflict exists. + This is often a single linked list, making the time to acquire and release + a fcntl lock O(N) where N is the number of processes with the TDB open, + not the number actually doing work. +\end_layout + +\begin_layout Standard +In a Samba server it is common to have huge numbers of clients sitting idle, + and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +There is a flag to tdb_reopen_all() which is used for this optimization: + if the parent process will outlive the child, the child does not need the + ACTIVE_LOCK. + This is a workaround for this very performance issue. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove the flag. + It was a neat idea, but even trivial servers tend to know when they are + initializing for the first time and can simply unlink the old tdb at that + point. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +TDB Files Have a 4G Limit +\end_layout + +\begin_layout Standard +This seems to be becoming an issue (so much for +\begin_inset Quotes eld +\end_inset + +trivial +\begin_inset Quotes erd +\end_inset + +!), particularly for ldb. 
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +A new, incompatible TDB format which uses 64 bit offsets internally rather + than 32 bit as now. + For simplicity of endian conversion (which TDB does on the fly if required), + all values will be 64 bit on disk. + In practice, some upper bits may be used for other purposes, but at least + 56 bits will be available for file offsets. +\end_layout + +\begin_layout Standard +tdb_open() will automatically detect the old version, and even create them + if TDB_VERSION6 is specified to tdb_open. +\end_layout + +\begin_layout Standard +32 bit processes will still be able to access TDBs larger than 4G (assuming + that their off_t allows them to seek to 64 bits), they will gracefully + fall back as they fail to mmap. + This can happen already with large TDBs. +\end_layout + +\begin_layout Standard +Old versions of tdb will fail to open the new TDB files (since 28 August + 2009, commit 398d0c29290: prior to that any unrecognized file format would + be erased and initialized as a fresh tdb!) +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +TDB Records Have a 4G Limit +\end_layout + +\begin_layout Standard +This has not been a reported problem, and the API uses size_t which can + be 64 bit on 64 bit platforms. + However, other limits may have made such an issue moot. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Record sizes will be 64 bit, with an error returned on 32 bit platforms + which try to access such records (the current implementation would return + TDB_ERR_OOM in a similar case). + It seems unlikely that 32 bit keys will be a limitation, so the implementation + may not support this (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Records-Incur-A" + +\end_inset + +). 
+\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +Hash Size Is Determined At TDB Creation Time +\end_layout + +\begin_layout Standard +TDB contains a number of hash chains in the header; the number is specified + at creation time, and defaults to 131. + This is such a bottleneck on large databases (as each hash chain gets quite + long), that LDB uses 10,000 for this hash. + In general it is impossible to know what the 'right' answer is at database + creation time. +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Hash-Size-Solution" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +After comprehensive performance testing on various scalable hash variants +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying + because I was previously convinced that an expanding tree of hashes would + be very close to optimal. +\end_layout + +\end_inset + +, it became clear that it is hard to beat a straight linear hash table which + doubles in size when it reaches saturation. + Unfortunately, altering the hash table introduces serious locking complications +: the entire hash table needs to be locked to enlarge the hash table, and + others might be holding locks. + Particularly insidious are insertions done under tdb_chainlock. +\end_layout + +\begin_layout Standard +Thus an expanding layered hash will be used: an array of hash groups, with + each hash group exploding into pointers to lower hash groups once it fills, + turning into a hash tree. + This has implications for locking: we must lock the entire group in case + we need to expand it, yet we don't know how deep the tree is at that point. 
+\end_layout + +\begin_layout Standard +Note that bits from the hash table entries should be stolen to hold more + hash bits to reduce the penalty of collisions. + We can use the otherwise-unused lower 3 bits. + If we limit the size of the database to 64 exabytes, we can use the top + 8 bits of the hash entry as well. + These 11 bits would reduce false positives down to 1 in 2000 which is more + than we need: we can use one of the bits to indicate that the extra hash + bits are valid. + This means we can choose not to re-hash all entries when we expand a hash + group; simply use the next bits we need and mark them invalid. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB-Freelist-Is" + +\end_inset + +TDB Freelist Is Highly Contended +\end_layout + +\begin_layout Standard +TDB uses a single linked list for the free list. + Allocation occurs as follows, using heuristics which have evolved over + time: +\end_layout + +\begin_layout Enumerate +Get the free list lock for this whole operation. +\end_layout + +\begin_layout Enumerate +Multiply length by 1.25, so we always over-allocate by 25%. +\end_layout + +\begin_layout Enumerate +Set the slack multiplier to 1. +\end_layout + +\begin_layout Enumerate +Examine the current freelist entry: if it is > length but < the current + best case, remember it as the best case. +\end_layout + +\begin_layout Enumerate +Multiply the slack multiplier by 1.05. +\end_layout + +\begin_layout Enumerate +If our best fit so far is less than length * slack multiplier, return it. + The slack will be turned into a new free record if it's large enough. +\end_layout + +\begin_layout Enumerate +Otherwise, go onto the next freelist entry. 
+\end_layout + +\begin_layout Standard +Deleting a record occurs as follows: +\end_layout + +\begin_layout Enumerate +Lock the hash chain for this whole operation. +\end_layout + +\begin_layout Enumerate +Walk the chain to find the record, keeping the prev pointer offset. +\end_layout + +\begin_layout Enumerate +If max_dead is non-zero: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Walk the hash chain again and count the dead records. +\end_layout + +\begin_layout Enumerate +If it's more than max_dead, bulk free all the dead ones (similar to steps + 4 and below, but the lock is only obtained once). +\end_layout + +\begin_layout Enumerate +Simply mark this record as dead and return. + +\end_layout + +\end_deeper +\begin_layout Enumerate +Get the free list lock for the remainder of this operation. +\end_layout + +\begin_layout Enumerate +\begin_inset CommandInset label +LatexCommand label +name "right-merging" + +\end_inset + +Examine the following block to see if it is free; if so, enlarge the current + block and remove that block from the free list. + This was disabled, as removal from the free list was O(entries-in-free-list). +\end_layout + +\begin_layout Enumerate +Examine the preceding block to see if it is free: for this reason, each + block has a 32-bit tailer which indicates its length. + If it is free, expand it to cover our new block and return. +\end_layout + +\begin_layout Enumerate +Otherwise, prepend ourselves to the free list. +\end_layout + +\begin_layout Standard +Disabling right-merging (step +\begin_inset CommandInset ref +LatexCommand ref +reference "right-merging" + +\end_inset + +) causes fragmentation; the other heuristics proved insufficient to address + this, so the final answer to this was that when we expand the TDB file + inside a transaction commit, we repack the entire tdb.
+\end_layout + +\begin_layout Standard +The single list lock limits our allocation rate; due to the other issues + this is not currently seen as a bottleneck. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The first step is to remove all the current heuristics, as they obviously + interact, then examine them once the lock contention is addressed. +\end_layout + +\begin_layout Standard +The free list must be split to reduce contention. + Assuming perfect free merging, we can at most have 1 free list entry for + each entry. + This implies that the number of free lists is related to the size of the + hash table, but as it is rare to walk a large number of free list entries + we can use far fewer, say 1/32 of the number of hash buckets. +\end_layout + +\begin_layout Standard +It seems tempting to try to reuse the hash implementation which we use for + records here, but we have two ways of searching for free entries: for allocatio +n we search by size (and possibly zone) which produces too many clashes + for our hash table to handle well, and for coalescing we search by address. + Thus an array of doubly-linked free lists seems preferable. +\end_layout + +\begin_layout Standard +There are various benefits in using per-size free lists (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:TDB-Becomes-Fragmented" + +\end_inset + +) but it's not clear this would reduce contention in the common case where + all processes are allocating/freeing the same size. + Thus we almost certainly need to divide in other ways: the most obvious + is to divide the file into zones, and using a free list (or table of free + lists) for each. + This approximates address ordering. 
+\end_layout + +\begin_layout Standard +Unfortunately it is difficult to know what heuristics should be used to + determine zone sizes, and our transaction code relies on being able to + create a +\begin_inset Quotes eld +\end_inset + +recovery area +\begin_inset Quotes erd +\end_inset + + by simply appending to the file (difficult if it would need to create a + new zone header). + Thus we use a linked-list of free tables; currently we only ever create + one, but if there is more than one we choose one at random to use. + In future we may use heuristics to add new free tables on contention. + We only expand the file when all free tables are exhausted. +\end_layout + +\begin_layout Standard +The basic algorithm is as follows. + Freeing is simple: +\end_layout + +\begin_layout Enumerate +Identify the correct free list. +\end_layout + +\begin_layout Enumerate +Lock the corresponding list. +\end_layout + +\begin_layout Enumerate +Re-check the list (we didn't have a lock, sizes could have changed): relock + if necessary. +\end_layout + +\begin_layout Enumerate +Place the freed entry in the list. +\end_layout + +\begin_layout Standard +Allocation is a little more complicated, as we perform delayed coalescing + at this point: +\end_layout + +\begin_layout Enumerate +Pick a free table; usually the previous one. +\end_layout + +\begin_layout Enumerate +Lock the corresponding list. +\end_layout + +\begin_layout Enumerate +If the top entry is large enough, remove it from the list and return it. +\end_layout + +\begin_layout Enumerate +Otherwise, coalesce entries in the list. If there was no entry large enough, + unlock the list and try the next largest list. +\end_layout + +\begin_layout Enumerate +If no list has an entry which meets our needs, try the next free table. +\end_layout + +\begin_layout Enumerate +If no zone satisfies, expand the file.
+\end_layout + +\begin_layout Standard +This optimizes rapid insert/delete of free list entries by not coalescing + them all the time. + First-fit address ordering seems to be fairly good for keeping + fragmentation low (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:TDB-Becomes-Fragmented" + +\end_inset + +). + Note that address ordering does not need a tailer to coalesce, though if + we needed one we could have one cheaply: see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Records-Incur-A" + +\end_inset + +. + +\end_layout + +\begin_layout Standard +Each free entry has the free table number in the header: less than 255. + It also contains a doubly-linked list for easy deletion. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:TDB-Becomes-Fragmented" + +\end_inset + +TDB Becomes Fragmented +\end_layout + +\begin_layout Standard +Much of this is a result of allocation strategy +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute +xas.edu/pub/garbage/malloc/ismm98.ps +\end_layout + +\end_inset + + and deliberate hobbling of coalescing; internal fragmentation (aka overallocati +on) is deliberately set at 25%, and external fragmentation is only cured + by the decision to repack the entire db when a transaction commit needs + to enlarge the file. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The 25% overhead on allocation works in practice for ldb because indexes + tend to expand by one record at a time. + This internal fragmentation can be resolved by having an +\begin_inset Quotes eld +\end_inset + +expanded +\begin_inset Quotes erd +\end_inset + + bit in the header to note entries that have previously expanded, and allocating + more space for them.
+\end_layout + +\begin_layout Standard +There is a spectrum of possible solutions for external fragmentation: + one is to use a fragmentation-avoiding allocation strategy such as a best-fit + address-order allocator. + The other end of the spectrum would be to use a bump allocator (very fast + and simple) and simply repack the file when we reach the end. +\end_layout + +\begin_layout Standard +There are three problems with efficient fragmentation-avoiding allocators: + they are non-trivial, they tend to use a single free list for each size, + and there's no evidence that tdb allocation patterns will match those recorded + for general allocators (though it seems likely). +\end_layout + +\begin_layout Standard +Thus we don't spend too much effort on external fragmentation; we will be + no worse than the current code if we need to repack on occasion. + More effort is spent on reducing freelist contention, and reducing overhead. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Records-Incur-A" + +\end_inset + +Records Incur A 28-Byte Overhead +\end_layout + +\begin_layout Standard +Each TDB record has a header as follows: +\end_layout + +\begin_layout LyX-Code +struct tdb_record { +\end_layout + +\begin_layout LyX-Code +        tdb_off_t next; /* offset of the next record in the list */ +\end_layout + +\begin_layout LyX-Code +        tdb_len_t rec_len; /* total byte length of record */ +\end_layout + +\begin_layout LyX-Code +        tdb_len_t key_len; /* byte length of key */ +\end_layout + +\begin_layout LyX-Code +        tdb_len_t data_len; /* byte length of data */ +\end_layout + +\begin_layout LyX-Code +        uint32_t full_hash; /* the full 32 bit hash of the key */ +\end_layout + +\begin_layout LyX-Code +        uint32_t magic;   /* try to catch errors */ +\end_layout + +\begin_layout LyX-Code +        /* the following union is implied: +\end_layout + +\begin_layout LyX-Code +                union {
+\end_layout + +\begin_layout LyX-Code +                        char record[rec_len]; +\end_layout + +\begin_layout LyX-Code +                        struct { +\end_layout + +\begin_layout LyX-Code +                                char key[key_len]; +\end_layout + +\begin_layout LyX-Code +                                char data[data_len]; +\end_layout + +\begin_layout LyX-Code +                        } +\end_layout + +\begin_layout LyX-Code +                        uint32_t totalsize; (tailer) +\end_layout + +\begin_layout LyX-Code +                } +\end_layout + +\begin_layout LyX-Code +        */ +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +Naively, this would double to a 56-byte overhead on a 64 bit implementation. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We can use various techniques to reduce this for an allocated block: +\end_layout + +\begin_layout Enumerate +The 'next' pointer is not required, as we are using a flat hash table. +\end_layout + +\begin_layout Enumerate +'rec_len' can instead be expressed as an addition to key_len and data_len + (it accounts for wasted or overallocated length in the record). + Since the record length is always a multiple of 8, we can conveniently + fit it in 32 bits (representing up to 35 bits). +\end_layout + +\begin_layout Enumerate +'key_len' and 'data_len' can be reduced. + I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine + the two into one 64-bit field and use a 5 bit value which indicates at + what bit to divide the two. + Keys are unlikely to scale as fast as data, so I'm assuming a maximum key + size of 32 bits.
+\end_layout + +\begin_layout Enumerate +'full_hash' is used to avoid a memcmp on the +\begin_inset Quotes eld +\end_inset + +miss +\begin_inset Quotes erd +\end_inset + + case, but this is diminishing returns after a handful of bits (at 10 bits, + it reduces 99.9% of false memcmp). + As an aside, as the lower bits are already incorporated in the hash table + resolution, the upper bits should be used here. + Note that it's not clear that these bits will be a win, given the extra + bits in the hash table itself (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Hash-Size-Solution" + +\end_inset + +). +\end_layout + +\begin_layout Enumerate +'magic' does not need to be enlarged: it currently reflects one of 5 values + (used, free, dead, recovery, and unused_recovery). + It is useful for quick sanity checking however, and should not be eliminated. +\end_layout + +\begin_layout Enumerate +'tailer' is only used to coalesce free blocks (so a block to the right can + find the header to check if this block is free). + This can be replaced by a single 'free' bit in the header of the following + block (and the tailer only exists in free blocks). +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +This technique from Thomas Standish. + Data Structure Techniques. + Addison-Wesley, Reading, Massachusetts, 1980. +\end_layout + +\end_inset + + The current proposed coalescing algorithm doesn't need this, however. 
+\end_layout + +\begin_layout Standard +This produces a 16 byte used header like this: +\end_layout + +\begin_layout LyX-Code +struct tdb_used_record { +\end_layout + +\begin_layout LyX-Code +        uint32_t used_magic : 16, +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +                 key_data_divide: 5, +\end_layout + +\begin_layout LyX-Code +                 top_hash: 11; +\end_layout + +\begin_layout LyX-Code +        uint32_t extra_octets; +\end_layout + +\begin_layout LyX-Code +        uint64_t key_and_data_len; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +And a free record like this: +\end_layout + +\begin_layout LyX-Code +struct tdb_free_record { +\end_layout + +\begin_layout LyX-Code +        uint64_t free_magic: 8, +\end_layout + +\begin_layout LyX-Code +                   prev : 56; +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +        uint64_t free_table: 8, +\end_layout + +\begin_layout LyX-Code +                 total_length : 56; +\end_layout + +\begin_layout LyX-Code +        uint64_t next; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +Note that by limiting valid offsets to 56 bits, we can pack everything we + need into 3 64-bit words, meaning our minimum record size is 8 bytes. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete.
+\end_layout + +\begin_layout Subsection +Transaction Commit Requires 4 fdatasync +\end_layout + +\begin_layout Standard +The current transaction algorithm is: +\end_layout + +\begin_layout Enumerate +write_recovery_data(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +write_recovery_header(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +overwrite_with_new_data(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +remove_recovery_header(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Standard +On current ext3, each sync flushes all data to disk, so the next 3 syncs + are relatively inexpensive. + But this could become a performance bottleneck on other filesystems such + as ext4. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Neil Brown points out that this is overzealous, and only one sync is needed: +\end_layout + +\begin_layout Enumerate +Bundle the recovery data, a transaction counter and a strong checksum of + the new data. +\end_layout + +\begin_layout Enumerate +Strong checksum that whole bundle. +\end_layout + +\begin_layout Enumerate +Store the bundle in the database. +\end_layout + +\begin_layout Enumerate +Overwrite the oldest of the two recovery pointers in the header (identified + using the transaction counter) with the offset of this bundle. +\end_layout + +\begin_layout Enumerate +sync. +\end_layout + +\begin_layout Enumerate +Write the new data to the file. +\end_layout + +\begin_layout Standard +Checking for recovery means identifying the latest bundle with a valid checksum + and using the new data checksum to ensure that it has been applied. + This is more expensive than the current check, but need only be done at + open.
+ For running databases, a separate header field can be used to indicate + a transaction in progress; we need only check for recovery if this is set. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:TDB-Does-Not" + +\end_inset + +TDB Does Not Have Snapshot Support +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. + At some point you say +\begin_inset Quotes eld +\end_inset + +use a real database +\begin_inset Quotes erd +\end_inset + + (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). +\end_layout + +\begin_layout Standard +But as a thought experiment, if we implemented transactions to only overwrite + free entries (this is tricky: there must not be a header in each entry + which indicates whether it is free; that must instead be determined by + presence in metadata elsewhere), + and a pointer to the hash table, we could create an entirely new commit + without destroying existing data. + Then it would be easy to implement snapshots in a similar way. +\end_layout + +\begin_layout Standard +This would not allow arbitrary changes to the database, such as tdb_repack + does, and would require more space (since we have to preserve the current + and future entries at once). + If we used hash trees rather than one big hash table, we might only have + to rewrite some sections of the hash, too. +\end_layout + +\begin_layout Standard +We could then implement snapshots using a similar method, using multiple + different hash tables/free tables. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\begin_layout Subsection +Transactions Cannot Operate in Parallel +\end_layout + +\begin_layout Standard +This would be useless for ldb, as it hits the index records with just about + every update.
+ It would add significant complexity in resolving clashes, and cause + all transaction callers to write their code to loop in the case where the + transactions spuriously failed. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). + We could solve a small part of the problem by providing read-only transactions. + These would allow one write transaction to begin, but it could not commit + until all r/o transactions are done. + This would require a new RO_TRANSACTION_LOCK, which would be upgraded on + commit. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\begin_layout Subsection +Default Hash Function Is Suboptimal +\end_layout + +\begin_layout Standard +The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially + if we expand it to 64 bits), and works best when the hash bucket size is + a prime number (which also means a slow modulus). + In addition, it is highly predictable which could potentially lead to a + Denial of Service attack in some TDB uses. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The Jenkins lookup3 hash +\begin_inset Foot +status open + +\begin_layout Plain Layout +http://burtleburtle.net/bob/c/lookup3.c +\end_layout + +\end_inset + + is a fast and superbly-mixing hash. + It's used by the Linux kernel and almost everything else. + This has the particular properties that it takes an initial seed, and produces + two 32 bit hash numbers, which we can combine into a 64-bit hash. +\end_layout + +\begin_layout Standard +The seed should be created at tdb-creation time from some random source, + and placed in the header. + This is far from foolproof, but adds a little bit of protection against + hash bombing.
+\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "Reliable-Traversal-Adds" + +\end_inset + +Reliable Traversal Adds Complexity +\end_layout + +\begin_layout Standard +We lock a record during traversal iteration, and try to grab that lock in + the delete code. + If that grab on delete fails, we simply mark it deleted and continue onwards; + traversal checks for this condition and does the delete when it moves off + the record. +\end_layout + +\begin_layout Standard +If traversal terminates, the dead record may be left indefinitely. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove reliability guarantees; see +\begin_inset CommandInset ref +LatexCommand ref +reference "traverse-Proposed-Solution" + +\end_inset + +. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +Fcntl Locking Adds Overhead +\end_layout + +\begin_layout Standard +Placing a fcntl lock means a system call, as does removing one. + This is actually one reason why transactions can be faster (everything + is locked once at transaction start). + In the uncontended case, this overhead can theoretically be eliminated. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. +\end_layout + +\begin_layout Standard +We tried this before with spinlock support, in the early days of TDB, and + it didn't make much difference except in manufactured benchmarks. +\end_layout + +\begin_layout Standard +We could use spinlocks (with futex kernel support under Linux), but it means + that we lose automatic cleanup when a process dies with a lock. + There is a method of auto-cleanup under Linux, but it's not supported by + other operating systems. 
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes + on open, but that wouldn't help the normal case of one concurrent opener + dying. + Increasingly elaborate repair schemes could be considered, but they require + an ABI change (everyone must use them) anyway, so there's no need to do + this at the same time as everything else. +\end_layout + +\begin_layout Subsection +Some Transactions Don't Require Durability +\end_layout + +\begin_layout Standard +Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast) + usage, and occasionally empties the results into a transactional TDB. + This kind of usage prioritizes performance over durability: as long as + we are consistent, data can be lost. +\end_layout + +\begin_layout Standard +This would be more neatly implemented inside tdb: a +\begin_inset Quotes eld +\end_inset + +soft +\begin_inset Quotes erd +\end_inset + + transaction commit (ie. + syncless) which meant that data may be reverted on a crash. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. +\end_layout + +\begin_layout Standard +Unfortunately any transaction scheme which overwrites old data requires + a sync before that overwrite to avoid the possibility of corruption. +\end_layout + +\begin_layout Standard +It seems possible to use a scheme similar to that described in +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:TDB-Does-Not" + +\end_inset + +,where transactions are committed without overwriting existing data, and + an array of top-level pointers were available in the header. + If the transaction is +\begin_inset Quotes eld +\end_inset + +soft +\begin_inset Quotes erd +\end_inset + + then we would not need a sync at all: existing processes would pick up + the new hash table and free list and work with that. 
+\end_layout + +\begin_layout Standard +At some later point, a sync would allow recovery of the old data into the + free lists (perhaps when the array of top-level pointers filled). + On crash, tdb_open() would examine the array of top levels, and apply the + transactions until it encountered an invalid checksum. +\end_layout + +\begin_layout Subsection +Tracing Is Fragile, Replay Is External +\end_layout + +\begin_layout Standard +The current TDB has compile-time-enabled tracing code, but it often breaks + as it is not enabled by default. + In a similar way, the ctdb code has an external wrapper which does replay + tracing so it can coordinate cluster-wide transactions. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "replay-attribute" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tridge points out that an attribute can be later added to tdb_open (see + +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) to provide replay/trace hooks, which could become the basis for this and + future parallel transactions and snapshot support. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. 
+\end_layout + +\end_body +\end_document diff --git a/lib/tdb2/doc/design.lyx,v b/lib/tdb2/doc/design.lyx,v new file mode 100644 index 0000000000..13e6387f7f --- /dev/null +++ b/lib/tdb2/doc/design.lyx,v @@ -0,0 +1,4679 @@ +head	1.13; +access; +symbols; +locks; strict; +comment	@# @; + + +1.13 +date	2011.03.01.11.46.54;	author rusty;	state Exp; +branches; +next	1.12; + +1.12 +date	2010.12.01.12.20.49;	author rusty;	state Exp; +branches; +next	1.11; + +1.11 +date	2010.12.01.11.55.20;	author rusty;	state Exp; +branches; +next	1.10; + +1.10 +date	2010.09.14.00.33.57;	author rusty;	state Exp; +branches; +next	1.9; + +1.9 +date	2010.09.09.07.25.12;	author rusty;	state Exp; +branches; +next	1.8; + +1.8 +date	2010.09.02.02.29.05;	author rusty;	state Exp; +branches; +next	1.7; + +1.7 +date	2010.09.01.10.58.12;	author rusty;	state Exp; +branches; +next	1.6; + +1.6 +date	2010.08.02.00.21.43;	author rusty;	state Exp; +branches; +next	1.5; + +1.5 +date	2010.08.02.00.21.16;	author rusty;	state Exp; +branches; +next	1.4; + +1.4 +date	2010.05.10.13.09.11;	author rusty;	state Exp; +branches; +next	1.3; + +1.3 +date	2010.05.10.11.58.37;	author rusty;	state Exp; +branches; +next	1.2; + +1.2 +date	2010.05.10.05.35.13;	author rusty;	state Exp; +branches; +next	1.1; + +1.1 +date	2010.05.04.02.29.16;	author rusty;	state Exp; +branches; +next	; + + +desc +@First draft +@ + + +1.13 +log +@Thread-safe API +@ +text +@#LyX 1.6.7 created this file. 
For more info see http://www.lyx.org/ +\lyxformat 345 +\begin_document +\begin_header +\textclass article +\use_default_options true +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 + +\graphics default +\paperfontsize default +\use_hyperref false +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes true +\output_changes true +\author "Rusty Russell,,," +\author "" +\end_header + +\begin_body + +\begin_layout Title +TDB2: A Redesigning The Trivial DataBase +\end_layout + +\begin_layout Author +Rusty Russell, IBM Corporation +\end_layout + +\begin_layout Date +1-December-2010 +\end_layout + +\begin_layout Abstract +The Trivial DataBase on-disk format is 32 bits; with usage cases heading + towards the 4G limit, that must change. + This required breakage provides an opportunity to revisit TDB's other design + decisions and reassess them. +\end_layout + +\begin_layout Section +Introduction +\end_layout + +\begin_layout Standard +The Trivial DataBase was originally written by Andrew Tridgell as a simple + key/data pair storage system with the same API as dbm, but allowing multiple + readers and writers while being small enough (< 1000 lines of C) to include + in SAMBA. + The simple design created in 1999 has proven surprisingly robust and performant +, used in Samba versions 3 and 4 as well as numerous other projects. + Its useful life was greatly increased by the (backwards-compatible!) addition + of transaction support in 2005. 
+\end_layout + +\begin_layout Standard +The wider variety and greater demands of TDB-using code has lead to some + organic growth of the API, as well as some compromises on the implementation. + None of these, by themselves, are seen as show-stoppers, but the cumulative + effect is to a loss of elegance over the initial, simple TDB implementation. + Here is a table of the approximate number of lines of implementation code + and number of API functions at the end of each year: +\end_layout + +\begin_layout Standard +\begin_inset Tabular +<lyxtabular version="3" rows="12" columns="3"> +<features> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<row> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Year End +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +API Functions +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Lines of C Code Implementation +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +1999 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +13 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +1195 +\end_layout + +\end_inset +</cell> 
+</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2000 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +24 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +1725 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2001 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +32 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2228 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2002 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2481 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2003 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> 
+\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2552 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2004 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +40 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2584 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2005 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +38 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2647 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2006 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +52 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +3754 +\end_layout + +\end_inset +</cell> 
+</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2007 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +66 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +4398 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2008 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +71 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +4768 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2009 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +73 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +5715 +\end_layout + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_layout + +\begin_layout Standard +This review is an attempt to catalog and address all the known issues with + TDB and create solutions which address the problems without significantly + 
increasing complexity; all involved are far too aware of the dangers of + second system syndrome in rewriting a successful project like this. +\end_layout + +\begin_layout Section +API Issues +\end_layout + +\begin_layout Subsection +tdb_open_ex Is Not Expandable +\end_layout + +\begin_layout Standard +The tdb_open() call was expanded to tdb_open_ex(), which added an optional + hashing function and an optional logging function argument. + Additional arguments to open would require the introduction of a tdb_open_ex2 + call etc. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "attributes" + +\end_inset + + +\end_layout + +\begin_layout Standard +tdb_open() will take a linked-list of attributes: +\end_layout + +\begin_layout LyX-Code +enum tdb_attribute { +\end_layout + +\begin_layout LyX-Code +    TDB_ATTRIBUTE_LOG = 0, +\end_layout + +\begin_layout LyX-Code +    TDB_ATTRIBUTE_HASH = 1 +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_base { +\end_layout + +\begin_layout LyX-Code +    enum tdb_attribute attr; +\end_layout + +\begin_layout LyX-Code +    union tdb_attribute *next; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_log { +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ +\end_layout + +\begin_layout LyX-Code +    tdb_log_func log_fn; +\end_layout + +\begin_layout LyX-Code +    void *log_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_hash { +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ +\end_layout + +\begin_layout LyX-Code +    tdb_hash_func hash_fn; +\end_layout + +\begin_layout LyX-Code +    void *hash_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout 
LyX-Code +union tdb_attribute { +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_base base; +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_log log; +\end_layout + +\begin_layout LyX-Code +    struct tdb_attribute_hash hash; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +This allows future attributes to be added, even if this expands the size + of the union. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +tdb_traverse Makes Impossible Guarantees +\end_layout + +\begin_layout Standard +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it + was thought that it was important to guarantee that all records which exist + at the start and end of the traversal would be included, and no record + would be included twice. +\end_layout + +\begin_layout Standard +This adds complexity (see +\begin_inset CommandInset ref +LatexCommand ref +reference "Reliable-Traversal-Adds" + +\end_inset + +) and does not work anyway for records which are altered (in particular, + those which are expanded may be effectively deleted and re-added behind + the traversal). +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "traverse-Proposed-Solution" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Abandon the guarantee. + You will see every record if no changes occur during your traversal, otherwise + you will see some subset. + You can prevent changes by using a transaction or the locking API. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. + Delete-during-traverse will still delete every record, too (assuming no + other changes). 
+\end_layout + +\begin_layout Subsection +Nesting of Transactions Is Fraught +\end_layout + +\begin_layout Standard +TDB has alternated between allowing nested transactions and not allowing + them. + Various paths in the Samba codebase assume that transactions will nest, + and in a sense they can: the operation is only committed to disk when the + outer transaction is committed. + There are two problems, however: +\end_layout + +\begin_layout Enumerate +Canceling the inner transaction will cause the outer transaction commit + to fail, and will not undo any operations since the inner transaction began. + This problem is soluble with some additional internal code. +\end_layout + +\begin_layout Enumerate +An inner transaction commit can be cancelled by the outer transaction. + This is desirable in the way which Samba's database initialization code + uses transactions, but could be a surprise to any users expecting a successful + transaction commit to expose changes to others. +\end_layout + +\begin_layout Standard +The current solution is to specify the behavior at tdb_open(), with the + default currently that nested transactions are allowed. + This flag can also be changed at runtime. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Given the usage patterns, it seems that the +\begin_inset Quotes eld +\end_inset + +least-surprise +\begin_inset Quotes erd +\end_inset + + behavior of disallowing nested transactions should become the default. + Additionally, it seems the outer transaction is the only code which knows + whether inner transactions should be allowed, so a flag to indicate this + could be added to tdb_transaction_start. + However, this behavior can be simulated with a wrapper which uses tdb_add_flags +() and tdb_remove_flags(), so the API should not be expanded for this relatively +-obscure case. 
+\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard + +\change_deleted 0 1298979572 +Incomplete; nesting flag is still defined as per tdb1. +\change_inserted 0 1298979584 +Complete; the nesting flag has been removed. +\change_unchanged + +\end_layout + +\begin_layout Subsection +Incorrect Hash Function is Not Detected +\end_layout + +\begin_layout Standard +tdb_open_ex() allows the calling code to specify a different hash function + to use, but does not check that all other processes accessing this tdb + are using the same hash function. + The result is that records are missing from tdb_fetch(). +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain an example hash result (eg. + the hash of 0xdeadbeef), and tdb_open_ex() should check that the given + hash function produces the same answer, or fail the tdb_open call. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +tdb_set_max_dead/TDB_VOLATILE Expose Implementation +\end_layout + +\begin_layout Standard +In response to scalability issues with the free list ( +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB-Freelist-Is" + +\end_inset + +) two API workarounds have been incorporated in TDB: tdb_set_max_dead() + and the TDB_VOLATILE flag to tdb_open. + The latter actually calls the former with an argument of +\begin_inset Quotes eld +\end_inset + +5 +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +This code allows deleted records to accumulate without putting them in the + free list. + On delete we iterate through each chain and free them in a batch if there + are more than max_dead entries. + These are never otherwise recycled except as a side-effect of a tdb_repack. 
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +With the scalability problems of the freelist solved, this API can be removed. + The TDB_VOLATILE flag may still be useful as a hint that store and delete + of records will be at least as common as fetch in order to allow some internal + tuning, but initially will become a no-op. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. + TDB_VOLATILE still defined, but implementation should fail on unknown flags + to be future-proof. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB-Files-Cannot" + +\end_inset + +TDB Files Cannot Be Opened Multiple Times In The Same Process +\end_layout + +\begin_layout Standard +No process can open the same TDB twice; we check and disallow it. + This is an unfortunate side-effect of fcntl locks, which operate on a per-file + rather than per-file-descriptor basis, and do not nest. + Thus, closing any file descriptor on a file clears all the locks obtained + by this process, even if they were placed using a different file descriptor! +\end_layout + +\begin_layout Standard +Note that even if this were solved, deadlock could occur if operations were + nested: this is a more manageable programming error in most cases. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We could lobby POSIX to fix the perverse rules, or at least lobby Linux + to violate them so that the most common implementation does not have this + restriction. + This would be a generally good idea for other fcntl lock users. +\end_layout + +\begin_layout Standard +Samba uses a wrapper which hands out the same tdb_context to multiple callers + if this happens, and does simple reference counting. 
+ We should do this inside the tdb library, which already emulates lock nesting + internally; it would need to recognize when deadlock occurs within a single + process. + This would create a new failure mode for tdb operations (while we currently + handle locking failures, they are impossible in normal use and a process + encountering them can do little but give up). +\end_layout + +\begin_layout Standard +I do not see benefit in an additional tdb_open flag to indicate whether + re-opening is allowed, although there may be some benefit to adding a + call to detect when a tdb_context is shared, to allow others to create such + an API. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +TDB API Is Not POSIX Thread-safe +\end_layout + +\begin_layout Standard +The TDB API uses an error code which can be queried after an operation to + determine what went wrong. + This programming model does not work with threads, unless specific additional + guarantees are given by the implementation. + In addition, even otherwise-independent threads cannot open the same TDB + (as in +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB-Files-Cannot" + +\end_inset + +). +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Rearchitecting the API to include a tdb_errcode pointer would be a great + deal of churn +\change_inserted 0 1298979557 +, but fortunately most functions return 0 on success and -1 on error: we + can change these to return 0 on success and a negative error code on error, + and the API remains similar to previous. + The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA + pointer and return an error code. + It is also simpler to have tdb_nextkey replace its key argument in place, + freeing up any old .dptr.
+\end_layout + +\begin_layout Standard + +\change_deleted 0 1298979438 +; we are better to guarantee that the tdb_errcode is per-thread so the current + programming model can be maintained. +\end_layout + +\begin_layout Standard + +\change_deleted 0 1298979438 +This requires dynamic per-thread allocations, which is awkward with POSIX + threads (pthread_key_create space is limited and we cannot simply allocate + a key for every TDB). +\change_unchanged + +\end_layout + +\begin_layout Standard +Internal locking is required to make sure that fcntl locks do not overlap + between threads, and also that the global list of tdbs is maintained. +\end_layout + +\begin_layout Standard +The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe + version of the library, and otherwise no overhead will exist. + Alternatively, a hooking mechanism similar to that proposed for +\begin_inset CommandInset ref +LatexCommand ref +reference "Proposed-Solution-locking-hook" + +\end_inset + + could be used to enable pthread locking at runtime. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete +\change_inserted 0 1298979681 +; API has been changed but thread safety has not been implemented. +\change_deleted 0 1298979669 +. +\change_unchanged + +\end_layout + +\begin_layout Subsection +*_nonblock Functions And *_mark Functions Expose Implementation +\end_layout + +\begin_layout Standard +CTDB +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +Clustered TDB, see http://ctdb.samba.org +\end_layout + +\end_inset + + wishes to operate on TDB in a non-blocking manner. + This is currently done as follows: +\end_layout + +\begin_layout Enumerate +Call the _nonblock variant of an API function (eg. + tdb_lockall_nonblock). + If this fails: +\end_layout + +\begin_layout Enumerate +Fork a child process, and wait for it to call the normal variant (eg. + tdb_lockall). 
+\end_layout + +\begin_layout Enumerate +If the child succeeds, call the _mark variant to indicate we already have + the locks (eg. + tdb_lockall_mark). +\end_layout + +\begin_layout Enumerate +Upon completion, tell the child to release the locks (eg. + tdb_unlockall). +\end_layout + +\begin_layout Enumerate +Indicate to tdb that it should consider the locks removed (eg. + tdb_unlockall_mark). +\end_layout + +\begin_layout Standard +There are several issues with this approach. + Firstly, adding two new variants of each function clutters the API for + an obscure use, and so not all functions have three variants. + Secondly, it assumes that all paths of the functions ask for the same locks, + otherwise the parent process will have to get a lock which the child doesn't + have under some circumstances. + I don't believe this is currently the case, but it constrains the implementatio +n. + +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "Proposed-Solution-locking-hook" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Implement a hook for locking methods, so that the caller can control the + calls to create and remove fcntl locks. + In this scenario, ctdbd would operate as follows: +\end_layout + +\begin_layout Enumerate +Call the normal API function, eg tdb_lockall(). +\end_layout + +\begin_layout Enumerate +When the lock callback comes in, check if the child has the lock. + Initially, this is always false. + If so, return 0. + Otherwise, try to obtain it in non-blocking mode. + If that fails, return EWOULDBLOCK. +\end_layout + +\begin_layout Enumerate +Release locks in the unlock callback as normal. +\end_layout + +\begin_layout Enumerate +If tdb_lockall() fails, see if we recorded a lock failure; if so, call the + child to repeat the operation. +\end_layout + +\begin_layout Enumerate +The child records what locks it obtains, and returns that information to + the parent. 
+\end_layout + +\begin_layout Enumerate +When the child has succeeded, goto 1. +\end_layout + +\begin_layout Standard +This is flexible enough to handle any potential locking scenario, even when + lock requirements change. + It can be optimized so that the parent does not release locks, just tells + the child which locks it doesn't need to obtain. +\end_layout + +\begin_layout Standard +It also keeps the complexity out of the API, and in ctdbd where it is needed. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +tdb_chainlock Functions Expose Implementation +\end_layout + +\begin_layout Standard +tdb_chainlock locks some number of records, including the record indicated + by the given key. + This gave atomicity guarantees; no-one can start a transaction, alter, + read or delete that key while the lock is held. +\end_layout + +\begin_layout Standard +It also makes the same guarantee for any other key in the chain, which is + an internal implementation detail and potentially a cause for deadlock. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. + It would be nice to have an explicit single entry lock which affected no + other keys. + Unfortunately, this won't work for an entry which doesn't exist. + Thus while chainlock may be implemented more efficiently for the existing + case, it will still have overlap issues with the non-existing case. + So it is best to keep the current (lack of) guarantee about which records + will be affected to avoid constraining our implementation. +\end_layout + +\begin_layout Subsection +Signal Handling is Not Race-Free +\end_layout + +\begin_layout Standard +The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate + that the tdb locking code should return with a failure, rather than trying + again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts. +\end_layout + +\begin_layout Standard +Unfortunately, this does not work in the case where the signal is received + before the tdb code enters the fcntl() call to place the lock: the code + will sleep within the fcntl() code, unaware that the signal wants it to + exit. + In the case of long timeouts, this does not happen in practice. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The locking hooks proposed in +\begin_inset CommandInset ref +LatexCommand ref +reference "Proposed-Solution-locking-hook" + +\end_inset + + would allow the user to decide on whether to fail the lock acquisition + on a signal. + This allows the caller to choose their own compromise: they could narrow + the race by checking immediately before the fcntl call. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +It may be possible to make this race-free in some implementations by having + the signal handler alter the struct flock to make it invalid. + This will cause the fcntl() lock call to fail with EINVAL if the signal + occurs before the kernel is entered, otherwise EAGAIN. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +The API Uses Gratuitous Typedefs, Capitals +\end_layout + +\begin_layout Standard +typedefs are useful for providing source compatibility when types can differ + across implementations, or arguably in the case of function pointer definitions + which are hard for humans to parse. + Otherwise it is simply obfuscation and pollutes the namespace. +\end_layout + +\begin_layout Standard +Capitalization is usually reserved for compile-time constants and macros. +\end_layout + +\begin_layout Description +TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the + definition isn't visible to the API user anyway. 
+\end_layout + +\begin_layout Description +TDB_DATA There is no reason to use this over struct TDB_DATA; the struct + needs to be understood by the API user. +\end_layout + +\begin_layout Description +struct +\begin_inset space ~ +\end_inset + +TDB_DATA This would normally be called 'struct tdb_data'. +\end_layout + +\begin_layout Description +enum +\begin_inset space ~ +\end_inset + +TDB_ERROR Similarly, this would normally be enum tdb_error. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. + Introducing lower case variants would please pedants like myself, but if + it were done the existing ones should be kept. + There is little point forcing a purely cosmetic change upon tdb users. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "tdb_log_func-Doesnt-Take" + +\end_inset + +tdb_log_func Doesn't Take The Private Pointer +\end_layout + +\begin_layout Standard +For API compatibility reasons, the logging function needs to call tdb_get_loggin +g_private() to retrieve the pointer registered by the tdb_open_ex for logging. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +It should simply take an extra argument, since we are prepared to break + the API/ABI. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +Various Callback Functions Are Not Typesafe +\end_layout + +\begin_layout Standard +The callback functions in tdb_set_logging_function (after +\begin_inset CommandInset ref +LatexCommand ref +reference "tdb_log_func-Doesnt-Take" + +\end_inset + + is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check + all take void * and must internally convert it to the argument type they + were expecting. 
+\end_layout + +\begin_layout Standard +If this type changes, the compiler will not produce warnings on the callers, + since it only sees void *. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +With careful use of macros, we can create callback functions which give + a warning when used on gcc and the types of the callback and its private + argument differ. + Unsupported compilers will not give a warning, which is no worse than now. + In addition, the callbacks become clearer, as they need not use void * + for their parameter. +\end_layout + +\begin_layout Standard +See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic +\end_layout + +\begin_layout Standard +The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should + be cleared if the caller discovers it is the only process with the TDB + open. + However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not + be detected, so will have the TDB erased underneath them (usually resulting + in a crash). +\end_layout + +\begin_layout Standard +There is a similar issue on fork(); if the parent exits (or otherwise closes + the tdb) before the child calls tdb_reopen_all() to establish the lock + used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener + at that moment will believe it alone has opened the TDB and will erase + it. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove TDB_CLEAR_IF_FIRST. + Other workarounds are possible, but see +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" + +\end_inset + +. 
+\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard + +\change_deleted 0 1298979699 +Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented. +\change_inserted 0 1298979700 +Complete. +\change_unchanged + +\end_layout + +\begin_layout Subsection +Extending The Header Is Difficult +\end_layout + +\begin_layout Standard +We have reserved (zeroed) words in the TDB header, which can be used for + future features. + If the future features are compulsory, the version number must be updated + to prevent old code from accessing the database. + But if the future feature is optional, we have no way of telling if older + code is accessing the database or not. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain a +\begin_inset Quotes eld +\end_inset + +format variant +\begin_inset Quotes erd +\end_inset + + value (64-bit). + This is divided into two 32-bit parts: +\end_layout + +\begin_layout Enumerate +The lower part reflects the format variant understood by code accessing + the database. +\end_layout + +\begin_layout Enumerate +The upper part reflects the format variant you must understand to write + to the database (otherwise you can only open for reading). +\end_layout + +\begin_layout Standard +The latter field can only be written at creation time, the former should + be written under the OPEN_LOCK when opening the database for writing, if + the variant of the code is lower than the current lowest variant. +\end_layout + +\begin_layout Standard +This should allow backwards-compatible features to be added, and detection + if older code (which doesn't understand the feature) writes to the database. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. 
+\end_layout + +\begin_layout Subsection +Record Headers Are Not Expandable +\end_layout + +\begin_layout Standard +If we later want to add (say) checksums on keys and data, it would require + another format change, which we'd like to avoid. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We often have extra padding at the tail of a record. + If we ensure that the first byte (if any) of this padding is zero, we will + have a way for future changes to detect code which doesn't understand a + new format: the new code would write (say) a 1 at the tail, and thus if + there is no tail or the first byte is 0, we would know the extension is + not present on that record. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +TDB Does Not Use Talloc +\end_layout + +\begin_layout Standard +Many users of TDB (particularly Samba) use the talloc allocator, and thus + have to wrap TDB in a talloc context to use it conveniently. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The allocation within TDB is not complicated enough to justify the use of + talloc, and I am reluctant to force another (excellent) library on TDB + users. + Nonetheless a compromise is possible. + An attribute (see +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) can be added later to tdb_open() to provide an alternate allocation mechanism, + specifically for talloc but usable by any other allocator (which would + ignore the +\begin_inset Quotes eld +\end_inset + +context +\begin_inset Quotes erd +\end_inset + + argument). +\end_layout + +\begin_layout Standard +This would form a talloc hierarchy as expected, but the caller would still + have to attach a destructor to the tdb context returned from tdb_open to + close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller + would still have to manage them (using talloc_free() or talloc_steal()). +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\begin_layout Section +Performance And Scalability Issues +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB_CLEAR_IF_FIRST-Imposes-Performance" + +\end_inset + +TDB_CLEAR_IF_FIRST Imposes Performance Penalty +\end_layout + +\begin_layout Standard +When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset + 4 (aka. + the ACTIVE_LOCK). + While these locks never conflict in normal tdb usage, they do add substantial + overhead for most fcntl lock implementations when the kernel scans to detect + if a lock conflict exists. + This is often a single linked list, making the time to acquire and release + a fcntl lock O(N) where N is the number of processes with the TDB open, + not the number actually doing work. +\end_layout + +\begin_layout Standard +In a Samba server it is common to have huge numbers of clients sitting idle, + and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +There is a flag to tdb_reopen_all() which is used for this optimization: + if the parent process will outlive the child, the child does not need the + ACTIVE_LOCK. + This is a workaround for this very performance issue. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove the flag. + It was a neat idea, but even trivial servers tend to know when they are + initializing for the first time and can simply unlink the old tdb at that + point. 
+\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard + +\change_deleted 0 1298979837 +Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing. +\change_inserted 0 1298979837 +Complete. +\change_unchanged + +\end_layout + +\begin_layout Subsection +TDB Files Have a 4G Limit +\end_layout + +\begin_layout Standard +This seems to be becoming an issue (so much for +\begin_inset Quotes eld +\end_inset + +trivial +\begin_inset Quotes erd +\end_inset + +!), particularly for ldb. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +A new, incompatible TDB format which uses 64 bit offsets internally rather + than 32 bit as now. + For simplicity of endian conversion (which TDB does on the fly if required), + all values will be 64 bit on disk. + In practice, some upper bits may be used for other purposes, but at least + 56 bits will be available for file offsets. +\end_layout + +\begin_layout Standard +tdb_open() will automatically detect the old version, and even create them + if TDB_VERSION6 is specified to tdb_open. +\end_layout + +\begin_layout Standard +32 bit processes will still be able to access TDBs larger than 4G (assuming + that their off_t allows them to seek to 64 bits), they will gracefully + fall back as they fail to mmap. + This can happen already with large TDBs. +\end_layout + +\begin_layout Standard +Old versions of tdb will fail to open the new TDB files (since 28 August + 2009, commit 398d0c29290: prior to that any unrecognized file format would + be erased and initialized as a fresh tdb!) +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +TDB Records Have a 4G Limit +\end_layout + +\begin_layout Standard +This has not been a reported problem, and the API uses size_t which can + be 64 bit on 64 bit platforms. + However, other limits may have made such an issue moot. 
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Record sizes will be 64 bit, with an error returned on 32 bit platforms + which try to access such records (the current implementation would return + TDB_ERR_OOM in a similar case). + It seems unlikely that 32 bit keys will be a limitation, so the implementation + may not support this (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Records-Incur-A" + +\end_inset + +). +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +Hash Size Is Determined At TDB Creation Time +\end_layout + +\begin_layout Standard +TDB contains a number of hash chains in the header; the number is specified + at creation time, and defaults to 131. + This is such a bottleneck on large databases (as each hash chain gets quite + long), that LDB uses 10,000 for this hash. + In general it is impossible to know what the 'right' answer is at database + creation time. +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Hash-Size-Solution" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +After comprehensive performance testing on various scalable hash variants +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying + because I was previously convinced that an expanding tree of hashes would + be very close to optimal. +\end_layout + +\end_inset + +, it became clear that it is hard to beat a straight linear hash table which + doubles in size when it reaches saturation. + Unfortunately, altering the hash table introduces serious locking complications +: the entire hash table needs to be locked to enlarge the hash table, and + others might be holding locks. + Particularly insidious are insertions done under tdb_chainlock. 
+\end_layout + +\begin_layout Standard +Thus an expanding layered hash will be used: an array of hash groups, with + each hash group exploding into pointers to lower hash groups once it fills, + turning into a hash tree. + This has implications for locking: we must lock the entire group in case + we need to expand it, yet we don't know how deep the tree is at that point. +\end_layout + +\begin_layout Standard +Note that bits from the hash table entries should be stolen to hold more + hash bits to reduce the penalty of collisions. + We can use the otherwise-unused lower 3 bits. + If we limit the size of the database to 64 exabytes, we can use the top + 8 bits of the hash entry as well. + These 11 bits would reduce false positives down to 1 in 2000 which is more + than we need: we can use one of the bits to indicate that the extra hash + bits are valid. + This means we can choose not to re-hash all entries when we expand a hash + group; simply use the next bits we need and mark them invalid. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB-Freelist-Is" + +\end_inset + +TDB Freelist Is Highly Contended +\end_layout + +\begin_layout Standard +TDB uses a single linked list for the free list. + Allocation occurs as follows, using heuristics which have evolved over + time: +\end_layout + +\begin_layout Enumerate +Get the free list lock for this whole operation. +\end_layout + +\begin_layout Enumerate +Multiply length by 1.25, so we always over-allocate by 25%. +\end_layout + +\begin_layout Enumerate +Set the slack multiplier to 1. +\end_layout + +\begin_layout Enumerate +Examine the current freelist entry: if it is > length but < the current + best case, remember it as the best case. +\end_layout + +\begin_layout Enumerate +Multiply the slack multiplier by 1.05. 
+\end_layout + +\begin_layout Enumerate +If our best fit so far is less than length * slack multiplier, return it. + The slack will be turned into a new free record if it's large enough. +\end_layout + +\begin_layout Enumerate +Otherwise, go onto the next freelist entry. +\end_layout + +\begin_layout Standard +Deleting a record occurs as follows: +\end_layout + +\begin_layout Enumerate +Lock the hash chain for this whole operation. +\end_layout + +\begin_layout Enumerate +Walk the chain to find the record, keeping the prev pointer offset. +\end_layout + +\begin_layout Enumerate +If max_dead is non-zero: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Walk the hash chain again and count the dead records. +\end_layout + +\begin_layout Enumerate +If it's more than max_dead, bulk free all the dead ones (similar to steps + 4 and below, but the lock is only obtained once). +\end_layout + +\begin_layout Enumerate +Simply mark this record as dead and return. + +\end_layout + +\end_deeper +\begin_layout Enumerate +Get the free list lock for the remainder of this operation. +\end_layout + +\begin_layout Enumerate +\begin_inset CommandInset label +LatexCommand label +name "right-merging" + +\end_inset + +Examine the following block to see if it is free; if so, enlarge the current + block and remove that block from the free list. + This was disabled, as removal from the free list was O(entries-in-free-list). +\end_layout + +\begin_layout Enumerate +Examine the preceding block to see if it is free: for this reason, each + block has a 32-bit tailer which indicates its length. + If it is free, expand it to cover our new block and return. +\end_layout + +\begin_layout Enumerate +Otherwise, prepend ourselves to the free list.
+\end_layout + +\begin_layout Standard +Disabling right-merging (step +\begin_inset CommandInset ref +LatexCommand ref +reference "right-merging" + +\end_inset + +) causes fragmentation; the other heuristics proved insufficient to address + this, so the final answer to this was that when we expand the TDB file + inside a transaction commit, we repack the entire tdb. +\end_layout + +\begin_layout Standard +The single list lock limits our allocation rate; due to the other issues + this is not currently seen as a bottleneck. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The first step is to remove all the current heuristics, as they obviously + interact, then examine them once the lock contention is addressed. +\end_layout + +\begin_layout Standard +The free list must be split to reduce contention. + Assuming perfect free merging, we can at most have 1 free list entry for + each entry. + This implies that the number of free lists is related to the size of the + hash table, but as it is rare to walk a large number of free list entries + we can use far fewer, say 1/32 of the number of hash buckets. +\end_layout + +\begin_layout Standard +It seems tempting to try to reuse the hash implementation which we use for + records here, but we have two ways of searching for free entries: for allocatio +n we search by size (and possibly zone) which produces too many clashes + for our hash table to handle well, and for coalescing we search by address. + Thus an array of doubly-linked free lists seems preferable. +\end_layout + +\begin_layout Standard +There are various benefits in using per-size free lists (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:TDB-Becomes-Fragmented" + +\end_inset + +) but it's not clear this would reduce contention in the common case where + all processes are allocating/freeing the same size. 
+ Thus we almost certainly need to divide in other ways: the most obvious + is to divide the file into zones, and use a free list (or table of free + lists) for each. + This approximates address ordering. +\end_layout + +\begin_layout Standard +Unfortunately it is difficult to know what heuristics should be used to + determine zone sizes, and our transaction code relies on being able to + create a +\begin_inset Quotes eld +\end_inset + +recovery area +\begin_inset Quotes erd +\end_inset + + by simply appending to the file (difficult if it would need to create a + new zone header). + Thus we use a linked-list of free tables; currently we only ever create + one, but if there is more than one we choose one at random to use. + In future we may use heuristics to add new free tables on contention. + We only expand the file when all free tables are exhausted. +\end_layout + +\begin_layout Standard +The basic algorithm is as follows. + Freeing is simple: +\end_layout + +\begin_layout Enumerate +Identify the correct free list. +\end_layout + +\begin_layout Enumerate +Lock the corresponding list. +\end_layout + +\begin_layout Enumerate +Re-check the list (we didn't have a lock, sizes could have changed): relock + if necessary. +\end_layout + +\begin_layout Enumerate +Place the freed entry in the list. +\end_layout + +\begin_layout Standard +Allocation is a little more complicated, as we perform delayed coalescing + at this point: +\end_layout + +\begin_layout Enumerate +Pick a free table; usually the previous one. +\end_layout + +\begin_layout Enumerate +Lock the corresponding list. +\end_layout + +\begin_layout Enumerate +If the top entry is large enough, remove it from the list and return it. +\end_layout + +\begin_layout Enumerate +Otherwise, coalesce entries in the list. If there was no entry large enough, + unlock the list and try the next largest list. +\end_layout + +\begin_layout Enumerate +If no list has an entry which meets our needs, try the next free table.
+\end_layout + +\begin_layout Enumerate +If no zone satisfies, expand the file. +\end_layout + +\begin_layout Standard +This optimizes rapid insert/delete of free list entries by not coalescing + them all the time. + First-fit address ordering seems to be fairly good for keeping + fragmentation low (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:TDB-Becomes-Fragmented" + +\end_inset + +). + Note that address ordering does not need a tailer to coalesce, though if + we needed one we could have one cheaply: see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Records-Incur-A" + +\end_inset + +. + +\end_layout + +\begin_layout Standard +Each free entry has the free table number in the header: less than 255. + It also contains a doubly-linked list for easy deletion. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:TDB-Becomes-Fragmented" + +\end_inset + +TDB Becomes Fragmented +\end_layout + +\begin_layout Standard +Much of this is a result of allocation strategy +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute +xas.edu/pub/garbage/malloc/ismm98.ps +\end_layout + +\end_inset + + and deliberate hobbling of coalescing; internal fragmentation (aka overallocati +on) is deliberately set at 25%, and external fragmentation is only cured + by the decision to repack the entire db when a transaction commit needs + to enlarge the file. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The 25% overhead on allocation works in practice for ldb because indexes + tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an +\begin_inset Quotes eld +\end_inset + +expanded +\begin_inset Quotes erd +\end_inset + + bit in the header to note entries that have previously expanded, and allocating + more space for them. +\end_layout + +\begin_layout Standard +There is a spectrum of possible solutions for external fragmentation: + one is to use a fragmentation-avoiding allocation strategy such as a best-fit + address-order allocator. + The other end of the spectrum would be to use a bump allocator (very fast + and simple) and simply repack the file when we reach the end. +\end_layout + +\begin_layout Standard +There are three problems with efficient fragmentation-avoiding allocators: + they are non-trivial, they tend to use a single free list for each size, + and there's no evidence that tdb allocation patterns will match those recorded + for general allocators (though it seems likely). +\end_layout + +\begin_layout Standard +Thus we don't spend too much effort on external fragmentation; we will be + no worse than the current code if we need to repack on occasion. + More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Records-Incur-A" + +\end_inset + +Records Incur A 28-Byte Overhead +\end_layout + +\begin_layout Standard +Each TDB record has a header as follows: +\end_layout + +\begin_layout LyX-Code +struct tdb_record { +\end_layout + +\begin_layout LyX-Code +        tdb_off_t next; /* offset of the next record in the list */ +\end_layout + +\begin_layout LyX-Code +        tdb_len_t rec_len; /* total byte length of record */ +\end_layout + +\begin_layout LyX-Code +        tdb_len_t key_len; /* byte length of key */ +\end_layout + +\begin_layout LyX-Code +        tdb_len_t data_len; /* byte length of data */ +\end_layout + +\begin_layout LyX-Code +        uint32_t full_hash; /* the full 32 bit hash of the key */ +\end_layout + +\begin_layout LyX-Code +        uint32_t magic;   /* try to catch errors */ +\end_layout + +\begin_layout LyX-Code +        /* the following union is implied: +\end_layout + +\begin_layout LyX-Code +                union { +\end_layout + +\begin_layout LyX-Code +                        char record[rec_len]; +\end_layout + +\begin_layout LyX-Code +                        struct { +\end_layout + +\begin_layout LyX-Code +                                char key[key_len]; +\end_layout + +\begin_layout LyX-Code +                                char data[data_len]; +\end_layout + +\begin_layout LyX-Code +                        } +\end_layout + +\begin_layout LyX-Code +                        uint32_t totalsize; (tailer) +\end_layout + +\begin_layout LyX-Code +                } +\end_layout + +\begin_layout LyX-Code +        */ +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +Naively, this would double to a 56-byte overhead on a 64 bit implementation. 
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We can use various techniques to reduce this for an allocated block: +\end_layout + +\begin_layout Enumerate +The 'next' pointer is not required, as we are using a flat hash table. +\end_layout + +\begin_layout Enumerate +'rec_len' can instead be expressed as an addition to key_len and data_len + (it accounts for wasted or overallocated length in the record). + Since the record length is always a multiple of 8, we can conveniently + fit it in 32 bits (representing up to 35 bits). +\end_layout + +\begin_layout Enumerate +'key_len' and 'data_len' can be reduced. + I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine + the two into one 64-bit field and use a 5 bit value which indicates at + what bit to divide the two. + Keys are unlikely to scale as fast as data, so I'm assuming a maximum key + size of 32 bits. +\end_layout + +\begin_layout Enumerate +'full_hash' is used to avoid a memcmp on the +\begin_inset Quotes eld +\end_inset + +miss +\begin_inset Quotes erd +\end_inset + + case, but this is diminishing returns after a handful of bits (at 10 bits, + it reduces 99.9% of false memcmp). + As an aside, as the lower bits are already incorporated in the hash table + resolution, the upper bits should be used here. + Note that it's not clear that these bits will be a win, given the extra + bits in the hash table itself (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Hash-Size-Solution" + +\end_inset + +). +\end_layout + +\begin_layout Enumerate +'magic' does not need to be enlarged: it currently reflects one of 5 values + (used, free, dead, recovery, and unused_recovery). + It is useful for quick sanity checking however, and should not be eliminated. +\end_layout + +\begin_layout Enumerate +'tailer' is only used to coalesce free blocks (so a block to the right can + find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following + block (and the tailer only exists in free blocks). +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +This technique is from Thomas Standish. + Data Structure Techniques. + Addison-Wesley, Reading, Massachusetts, 1980. +\end_layout + +\end_inset + + The current proposed coalescing algorithm doesn't need this, however. +\end_layout + +\begin_layout Standard +This produces a 16 byte used header like this: +\end_layout + +\begin_layout LyX-Code +struct tdb_used_record { +\end_layout + +\begin_layout LyX-Code +        uint32_t used_magic : 16, +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +                 key_data_divide: 5, +\end_layout + +\begin_layout LyX-Code +                 top_hash: 11; +\end_layout + +\begin_layout LyX-Code +        uint32_t extra_octets; +\end_layout + +\begin_layout LyX-Code +        uint64_t key_and_data_len; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +And a free record like this: +\end_layout + +\begin_layout LyX-Code +struct tdb_free_record { +\end_layout + +\begin_layout LyX-Code +        uint64_t free_magic: 8, +\end_layout + +\begin_layout LyX-Code +                   prev : 56; +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +        uint64_t free_table: 8, +\end_layout + +\begin_layout LyX-Code +                 total_length : 56; +\end_layout + +\begin_layout LyX-Code +        uint64_t next; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291206079 + +\change_unchanged +Note that by limiting valid offsets to 56 bits, we can pack everything we + need into 3 64-bit words, meaning our minimum record size is 8 bytes. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete.
+\end_layout + +\begin_layout Subsection +Transaction Commit Requires 4 fdatasync +\end_layout + +\begin_layout Standard +The current transaction algorithm is: +\end_layout + +\begin_layout Enumerate +write_recovery_data(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +write_recovery_header(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +overwrite_with_new_data(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +remove_recovery_header(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Standard +On current ext3, each sync flushes all data to disk, so the next 3 syncs + are relatively expensive. + But this could become a performance bottleneck on other filesystems such + as ext4. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Neil Brown points out that this is overzealous, and only one sync is needed: +\end_layout + +\begin_layout Enumerate +Bundle the recovery data, a transaction counter and a strong checksum of + the new data. +\end_layout + +\begin_layout Enumerate +Strong checksum that whole bundle. +\end_layout + +\begin_layout Enumerate +Store the bundle in the database. +\end_layout + +\begin_layout Enumerate +Overwrite the oldest of the two recovery pointers in the header (identified + using the transaction counter) with the offset of this bundle. +\end_layout + +\begin_layout Enumerate +sync. +\end_layout + +\begin_layout Enumerate +Write the new data to the file. +\end_layout + +\begin_layout Standard +Checking for recovery means identifying the latest bundle with a valid checksum + and using the new data checksum to ensure that it has been applied. + This is more expensive than the current check, but need only be done at + open. 
+ For running databases, a separate header field can be used to indicate + a transaction in progress; we need only check for recovery if this is set. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:TDB-Does-Not" + +\end_inset + +TDB Does Not Have Snapshot Support +\end_layout + +\begin_layout Subsubsection +Proposed SolutionNone. + At some point you say +\begin_inset Quotes eld +\end_inset + +use a real database +\begin_inset Quotes erd +\end_inset + + (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). +\end_layout + +\begin_layout Standard +But as a thought experiment, if we implemented transactions to only overwrite + free entries (this is tricky: there must not be a header in each entry + which indicates whether it is free, but use of presence in metadata elsewhere), + and a pointer to the hash table, we could create an entirely new commit + without destroying existing data. + Then it would be easy to implement snapshots in a similar way. +\end_layout + +\begin_layout Standard +This would not allow arbitrary changes to the database, such as tdb_repack + does, and would require more space (since we have to preserve the current + and future entries at once). + If we used hash trees rather than one big hash table, we might only have + to rewrite some sections of the hash, too. +\end_layout + +\begin_layout Standard +We could then implement snapshots using a similar method, using multiple + different hash tables/free tables. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\begin_layout Subsection +Transactions Cannot Operate in Parallel +\end_layout + +\begin_layout Standard +This would be useless for ldb, as it hits the index records with just about + every update. 
+ It would add significant complexity in resolving clashes, and cause all + the transaction callers to write their code to loop in the case where the + transactions spuriously failed. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). + We could solve a small part of the problem by providing read-only transactions. + These would allow one write transaction to begin, but it could not commit + until all r/o transactions are done. + This would require a new RO_TRANSACTION_LOCK, which would be upgraded on + commit. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\begin_layout Subsection +Default Hash Function Is Suboptimal +\end_layout + +\begin_layout Standard +The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially + if we expand it to 64 bits), and works best when the hash bucket size is + a prime number (which also means a slow modulus). + In addition, it is highly predictable which could potentially lead to a + Denial of Service attack in some TDB uses. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The Jenkins lookup3 hash +\begin_inset Foot +status open + +\begin_layout Plain Layout +http://burtleburtle.net/bob/c/lookup3.c +\end_layout + +\end_inset + + is a fast and superbly-mixing hash. + It's used by the Linux kernel and almost everything else. + This has the particular properties that it takes an initial seed, and produces + two 32 bit hash numbers, which we can combine into a 64-bit hash. +\end_layout + +\begin_layout Standard +The seed should be created at tdb-creation time from some random source, + and placed in the header. + This is far from foolproof, but adds a little bit of protection against + hash bombing.
+\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "Reliable-Traversal-Adds" + +\end_inset + +Reliable Traversal Adds Complexity +\end_layout + +\begin_layout Standard +We lock a record during traversal iteration, and try to grab that lock in + the delete code. + If that grab on delete fails, we simply mark it deleted and continue onwards; + traversal checks for this condition and does the delete when it moves off + the record. +\end_layout + +\begin_layout Standard +If traversal terminates, the dead record may be left indefinitely. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove reliability guarantees; see +\begin_inset CommandInset ref +LatexCommand ref +reference "traverse-Proposed-Solution" + +\end_inset + +. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + +\begin_layout Subsection +Fcntl Locking Adds Overhead +\end_layout + +\begin_layout Standard +Placing a fcntl lock means a system call, as does removing one. + This is actually one reason why transactions can be faster (everything + is locked once at transaction start). + In the uncontended case, this overhead can theoretically be eliminated. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. +\end_layout + +\begin_layout Standard +We tried this before with spinlock support, in the early days of TDB, and + it didn't make much difference except in manufactured benchmarks. +\end_layout + +\begin_layout Standard +We could use spinlocks (with futex kernel support under Linux), but it means + that we lose automatic cleanup when a process dies with a lock. + There is a method of auto-cleanup under Linux, but it's not supported by + other operating systems. 
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes + on open, but that wouldn't help the normal case of one concurrent opener + dying. + Increasingly elaborate repair schemes could be considered, but they require + an ABI change (everyone must use them) anyway, so there's no need to do + this at the same time as everything else. +\end_layout + +\begin_layout Subsection +Some Transactions Don't Require Durability +\end_layout + +\begin_layout Standard +Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast) + usage, and occasionally empties the results into a transactional TDB. + This kind of usage prioritizes performance over durability: as long as + we are consistent, data can be lost. +\end_layout + +\begin_layout Standard +This would be more neatly implemented inside tdb: a +\begin_inset Quotes eld +\end_inset + +soft +\begin_inset Quotes erd +\end_inset + + transaction commit (ie. + syncless) which meant that data may be reverted on a crash. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. +\end_layout + +\begin_layout Standard +Unfortunately any transaction scheme which overwrites old data requires + a sync before that overwrite to avoid the possibility of corruption. +\end_layout + +\begin_layout Standard +It seems possible to use a scheme similar to that described in +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:TDB-Does-Not" + +\end_inset + +, where transactions are committed without overwriting existing data, and + an array of top-level pointers were available in the header. + If the transaction is +\begin_inset Quotes eld +\end_inset + +soft +\begin_inset Quotes erd +\end_inset + + then we would not need a sync at all: existing processes would pick up + the new hash table and free list and work with that.
+\end_layout + +\begin_layout Standard +At some later point, a sync would allow recovery of the old data into the + free lists (perhaps when the array of top-level pointers filled). + On crash, tdb_open() would examine the array of top levels, and apply the + transactions until it encountered an invalid checksum. +\end_layout + +\begin_layout Subsection +Tracing Is Fragile, Replay Is External +\end_layout + +\begin_layout Standard +The current TDB has compile-time-enabled tracing code, but it often breaks + as it is not enabled by default. + In a similar way, the ctdb code has an external wrapper which does replay + tracing so it can coordinate cluster-wide transactions. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "replay-attribute" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tridge points out that an attribute can be later added to tdb_open (see + +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) to provide replay/trace hooks, which could become the basis for this and + future parallel transactions and snapshot support. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + +\end_body +\end_document +@ + + +1.12 +log +@Add status, some fixes, linked freelists. 
+@ +text +@d53 1 +a53 7 + +\change_deleted 0 1291204535 +14-September +\change_inserted 0 1291204533 +1-December +\change_unchanged +-2010 +a580 2 +\change_inserted 0 1291204563 + +a583 2 + +\change_inserted 0 1291204572 +a587 2 + +\change_inserted 0 1291204573 +a588 2 +\change_unchanged + +a629 2 +\change_inserted 0 1291204588 + +a632 2 + +\change_inserted 0 1291204588 +a636 2 + +\change_inserted 0 1291204631 +a639 2 +\change_unchanged + +a693 2 +\change_inserted 0 1291204639 + +a696 2 + +\change_inserted 0 1291204640 +d702 1 +a702 1 +\change_inserted 0 1291204665 +d704 2 +a728 2 +\change_inserted 0 1291204671 + +a731 2 + +\change_inserted 0 1291204671 +a735 2 + +\change_inserted 0 1291204673 +a736 2 +\change_unchanged + +a780 2 +\change_inserted 0 1291204731 + +a783 2 + +\change_inserted 0 1291204732 +a787 2 + +\change_inserted 0 1291204779 +a790 2 +\change_unchanged + +a842 2 +\change_inserted 0 1291204830 + +a845 2 + +\change_inserted 0 1291204831 +a849 2 + +\change_inserted 0 1291204834 +a850 2 +\change_unchanged + +d879 9 +a887 2 + deal of churn; we are better to guarantee that the tdb_errcode is per-thread + so the current programming model can be maintained. +d891 9 +d903 2 +a922 2 +\change_inserted 0 1291204847 + +a925 2 + +\change_inserted 0 1291204847 +d930 5 +a934 3 + +\change_inserted 0 1291204852 +Incomplete. 
+a1051 2 +\change_inserted 0 1291204881 + +a1054 2 + +\change_inserted 0 1291204881 +a1058 2 + +\change_inserted 0 1291204885 +a1059 2 +\change_unchanged + +a1140 2 +\change_inserted 0 1291204898 + +a1143 2 + +\change_inserted 0 1291204898 +a1147 2 + +\change_inserted 0 1291204901 +a1148 2 +\change_unchanged + +a1224 2 +\change_inserted 0 1291204908 + +a1227 2 + +\change_inserted 0 1291204908 +a1231 2 + +\change_inserted 0 1291204908 +a1232 2 +\change_unchanged + +a1271 2 +\change_inserted 0 1291204917 + +a1274 2 + +\change_inserted 0 1291204917 +a1278 2 + +\change_inserted 0 1291204920 +a1279 2 +\change_unchanged + +a1316 2 +\change_inserted 0 1291204927 + +a1319 2 + +\change_inserted 0 1291204928 +d1325 1 +a1325 1 +\change_inserted 0 1291204942 +d1327 2 +a1381 2 +\change_inserted 0 1291205003 + +a1384 2 + +\change_inserted 0 1291205004 +a1388 2 + +\change_inserted 0 1291205007 +a1411 2 +\change_inserted 0 1291205019 + +a1414 2 + +\change_inserted 0 1291205019 +a1418 2 + +\change_inserted 0 1291205023 +a1419 2 +\change_unchanged + +a1465 2 +\change_inserted 0 1291205029 + +a1468 2 + +\change_inserted 0 1291205029 +a1472 2 + +\change_inserted 0 1291206020 +a1473 2 +\change_unchanged + +a1528 2 +\change_inserted 0 1291205043 + +a1531 2 + +\change_inserted 0 1291205043 +d1537 1 +a1537 1 +\change_inserted 0 1291205057 +d1539 2 +a1589 2 +\change_inserted 0 1291205062 + +a1592 2 + +\change_inserted 0 1291205062 +a1596 2 + +\change_inserted 0 1291205062 +a1597 2 +\change_unchanged + +a1626 2 +\change_inserted 0 1291205072 + +a1629 2 + +\change_inserted 0 1291205073 +a1633 2 + +\change_inserted 0 1291205073 +a1634 2 +\change_unchanged + +a1674 4 + +\change_deleted 0 1291204504 + +\change_unchanged +a1699 2 +\change_inserted 0 1291205079 + +a1702 2 + +\change_inserted 0 1291205080 +a1706 2 + +\change_inserted 0 1291205080 +a1707 2 +\change_unchanged + +a1833 2 +\change_inserted 0 1291205090 + +d1869 2 +a1870 7 + is to divide the file into zones, and using a free list (or 
+\change_inserted 0 1291205498 +table +\change_deleted 0 1291205497 +set +\change_unchanged + of free lists) for each. +a1871 2 +\change_inserted 0 1291205203 + +a1874 2 + +\change_inserted 0 1291205358 +a1890 21 +\change_unchanged + +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291205198 +Note that this means we need to split the free lists when we expand the + file; this is probably acceptable when we double the hash table size, since + that is such an expensive operation already. + In the case of increasing the file size, there is an optimization we can + use: if we use M in the formula above as the file size rounded up to the + next power of 2, we only need reshuffle free lists when the file size crosses + a power of 2 boundary, +\emph on +and +\emph default +reshuffling the free lists is trivial: we simply merge every consecutive + pair of free lists. +\change_unchanged + +d1899 1 +a1899 7 +Identify the correct +\change_inserted 0 1291205366 +free list +\change_deleted 0 1291205364 +zone +\change_unchanged +. +d1907 2 +a1908 7 +Re-check the +\change_inserted 0 1291205372 +list +\change_deleted 0 1291205371 +zone +\change_unchanged + (we didn't have a lock, sizes could have changed): relock if necessary. +d1912 1 +a1912 5 +Place the freed entry in the list +\change_deleted 0 1291205382 + for that zone +\change_unchanged +. +d1921 1 +a1921 15 +Pick a +\change_deleted 0 1291205403 +zone either the zone we last freed into, or based on a +\begin_inset Quotes eld +\end_inset + +random +\begin_inset Quotes erd +\end_inset + + number. +\change_inserted 0 1291205411 +free table; usually the previous one. +\change_unchanged + +a1925 10 +\change_deleted 0 1291205432 + +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1291205428 +Re-check the zone: relock if necessary. +\change_unchanged + +d1934 1 +a1934 7 + unlock the list and try the next +\change_inserted 0 1291205455 +largest list +\change_deleted 0 1291205452 +zone. 
+\change_inserted 0 1291205457 + +a1937 2 + +\change_inserted 0 1291205476 +a1938 2 +\change_unchanged + +a1966 2 +\change_inserted 0 1291205542 + +a1969 2 + +\change_inserted 0 1291205591 +a1971 70 +\change_unchanged + +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291205539 +I anticipate that the number of entries in each free zone would be small, + but it might be worth using one free entry to hold pointers to the others + for cache efficiency. +\change_unchanged + +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291205534 +\begin_inset CommandInset label +LatexCommand label +name "freelist-in-zone" + +\end_inset + +If we want to avoid locking complexity (enlarging the free lists when we + enlarge the file) we could place the array of free lists at the beginning + of each zone. + This means existing array lists never move, but means that a record cannot + be larger than a zone. + That in turn implies that zones should be variable sized (say, power of + 2), which makes the question +\begin_inset Quotes eld +\end_inset + +what zone is this record in? +\begin_inset Quotes erd +\end_inset + + much harder (and +\begin_inset Quotes eld +\end_inset + +pick a random zone +\begin_inset Quotes erd +\end_inset + +, but that's less common). + It could be done with as few as 4 bits from the record header. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +Using +\begin_inset Formula $2^{16+N*3}$ +\end_inset + +means 0 gives a minimal 65536-byte zone, 15 gives the maximal +\begin_inset Formula $2^{61}$ +\end_inset + + byte zone. + Zones range in factor of 8 steps. + Given the zone size for the zone the current record is in, we can determine + the start of the zone. 
+\end_layout + +\end_inset + + +\change_inserted 0 1291205139 + +d2218 1 +a2218 5 +        uint32_t +\change_inserted 0 1291205758 +used_ +\change_unchanged +magic : 16, +a2222 4 +\change_deleted 0 1291205693 +                 prev_is_free: 1, +\change_unchanged + +d2230 1 +a2230 7 +                 top_hash: 1 +\change_inserted 0 1291205704 +1 +\change_deleted 0 1291205704 +0 +\change_unchanged +; +d2254 1 +a2254 9 +        uint +\change_inserted 0 1291205725 +64 +\change_deleted 0 1291205723 +32 +\change_unchanged +_t +\change_inserted 0 1291205753 +free_magic: 8, +a2257 2 + +\change_inserted 0 1291205746 +a2262 24 +\change_deleted 0 1291205749 +free_magic; +\change_unchanged + +\end_layout + +\begin_layout LyX-Code +        uint64_t +\change_inserted 0 1291205786 +free_table: 8, +\end_layout + +\begin_layout LyX-Code + +\change_inserted 0 1291205788 + +\change_unchanged +total_length +\change_inserted 0 1291205792 + : 56 +\change_deleted 0 1291205790 +; +\change_unchanged + +d2266 1 +a2266 7 +        uint64_t +\change_deleted 0 1291205801 +prev, +\change_unchanged +next; +\change_deleted 0 1291205811 + +d2270 1 +a2270 3 + +\change_deleted 0 1291205811 +        ... +d2274 1 +a2274 5 + +\change_deleted 0 1291205808 +        uint64_t tailer +\change_unchanged +; +d2283 5 +a2287 16 +\change_deleted 0 1291205827 +We might want to take some bits from the used record's top_hash (and the + free record which has 32 bits of padding to spare anyway) if we use variable + sized zones. + See +\begin_inset CommandInset ref +LatexCommand ref +reference "freelist-in-zone" + +\end_inset + +. + +\change_inserted 0 1291205885 + Note that by limiting valid offsets to 56 bits, we can pack everything + we need into 3 64-byte words, meaning our minimum record size is 8 bytes. 
+a2290 2 + +\change_inserted 0 1291205886 +a2294 2 + +\change_inserted 0 1291205886 +a2295 2 +\change_unchanged + +a2385 2 +\change_inserted 0 1291205894 + +a2388 2 + +\change_inserted 0 1291205894 +a2392 2 + +\change_inserted 0 1291205902 +a2393 2 +\change_unchanged + +a2415 4 + +\change_deleted 0 1291204504 + +\change_unchanged +a2445 2 +\change_inserted 0 1291205910 + +a2448 2 + +\change_inserted 0 1291205910 +a2452 2 + +\change_inserted 0 1291205914 +a2453 2 +\change_unchanged + +a2485 2 +\change_inserted 0 1291205919 + +a2488 2 + +\change_inserted 0 1291205919 +a2492 2 + +\change_inserted 0 1291205922 +a2493 2 +\change_unchanged + +a2533 2 +\change_inserted 0 1291205929 + +a2536 2 + +\change_inserted 0 1291205929 +a2540 2 + +\change_inserted 0 1291205929 +a2541 2 +\change_unchanged + +a2578 2 +\change_inserted 0 1291205932 + +a2581 2 + +\change_inserted 0 1291205933 +a2585 2 + +\change_inserted 0 1291205933 +a2586 2 +\change_unchanged + +a2724 2 +\change_inserted 0 1291205944 + +a2727 2 + +\change_inserted 0 1291205945 +a2731 2 + +\change_inserted 0 1291205948 +a2732 2 +\change_unchanged + +@ + + +1.11 +log +@Merge changes +@ +text +@d53 7 +a59 1 +14-September-2010 +d587 16 +d644 18 +d716 16 +d753 16 +d813 18 +d883 16 +d953 16 +d1084 16 +d1181 16 +d1273 16 +d1328 16 +d1381 16 +d1447 19 +a1465 2 + if older code (which doesn't understand the feature) writes to the database.Reco +rd Headers Are Not Expandible +d1484 16 +d1546 16 +d1617 16 +d1680 16 +d1725 16 +d1810 16 +d1951 8 +a1958 3 +Proposed SolutionThe first step is to remove all the current heuristics, + as they obviously interact, then examine them once the lock contention + is addressed. +d1989 7 +a1995 2 + is to divide the file into zones, and using a free list (or set of free + lists) for each. +d1997 2 +d2002 25 +d2039 2 +d2049 7 +a2055 1 +Identify the correct zone. +d2063 7 +a2069 2 +Re-check the zone (we didn't have a lock, sizes could have changed): relock + if necessary. 
+d2073 5 +a2077 1 +Place the freed entry in the list for that zone. +d2086 3 +a2088 1 +Pick a zone either the zone we last freed into, or based on a +d2097 4 +d2105 2 +d2110 2 +d2113 2 +d2123 15 +a2137 1 + unlock the list and try the next zone. +d2166 11 +d2180 2 +d2185 2 +d2190 2 +d2223 1 +a2223 1 +status open +d2243 2 +d2491 5 +a2495 1 +        uint32_t magic : 16, +d2499 2 +d2502 2 +d2511 7 +a2517 1 +                 top_hash: 10; +d2541 29 +a2569 1 +        uint32_t free_magic; +d2573 11 +a2583 1 +        uint64_t total_length; +d2587 7 +a2593 1 +        uint64_t prev, next; +d2597 2 +d2603 5 +a2607 1 +        uint64_t tailer; +d2615 2 +d2628 18 +d2736 16 +d2808 16 +d2856 16 +d2912 16 +d2965 16 +d3119 16 +@ + + +1.10 +log +@Tracing attribute, talloc support. +@ +text +@d1 1 +a1 1 +#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ +d53 1 +a53 7 + +\change_deleted 0 1283307542 +26-July +\change_inserted 0 1284423485 +14-September +\change_unchanged +-2010 +a472 2 +\change_inserted 0 1284422789 + +a479 2 +\change_unchanged + +a838 2 + +\change_inserted 0 1284016998 +a846 2 +\change_unchanged + +a1194 2 +\change_inserted 0 1284015637 + +a1197 2 + +\change_inserted 0 1284015716 +a1201 2 + +\change_inserted 0 1284015906 +a1210 2 + +\change_inserted 0 1284015637 +a1214 2 + +\change_inserted 0 1284016114 +a1227 2 + +\change_inserted 0 1284016149 +a1232 2 + +\change_inserted 0 1284016639 +a1237 2 + +\change_inserted 0 1284016821 +a1243 2 + +\change_inserted 0 1284016803 +d1245 2 +a1246 9 + if older code (which doesn't understand the feature) writes to the database. 
+\change_deleted 0 1284016101 + +\end_layout + +\begin_layout Subsection + +\change_inserted 0 1284015634 +Record Headers Are Not Expandible +a1249 2 + +\change_inserted 0 1284015634 +a1254 2 + +\change_inserted 0 1284015634 +a1258 2 + +\change_inserted 0 1284422552 +a1267 2 + +\change_inserted 0 1284422568 +a1271 2 + +\change_inserted 0 1284422646 +a1276 2 + +\change_inserted 0 1284422656 +a1280 2 + +\change_inserted 0 1284423065 +a1305 2 + +\change_inserted 0 1284423042 +a1310 2 +\change_unchanged + +a1457 2 + +\change_inserted 0 1283336713 +a1463 2 + +\change_unchanged +d1482 2 +d1485 1 +a1485 51 +\change_deleted 0 1283307675 +There are three details which become important: +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1283307675 +On encountering a full bucket, we use the next bucket. +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1283307675 +Extra hash bits are stored with the offset, to reduce comparisons. +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1283307675 +A marker entry is used on deleting an entry. +\end_layout + +\begin_layout Standard + +\change_deleted 0 1283307675 +The doubling of the table must be done under a transaction; we will not + reduce it on deletion, so it will be an unusual case. + It will either be placed at the head (other entries will be moved out the + way so we can expand). + We could have a pointer in the header to the current hashtable location, + but that pointer would have to be read frequently to check for hashtable + moves. +\end_layout + +\begin_layout Standard + +\change_deleted 0 1283307675 +The locking for this is slightly more complex than the chained case; we + currently have one lock per bucket, and that means we would need to expand + the lock if we overflow to the next bucket. + The frequency of such collisions will effect our locking heuristics: we + can always lock more buckets than we need. 
+\end_layout + +\begin_layout Standard + +\change_deleted 0 1283307675 +One possible optimization is to only re-check the hash size on an insert + or a lookup miss. + +\change_inserted 0 1283307770 +a1492 2 + +\change_inserted 0 1283336187 +a1500 2 + +\change_inserted 0 1283336586 +a1510 2 +\change_unchanged + +d1636 3 +a1638 8 +Proposed Solution +\change_deleted 0 1283336858 + +\end_layout + +\begin_layout Standard +The first step is to remove all the current heuristics, as they obviously + interact, then examine them once the lock contention is addressed. +a1647 2 +\change_inserted 0 1283336910 + +a1650 2 + +\change_inserted 0 1283337052 +a1655 2 +\change_unchanged + +a1776 2 +\change_inserted 0 1283309850 + +a1779 2 + +\change_inserted 0 1283337216 +a1813 2 + +\change_inserted 0 1284424151 +a1825 2 +\change_unchanged + +a1830 2 +\change_unchanged + +a2031 2 + +\change_inserted 0 1283336739 +a2040 2 +\change_unchanged + +a2117 2 +\change_inserted 0 1283337133 + +a2120 2 + +\change_inserted 0 1283337139 +a2121 2 +\change_unchanged + +a2136 2 + +\change_inserted 0 1283337235 +a2147 2 +\change_unchanged + +d2251 1 +a2251 7 +Proposed Solution +\change_deleted 0 1284423472 + +\end_layout + +\begin_layout Standard +None. +d2261 1 +a2261 1 +\change_inserted 0 1284423891 +d2263 1 +a2263 4 +\change_deleted 0 1284423891 +. + +\change_inserted 0 1284423901 +a2271 2 +\change_unchanged + +a2293 2 +\change_inserted 0 1284423495 + +a2312 2 + +\change_inserted 0 1284424201 +d2321 1 +a2321 3 + +\change_unchanged +We could solve a small part of the problem by providing read-only transactions. +a2505 2 +\change_inserted 0 1284423555 + +a2508 2 + +\change_inserted 0 1284423617 +a2512 2 + +\change_inserted 0 1284423719 +a2519 2 + +\change_inserted 0 1284423864 +a2530 2 + +\change_inserted 0 1284423850 +a2540 2 +\change_unchanged + +@ + + +1.9 +log +@Extension mechanism. 
+@ +text +@d56 2 +a57 2 +\change_inserted 0 1284016854 +9-September +d479 11 +d1303 1 +a1303 1 +\change_inserted 0 1284016847 +d1310 56 +d1945 1 +a1945 1 +\change_inserted 0 1283310945 +d1956 2 +d2402 2 +d2416 4 +d2421 12 +d2455 2 +d2476 12 +d2673 47 +@ + + +1.8 +log +@Remove bogus footnote +@ +text +@d56 2 +a57 2 +\change_inserted 0 1283307544 +1-September +d838 12 +d1198 103 +@ + + +1.7 +log +@Moving hash table does not work. +@ +text +@a1436 12 +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout + +\change_inserted 0 1283336450 +If we make the hash offsets zone-relative, then this only restricts the + zone size, not the overall database size. +\end_layout + +\end_inset + +@ + + +1.6 +log +@Commit changes +@ +text +@d38 1 +a38 1 +\author "" +d53 7 +a59 1 +26-July-2010 +d1333 10 +d1361 3 +a1363 1 + There are three details which become important: +d1367 2 +d1373 2 +d1379 2 +d1385 2 +d1397 2 +d1407 2 +d1411 45 +d1582 2 +d1598 14 +d1733 62 +d1996 13 +d2086 10 +d2110 15 +a2124 1 +\begin_layout LyX-Code +@ + + +1.5 +log +@Soft transaction commit +@ +text +@d38 1 +a38 1 +\author "Rusty Russell,,," +a52 4 + +\change_deleted 0 1280141199 +10-May-2010 +\change_inserted 0 1280141202 +a53 2 +\change_unchanged + +a2028 2 + +\change_inserted 0 1280140902 +a2034 2 + +\change_unchanged +a2212 2 +\change_inserted 0 1280140661 + +a2215 2 + +\change_inserted 0 1280140703 +a2219 2 + +\change_inserted 0 1280708312 +a2226 2 + +\change_inserted 0 1280708400 +a2239 2 + +\change_inserted 0 1280140836 +a2243 2 + +\change_inserted 0 1280708255 +a2247 2 + +\change_inserted 0 1280708374 +a2252 2 + +\change_inserted 0 1280141181 +a2274 2 + +\change_inserted 0 1280141345 +@ + + +1.4 +log +@Merge changes +@ +text +@d38 1 +a38 1 +\author "" +d53 2 +d56 4 +d2035 10 +d2223 84 +@ + + +1.3 +log +@Transaction and freelist rethink. 
+@ +text +@d38 1 +a38 1 +\author "Rusty Russell,,," +d53 1 +a53 1 +27-April-2010 +d662 1 +a662 5 + behavior of disallowing +\change_inserted 0 1272940179 +nested +\change_unchanged +transactions should become the default. +a1210 2 +\change_inserted 0 1272944650 + +a1214 2 + +\change_inserted 0 1272944763 +a1218 2 +\change_unchanged + +a1223 2 +\change_unchanged + +a1301 2 + +\change_inserted 0 1273478114 +a1310 2 +\change_unchanged + +d1515 1 +a1515 11 +The free list +\change_deleted 0 1273469807 +should +\change_inserted 0 1273469810 +must +\change_unchanged + be split +\change_deleted 0 1273469815 +into multiple lists +\change_unchanged +to reduce contention. +a1520 2 +\change_inserted 0 1273470006 + +a1523 2 + +\change_inserted 0 1273492055 +a1539 2 + +\change_inserted 0 1273483888 +a1551 2 +\change_unchanged + +a1554 8 + +\change_deleted 0 1272942055 +There are various ways to organize these lisys, but because we want to be + able to quickly identify which free list an entry is in, and reduce the + number of locks required for merging, we will use zoning (eg. + each free list covers some fixed fraction of the file). + +\change_inserted 0 1273484187 +d1556 1 +a1556 7 + +\change_deleted 0 1273484194 +The algorithm for f +\change_inserted 0 1273484194 +F +\change_unchanged +reeing is simple: +d1560 1 +a1560 7 +Identify the correct +\change_deleted 0 1273482856 +free list +\change_inserted 0 1273482857 +zone +\change_unchanged +. +d1564 1 +a1564 7 +Lock the +\change_inserted 0 1273482895 +corresponding +\change_unchanged +list +\change_inserted 0 1273482863 +. +a1567 2 + +\change_inserted 0 1273482909 +d1573 1 +a1573 13 + +\change_deleted 0 1273482885 +, and p +\change_inserted 0 1273482888 +P +\change_unchanged +lace the freed entry +\change_deleted 0 1273492415 +at the head +\change_inserted 0 1273492415 +in the list for that zone +\change_unchanged +. 
+d1577 2 +a1578 7 +Allocation is a little more complicated, as we +\change_deleted 0 1273483240 +merge entries as we walk the list: +\change_inserted 0 1273484250 +perform delayed coalescing at this point: +\change_unchanged + +d1582 1 +a1582 19 +Pick a +\change_deleted 0 1273482955 +free list; +\change_inserted 0 1273482957 +zone +\change_unchanged + either the +\change_deleted 0 1273482962 +list +\change_inserted 0 1273482962 +zone +\change_unchanged + we last freed +\change_deleted 0 1273482966 +o +\change_inserted 0 1273482966 +i +\change_unchanged +nto, or based on a +d1594 1 +a1594 9 +Lock th +\change_inserted 0 1273482980 +e corresponding +\change_deleted 0 1273482973 +at +\change_unchanged + list. +\change_inserted 0 1273482982 + +a1597 2 + +\change_inserted 0 1273483084 +a1598 53 +\change_unchanged + +\end_layout + +\begin_layout Enumerate +If the top entry is +\change_deleted 0 1273492155 +well-sized, +\change_inserted 0 1273492159 +-large enough, +\change_unchanged +remove it from the list and return it. +\end_layout + +\begin_layout Enumerate +Otherwise, +\change_inserted 0 1273492206 +coalesce entries in the list. +\change_deleted 0 1273492200 +examine the entry to the right of it in the file. + If it is free: +\end_layout + +\begin_deeper +\begin_layout Enumerate + +\change_deleted 0 1273492200 +If that entry is in a different list, lock that list too. +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1273492200 +If we had to place a new lock, re-check that the entry is free. +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1273492200 +Remove that entry from its free list and expand this entry to cover it. +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1273485554 +Goto step 3. +\end_layout + +\end_deeper +\begin_layout Enumerate + +\change_inserted 0 1273485311 +If there was no entry large enough, unlock the list and try the next zone. 
+d1602 1 +a1602 5 + +\change_deleted 0 1273483646 +Repeat step 3 with each entry in the list. +\change_unchanged + +d1606 2 +a1607 5 + +\change_deleted 0 1273483668 +Unlock the list and repeat step 2 with the next list. +\change_unchanged + +d1611 1 +a1611 7 +If no +\change_deleted 0 1273483671 +list +\change_inserted 0 1273483671 +zone +\change_unchanged + satisfies, expand the file. +d1615 2 +a1616 9 +This optimizes rapid insert/delete of free list entries +\change_inserted 0 1273485794 + by not coalescing them all the time. +\change_deleted 0 1273483685 +, and allows us to get rid of the tailer altogether +\change_unchanged +. + +\change_inserted 0 1273492299 +a1638 39 + +\change_deleted 0 1273476840 +The question of +\begin_inset Quotes eld +\end_inset + +well-sized +\begin_inset Quotes erd +\end_inset + + free entries is more difficult: the 25% overhead works in practice for + ldb because indexes tend to expand by one record at a time. + This can be resolved by having an +\begin_inset Quotes eld +\end_inset + +expanded +\begin_inset Quotes erd +\end_inset + + bit in the header to note entries that have previously expanded, and allocating + more space for them. + Whether the +\begin_inset Quotes eld +\end_inset + +increasing slack +\begin_inset Quotes erd +\end_inset + + algorithm should be implemented or first-fit used is still unknown: we + will determine this once these other ideas are implemented. 
+\change_inserted 0 1273483750 + +\end_layout + +\begin_layout Standard + +\change_inserted 0 1273492450 +a1644 2 + +\change_inserted 0 1273470441 +a1654 2 + +\change_inserted 0 1273476556 +a1659 2 + +\change_inserted 0 1273470423 +a1661 2 +\change_unchanged + +a1672 2 + +\change_inserted 0 1273476847 +a1676 2 + +\change_inserted 0 1273476886 +a1691 2 + +\change_inserted 0 1273477233 +a1699 2 + +\change_inserted 0 1273477534 +a1706 2 + +\change_inserted 0 1273482700 +a1712 2 + +\change_inserted 0 1273478079 +a1722 2 + +\change_inserted 0 1273477839 +a1726 2 + +\change_inserted 0 1273477925 +a1730 2 + +\change_inserted 0 1273477925 +a1734 2 + +\change_inserted 0 1273477925 +a1738 2 + +\change_inserted 0 1273477925 +a1742 2 + +\change_inserted 0 1273477925 +a1746 2 + +\change_inserted 0 1273477925 +a1750 2 + +\change_inserted 0 1273477925 +a1754 2 + +\change_inserted 0 1273477925 +a1758 2 + +\change_inserted 0 1273477925 +a1762 2 + +\change_inserted 0 1273477925 +a1766 2 + +\change_inserted 0 1273477925 +a1770 2 + +\change_inserted 0 1273477925 +a1774 2 + +\change_inserted 0 1273477925 +a1778 2 + +\change_inserted 0 1273477925 +a1782 2 + +\change_inserted 0 1273477925 +a1786 2 + +\change_inserted 0 1273477925 +a1790 2 + +\change_inserted 0 1273477925 +a1794 2 + +\change_inserted 0 1273477925 +a1798 2 + +\change_inserted 0 1273492522 +a1802 2 + +\change_inserted 0 1273492530 +a1806 2 + +\change_inserted 0 1273492546 +a1810 2 + +\change_inserted 0 1273478239 +a1814 2 + +\change_inserted 0 1273479960 +a1821 2 + +\change_inserted 0 1273480265 +a1830 2 + +\change_inserted 0 1273480354 +a1845 2 + +\change_inserted 0 1273478968 +a1851 2 + +\change_inserted 0 1273492604 +a1859 2 + +\change_inserted 0 1273479572 +a1862 2 +\change_unchanged + +a1870 2 + +\change_inserted 0 1273480282 +a1874 2 + +\change_inserted 0 1273478931 +a1878 2 + +\change_inserted 0 1273481549 +a1882 2 + +\change_inserted 0 1273481557 +a1886 2 + +\change_inserted 0 1273480307 +a1890 2 + +\change_inserted 
0 1273480335 +a1894 2 + +\change_inserted 0 1273479897 +a1898 2 + +\change_inserted 0 1273479653 +a1902 2 + +\change_inserted 0 1273480371 +a1906 2 + +\change_inserted 0 1273480464 +a1910 2 + +\change_inserted 0 1273480399 +a1914 2 + +\change_inserted 0 1273480425 +a1918 2 + +\change_inserted 0 1273480453 +a1922 2 + +\change_inserted 0 1273480455 +a1926 2 + +\change_inserted 0 1273480450 +a1930 2 + +\change_inserted 0 1273480452 +a1935 2 +\change_inserted 0 1273478830 + +a1942 5 + +\change_deleted 0 1273481604 +In theory, we could get away with 2: one after we write the new data, and + one to somehow atomically change over to it. +\change_inserted 0 1273481632 +a1946 2 + +\change_inserted 0 1273481724 +a1950 2 + +\change_inserted 0 1273481713 +a1954 2 + +\change_inserted 0 1273481717 +a1958 2 + +\change_inserted 0 1273481730 +a1962 2 + +\change_inserted 0 1273481736 +a1966 2 + +\change_inserted 0 1273481744 +a1970 2 + +\change_inserted 0 1273481748 +a1974 2 + +\change_inserted 0 1273482185 +a1978 2 + +\change_inserted 0 1273482259 +a1989 50 + +\change_deleted 0 1273481848 +None. + Trying to rewrite the transaction code is a separate experiment, which + I encourage someone else to do. + At some point you say +\begin_inset Quotes eld +\end_inset + +use a real database +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard + +\change_deleted 0 1273481848 +But as a thought experiment: +\change_unchanged + +\end_layout + +\begin_layout Standard + +\change_deleted 0 1273481788 +Say there was a pointer in the header which said where the hash table and + free list tables were, and that no blocks were labeled with whether they + were free or not (it had to be derived from what list they were in). + We could create new hash table and free list in some free space, and populate + it as we want the post-committed state to look. + Then we sync, then we switch the offset in the header, then we sync again. 
+\end_layout + +\begin_layout Standard + +\change_deleted 0 1273481788 +This would not allow arbitrary changes to the database, such as tdb_repack + does, and would require more space (since we have to preserve the current + and future entries at once). + If we used hash trees rather than one big hash table, we might only have + to rewrite some sections of the hash, too. +\change_inserted 0 1273481854 + +\end_layout + +\begin_layout Standard + +\change_inserted 0 1273482102 +a1993 2 + +\change_inserted 0 1273482061 +a1998 2 + +\change_inserted 0 1273482063 +a2002 2 + +\change_inserted 0 1273482072 +a2006 2 + +\change_inserted 0 1273482139 +a2011 2 + +\change_inserted 0 1273482364 +a2015 2 + +\change_inserted 0 1273482163 +a2019 2 + +\change_inserted 0 1273482493 +a2037 2 + +\change_inserted 0 1273482536 +a2046 2 +\change_unchanged + +a2049 2 + +\change_inserted 0 1273482641 +a2058 2 + +\change_inserted 0 1273481827 +d2067 2 +a2068 11 +We could +\change_inserted 0 1273481829 +then +\change_unchanged +implement snapshots using a similar method +\change_deleted 0 1273481838 + to the above, only +\change_inserted 0 1273481840 +, +\change_unchanged + using multiple different hash tables/free tables. +@ + + +1.2 +log +@After first feedback (Ronnie & Volker) +@ +text +@d1314 13 +d1531 11 +a1541 1 +The free list should be split into multiple lists to reduce contention. +d1547 39 +d1596 7 +d1604 1 +a1604 1 +The algorithm for freeing is simple: +d1608 7 +a1614 1 +Identify the correct free list. +d1618 30 +a1647 1 +Lock the list, and place the freed entry at the head. +d1651 7 +a1657 2 +Allocation is a little more complicated, as we merge entries as we walk + the list: +d1661 19 +a1679 1 +Pick a free list; either the list we last freed onto, or based on a +d1691 17 +a1707 1 +Lock that list. +d1711 7 +a1717 1 +If the top entry is well-sized, remove it from the list and return it. +d1721 5 +a1725 1 +Otherwise, examine the entry to the right of it in the file. 
+d1731 2 +d1737 2 +d1743 2 +d1749 2 +d1756 8 +d1765 2 +d1770 2 +d1773 2 +d1778 7 +a1784 1 +If no list satisfies, expand the file. +d1788 28 +a1815 2 +This optimizes rapid insert/delete of free list entries, and allows us to + get rid of the tailer altogether. +d1819 2 +d1851 1 +a1851 1 +\change_inserted 0 1272941474 +d1857 303 +a2159 18 +\change_inserted 0 1272942759 +There are various ways to organize these lists, but because we want to be + able to quickly identify which free list an entry is in, and reduce the + number of locks required for merging, we will use zoning (eg. + each of the N free lists in a tdb file of size M covers a fixed fraction + M/N). + Note that this means we need to reshuffle the free lists when we expand + the file; this is probably acceptable when we double the hash table size, + since that is such an expensive operation already. + In the case of increasing the file size, there is an optimization we can + use: if we use M in the formula above as the file size rounded up to the + next power of 2, we only need reshuffle free lists when the file size crosses + a power of 2 boundary, +\emph on +and +\emph default +reshuffling the free lists is trivial: we simply merge every consecutive + pair of free lists. +d2164 107 +d2276 2 +d2280 59 +d2346 2 +d2363 2 +d2366 2 +d2371 2 +d2382 2 +d2389 57 +d2458 13 +d2474 32 +a2505 2 +We could implement snapshots using a similar method to the above, only using + multiple different hash tables/free tables. +@ + + +1.1 +log +@Initial revision +@ +text +@d1 1 +a1 1 +#LyX 1.6.4 created this file. For more info see http://www.lyx.org/ +d36 3 +a38 3 +\tracking_changes false +\output_changes false +\author "" +d662 5 +a666 1 + behavior of disallowing transactions should become the default. 
+d1215 21 +d1527 2 +d1533 3 +a1535 1 + The algorithm for freeing is simple: +d1642 26 +@ diff --git a/lib/tdb2/doc/design.pdf b/lib/tdb2/doc/design.pdf Binary files differnew file mode 100644 index 0000000000..558dc1f8c2 --- /dev/null +++ b/lib/tdb2/doc/design.pdf diff --git a/lib/tdb2/doc/design.txt b/lib/tdb2/doc/design.txt new file mode 100644 index 0000000000..bd2ffde4db --- /dev/null +++ b/lib/tdb2/doc/design.txt @@ -0,0 +1,1258 @@ +TDB2: A Redesigning The Trivial DataBase + +Rusty Russell, IBM Corporation + +1-December-2010 + +Abstract + +The Trivial DataBase on-disk format is 32 bits; with usage cases +heading towards the 4G limit, that must change. This required +breakage provides an opportunity to revisit TDB's other design +decisions and reassess them. + +1 Introduction + +The Trivial DataBase was originally written by Andrew Tridgell as +a simple key/data pair storage system with the same API as dbm, +but allowing multiple readers and writers while being small +enough (< 1000 lines of C) to include in SAMBA. The simple design +created in 1999 has proven surprisingly robust and performant, +used in Samba versions 3 and 4 as well as numerous other +projects. Its useful life was greatly increased by the +(backwards-compatible!) addition of transaction support in 2005. + +The wider variety and greater demands of TDB-using code has led +to some organic growth of the API, as well as some compromises on +the implementation. None of these, by themselves, are seen as +show-stoppers, but the cumulative effect is a loss of elegance +over the initial, simple TDB implementation. 
Here is a table of +the approximate number of lines of implementation code and number +of API functions at the end of each year: + + ++-----------+----------------+--------------------------------+ +| Year End  | API Functions  | Lines of C Code Implementation | ++-----------+----------------+--------------------------------+ ++-----------+----------------+--------------------------------+ +|   1999    |      13        |              1195              | ++-----------+----------------+--------------------------------+ +|   2000    |      24        |              1725              | ++-----------+----------------+--------------------------------+ +|   2001    |      32        |              2228              | ++-----------+----------------+--------------------------------+ +|   2002    |      35        |              2481              | ++-----------+----------------+--------------------------------+ +|   2003    |      35        |              2552              | ++-----------+----------------+--------------------------------+ +|   2004    |      40        |              2584              | ++-----------+----------------+--------------------------------+ +|   2005    |      38        |              2647              | ++-----------+----------------+--------------------------------+ +|   2006    |      52        |              3754              | ++-----------+----------------+--------------------------------+ +|   2007    |      66        |              4398              | ++-----------+----------------+--------------------------------+ +|   2008    |      71        |              4768              | ++-----------+----------------+--------------------------------+ +|   2009    |      73        |              5715              | ++-----------+----------------+--------------------------------+ + + +This review is an attempt to catalog and address all the known +issues with TDB and create solutions which address the problems +without significantly increasing 
complexity; all involved are far +too aware of the dangers of second system syndrome in rewriting a +successful project like this. + +2 API Issues + +2.1 tdb_open_ex Is Not Expandable + +The tdb_open() call was expanded to tdb_open_ex(), which added an +optional hashing function and an optional logging function +argument. Additional arguments to open would require the +introduction of a tdb_open_ex2 call etc. + +2.1.1 Proposed Solution<attributes> + +tdb_open() will take a linked-list of attributes: + +enum tdb_attribute { + +    TDB_ATTRIBUTE_LOG = 0, + +    TDB_ATTRIBUTE_HASH = 1 + +}; + +struct tdb_attribute_base { + +    enum tdb_attribute attr; + +    union tdb_attribute *next; + +}; + +struct tdb_attribute_log { + +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG +*/ + +    tdb_log_func log_fn; + +    void *log_private; + +}; + +struct tdb_attribute_hash { + +    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH +*/ + +    tdb_hash_func hash_fn; + +    void *hash_private; + +}; + +union tdb_attribute { + +    struct tdb_attribute_base base; + +    struct tdb_attribute_log log; + +    struct tdb_attribute_hash hash; + +}; + +This allows future attributes to be added, even if this expands +the size of the union. + +2.1.2 Status + +Complete. + +2.2 tdb_traverse Makes Impossible Guarantees + +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, +and it was thought that it was important to guarantee that all +records which exist at the start and end of the traversal would +be included, and no record would be included twice. + +This adds complexity (see[Reliable-Traversal-Adds]) and does not +work anyway for records which are altered (in particular, those +which are expanded may be effectively deleted and re-added behind +the traversal). + +2.2.1 <traverse-Proposed-Solution>Proposed Solution + +Abandon the guarantee. You will see every record if no changes +occur during your traversal, otherwise you will see some subset. 
+You can prevent changes by using a transaction or the locking +API. + +2.2.2 Status + +Complete. Delete-during-traverse will still delete every record, +too (assuming no other changes). + +2.3 Nesting of Transactions Is Fraught + +TDB has alternated between allowing nested transactions and not +allowing them. Various paths in the Samba codebase assume that +transactions will nest, and in a sense they can: the operation is +only committed to disk when the outer transaction is committed. +There are two problems, however: + +1. Canceling the inner transaction will cause the outer +  transaction commit to fail, and will not undo any operations +  since the inner transaction began. This problem is soluble with +  some additional internal code. + +2. An inner transaction commit can be cancelled by the outer +  transaction. This is desirable in the way which Samba's +  database initialization code uses transactions, but could be a +  surprise to any users expecting a successful transaction commit +  to expose changes to others. + +The current solution is to specify the behavior at tdb_open(), +with the default currently that nested transactions are allowed. +This flag can also be changed at runtime. + +2.3.1 Proposed Solution + +Given the usage patterns, it seems that the “least-surprise” +behavior of disallowing nested transactions should become the +default. Additionally, it seems the outer transaction is the only +code which knows whether inner transactions should be allowed, so +a flag to indicate this could be added to tdb_transaction_start. +However, this behavior can be simulated with a wrapper which uses +tdb_add_flags() and tdb_remove_flags(), so the API should not be +expanded for this relatively-obscure case. + +2.3.2 Status + +Incomplete; nesting flag is still defined as per tdb1. 
+ +2.4 Incorrect Hash Function is Not Detected + +tdb_open_ex() allows the calling code to specify a different hash +function to use, but does not check that all other processes +accessing this tdb are using the same hash function. The result +is that records are missing from tdb_fetch(). + +2.4.1 Proposed Solution + +The header should contain an example hash result (eg. the hash of +0xdeadbeef), and tdb_open_ex() should check that the given hash +function produces the same answer, or fail the tdb_open call. + +2.4.2 Status + +Complete. + +2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation + +In response to scalability issues with the free list ([TDB-Freelist-Is] +) two API workarounds have been incorporated in TDB: +tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The +latter actually calls the former with an argument of “5”. + +This code allows deleted records to accumulate without putting +them in the free list. On delete we iterate through each chain +and free them in a batch if there are more than max_dead entries. +These are never otherwise recycled except as a side-effect of a +tdb_repack. + +2.5.1 Proposed Solution + +With the scalability problems of the freelist solved, this API +can be removed. The TDB_VOLATILE flag may still be useful as a +hint that store and delete of records will be at least as common +as fetch in order to allow some internal tuning, but initially +will become a no-op. + +2.5.2 Status + +Incomplete. TDB_VOLATILE still defined, but implementation should +fail on unknown flags to be future-proof. + +2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times +  In The Same Process + +No process can open the same TDB twice; we check and disallow it. +This is an unfortunate side-effect of fcntl locks, which operate +on a per-file rather than per-file-descriptor basis, and do not +nest. 
Thus, closing any file descriptor on a file clears all the +locks obtained by this process, even if they were placed using a +different file descriptor! + +Note that even if this were solved, deadlock could occur if +operations were nested: this is a more manageable programming +error in most cases. + +2.6.1 Proposed Solution + +We could lobby POSIX to fix the perverse rules, or at least lobby +Linux to violate them so that the most common implementation does +not have this restriction. This would be a generally good idea +for other fcntl lock users. + +Samba uses a wrapper which hands out the same tdb_context to +multiple callers if this happens, and does simple reference +counting. We should do this inside the tdb library, which already +emulates lock nesting internally; it would need to recognize when +deadlock occurs within a single process. This would create a new +failure mode for tdb operations (while we currently handle +locking failures, they are impossible in normal use and a process +encountering them can do little but give up). + +I do not see benefit in an additional tdb_open flag to indicate +whether re-opening is allowed, although there may be some +benefit to adding a call to detect when a tdb_context is shared, +to allow others to create such an API. + +2.6.2 Status + +Incomplete. + +2.7 TDB API Is Not POSIX Thread-safe + +The TDB API uses an error code which can be queried after an +operation to determine what went wrong. This programming model +does not work with threads, unless specific additional guarantees +are given by the implementation. In addition, even +otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot] +). + +2.7.1 Proposed Solution + +Rearchitecting the API to include a tdb_errcode pointer would be a +great deal of churn; we are better to guarantee that the +tdb_errcode is per-thread so the current programming model can be +maintained. 
+ +This requires dynamic per-thread allocations, which is awkward +with POSIX threads (pthread_key_create space is limited and we +cannot simply allocate a key for every TDB). + +Internal locking is required to make sure that fcntl locks do not +overlap between threads, and also that the global list of tdbs is +maintained. + +The aim is that building tdb with -DTDB_PTHREAD will result in a +pthread-safe version of the library, and otherwise no overhead +will exist. Alternatively, a hooking mechanism similar to that +proposed for [Proposed-Solution-locking-hook] could be used to +enable pthread locking at runtime. + +2.7.2 Status + +Incomplete. + +2.8 *_nonblock Functions And *_mark Functions Expose +  Implementation + +CTDB[footnote: +Clustered TDB, see http://ctdb.samba.org +] wishes to operate on TDB in a non-blocking manner. This is +currently done as follows: + +1. Call the _nonblock variant of an API function (eg. +  tdb_lockall_nonblock). If this fails: + +2. Fork a child process, and wait for it to call the normal +  variant (eg. tdb_lockall). + +3. If the child succeeds, call the _mark variant to indicate we +  already have the locks (eg. tdb_lockall_mark). + +4. Upon completion, tell the child to release the locks (eg. +  tdb_unlockall). + +5. Indicate to tdb that it should consider the locks removed (eg. +  tdb_unlockall_mark). + +There are several issues with this approach. Firstly, adding two +new variants of each function clutters the API for an obscure +use, and so not all functions have three variants. Secondly, it +assumes that all paths of the functions ask for the same locks, +otherwise the parent process will have to get a lock which the +child doesn't have under some circumstances. I don't believe this +is currently the case, but it constrains the implementation. + +2.8.1 <Proposed-Solution-locking-hook>Proposed Solution + +Implement a hook for locking methods, so that the caller can +control the calls to create and remove fcntl locks. 
In this +scenario, ctdbd would operate as follows: + +1. Call the normal API function, eg tdb_lockall(). + +2. When the lock callback comes in, check if the child has the +  lock. Initially, this is always false. If so, return 0. +  Otherwise, try to obtain it in non-blocking mode. If that +  fails, return EWOULDBLOCK. + +3. Release locks in the unlock callback as normal. + +4. If tdb_lockall() fails, see if we recorded a lock failure; if +  so, call the child to repeat the operation. + +5. The child records what locks it obtains, and returns that +  information to the parent. + +6. When the child has succeeded, goto 1. + +This is flexible enough to handle any potential locking scenario, +even when lock requirements change. It can be optimized so that +the parent does not release locks, just tells the child which +locks it doesn't need to obtain. + +It also keeps the complexity out of the API, and in ctdbd where +it is needed. + +2.8.2 Status + +Incomplete. + +2.9 tdb_chainlock Functions Expose Implementation + +tdb_chainlock locks some number of records, including the record +indicated by the given key. This gave atomicity guarantees; +no-one can start a transaction, alter, read or delete that key +while the lock is held. + +It also makes the same guarantee for any other key in the chain, +which is an internal implementation detail and potentially a +cause for deadlock. + +2.9.1 Proposed Solution + +None. It would be nice to have an explicit single entry lock +which affected no other keys. Unfortunately, this won't work for +an entry which doesn't exist. Thus while chainlock may be +implemented more efficiently for the existing case, it will still +have overlap issues with the non-existing case. So it is best to +keep the current (lack of) guarantee about which records will be +affected to avoid constraining our implementation. 
+ +2.10 Signal Handling is Not Race-Free + +The tdb_setalarm_sigptr() call allows the caller's signal handler +to indicate that the tdb locking code should return with a +failure, rather than trying again when a signal is received (and +errno == EAGAIN). This is usually used to implement timeouts. + +Unfortunately, this does not work in the case where the signal is +received before the tdb code enters the fcntl() call to place the +lock: the code will sleep within the fcntl() code, unaware that +the signal wants it to exit. In the case of long timeouts, this +does not happen in practice. + +2.10.1 Proposed Solution + +The locking hooks proposed in[Proposed-Solution-locking-hook] +would allow the user to decide on whether to fail the lock +acquisition on a signal. This allows the caller to choose their +own compromise: they could narrow the race by checking +immediately before the fcntl call.[footnote: +It may be possible to make this race-free in some implementations +by having the signal handler alter the struct flock to make it +invalid. This will cause the fcntl() lock call to fail with +EINVAL if the signal occurs before the kernel is entered, +otherwise EAGAIN. +] + +2.10.2 Status + +Incomplete. + +2.11 The API Uses Gratuitous Typedefs, Capitals + +typedefs are useful for providing source compatibility when types +can differ across implementations, or arguably in the case of +function pointer definitions which are hard for humans to parse. +Otherwise it is simply obfuscation and pollutes the namespace. + +Capitalization is usually reserved for compile-time constants and +macros. + +  TDB_CONTEXT There is no reason to use this over 'struct +  tdb_context'; the definition isn't visible to the API user +  anyway. + +  TDB_DATA There is no reason to use this over struct TDB_DATA; +  the struct needs to be understood by the API user. + +  struct TDB_DATA This would normally be called 'struct +  tdb_data'. 
+ +  enum TDB_ERROR Similarly, this would normally be enum +  tdb_error. + +2.11.1 Proposed Solution + +None. Introducing lower case variants would please pedants like +myself, but if it were done the existing ones should be kept. +There is little point forcing a purely cosmetic change upon tdb +users. + +2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The +  Private Pointer + +For API compatibility reasons, the logging function needs to call +tdb_get_logging_private() to retrieve the pointer registered by +the tdb_open_ex for logging. + +2.12.1 Proposed Solution + +It should simply take an extra argument, since we are prepared to +break the API/ABI. + +2.12.2 Status + +Complete. + +2.13 Various Callback Functions Are Not Typesafe + +The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] + is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read +and tdb_check all take void * and must internally convert it to +the argument type they were expecting. + +If this type changes, the compiler will not produce warnings on +the callers, since it only sees void *. + +2.13.1 Proposed Solution + +With careful use of macros, we can create callback functions +which give a warning when used on gcc and the types of the +callback and its private argument differ. Unsupported compilers +will not give a warning, which is no worse than now. In addition, +the callbacks become clearer, as they need not use void * for +their parameter. + +See CCAN's typesafe_cb module at +http://ccan.ozlabs.org/info/typesafe_cb.html + +2.13.2 Status + +Incomplete. + +2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, +  tdb_reopen_all Problematic + +The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB +file should be cleared if the caller discovers it is the only +process with the TDB open. 
However, if any caller does not +specify TDB_CLEAR_IF_FIRST it will not be detected, so will have +the TDB erased underneath them (usually resulting in a crash). + +There is a similar issue on fork(); if the parent exits (or +otherwise closes the tdb) before the child calls tdb_reopen_all() +to establish the lock used to indicate the TDB is opened by +someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe +it alone has opened the TDB and will erase it. + +2.14.1 Proposed Solution + +Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but +see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. + +2.14.2 Status + +Incomplete, TDB_CLEAR_IF_FIRST still defined, but not +implemented. + +2.15 Extending The Header Is Difficult + +We have reserved (zeroed) words in the TDB header, which can be +used for future features. If the future features are compulsory, +the version number must be updated to prevent old code from +accessing the database. But if the future feature is optional, we +have no way of telling if older code is accessing the database or +not. + +2.15.1 Proposed Solution + +The header should contain a “format variant” value (64-bit). This +is divided into two 32-bit parts: + +1. The lower part reflects the format variant understood by code +  accessing the database. + +2. The upper part reflects the format variant you must understand +  to write to the database (otherwise you can only open for +  reading). + +The latter field can only be written at creation time, the former +should be written under the OPEN_LOCK when opening the database +for writing, if the variant of the code is lower than the current +lowest variant. + +This should allow backwards-compatible features to be added, and +detection if older code (which doesn't understand the feature) +writes to the database. + +2.15.2 Status + +Incomplete. 
+ +2.16 Record Headers Are Not Expandable + +If we later want to add (say) checksums on keys and data, it +would require another format change, which we'd like to avoid. + +2.16.1 Proposed Solution + +We often have extra padding at the tail of a record. If we ensure +that the first byte (if any) of this padding is zero, we will +have a way for future changes to detect code which doesn't +understand a new format: the new code would write (say) a 1 at +the tail, and thus if there is no tail or the first byte is 0, we +would know the extension is not present on that record. + +2.16.2 Status + +Incomplete. + +2.17 TDB Does Not Use Talloc + +Many users of TDB (particularly Samba) use the talloc allocator, +and thus have to wrap TDB in a talloc context to use it +conveniently. + +2.17.1 Proposed Solution + +The allocation within TDB is not complicated enough to justify +the use of talloc, and I am reluctant to force another +(excellent) library on TDB users. Nonetheless a compromise is +possible. An attribute (see [attributes]) can be added later to +tdb_open() to provide an alternate allocation mechanism, +specifically for talloc but usable by any other allocator (which +would ignore the “context” argument). + +This would form a talloc hierarchy as expected, but the caller +would still have to attach a destructor to the tdb context +returned from tdb_open to close it. All TDB_DATA fields would be +children of the tdb_context, and the caller would still have to +manage them (using talloc_free() or talloc_steal()). + +2.17.2 Status + +Deferred. + +3 Performance And Scalability Issues + +3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST +  Imposes Performance Penalty + +When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is +placed at offset 4 (aka. the ACTIVE_LOCK). While these locks +never conflict in normal tdb usage, they do add substantial +overhead for most fcntl lock implementations when the kernel +scans to detect if a lock conflict exists.
This is often a single +linked list, making the time to acquire and release a fcntl lock +O(N) where N is the number of processes with the TDB open, not +the number actually doing work. + +In a Samba server it is common to have huge numbers of clients +sitting idle, and thus they have weaned themselves off the +TDB_CLEAR_IF_FIRST flag.[footnote: +There is a flag to tdb_reopen_all() which is used for this +optimization: if the parent process will outlive the child, the +child does not need the ACTIVE_LOCK. This is a workaround for +this very performance issue. +] + +3.1.1 Proposed Solution + +Remove the flag. It was a neat idea, but even trivial servers +tend to know when they are initializing for the first time and +can simply unlink the old tdb at that point. + +3.1.2 Status + +Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing. + +3.2 TDB Files Have a 4G Limit + +This seems to be becoming an issue (so much for “trivial”!), +particularly for ldb. + +3.2.1 Proposed Solution + +A new, incompatible TDB format which uses 64 bit offsets +internally rather than 32 bit as now. For simplicity of endian +conversion (which TDB does on the fly if required), all values +will be 64 bit on disk. In practice, some upper bits may be used +for other purposes, but at least 56 bits will be available for +file offsets. + +tdb_open() will automatically detect the old version, and even +create them if TDB_VERSION6 is specified to tdb_open. + +32 bit processes will still be able to access TDBs larger than 4G +(assuming that their off_t allows them to seek to 64 bits), they +will gracefully fall back as they fail to mmap. This can happen +already with large TDBs. + +Old versions of tdb will fail to open the new TDB files (since 28 +August 2009, commit 398d0c29290: prior to that any unrecognized +file format would be erased and initialized as a fresh tdb!) + +3.2.2 Status + +Complete. 
+ +3.3 TDB Records Have a 4G Limit + +This has not been a reported problem, and the API uses size_t +which can be 64 bit on 64 bit platforms. However, other limits +may have made such an issue moot. + +3.3.1 Proposed Solution + +Record sizes will be 64 bit, with an error returned on 32 bit +platforms which try to access such records (the current +implementation would return TDB_ERR_OOM in a similar case). It +seems unlikely that 32 bit keys will be a limitation, so the +implementation may not support this (see [sub:Records-Incur-A]). + +3.3.2 Status + +Complete. + +3.4 Hash Size Is Determined At TDB Creation Time + +TDB contains a number of hash chains in the header; the number is +specified at creation time, and defaults to 131. This is such a +bottleneck on large databases (as each hash chain gets quite +long), that LDB uses 10,000 for this hash. In general it is +impossible to know what the 'right' answer is at database +creation time. + +3.4.1 <sub:Hash-Size-Solution>Proposed Solution + +After comprehensive performance testing on various scalable hash +variants[footnote: +http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 +This was annoying because I was previously convinced that an +expanding tree of hashes would be very close to optimal. +], it became clear that it is hard to beat a straight linear hash +table which doubles in size when it reaches saturation. +Unfortunately, altering the hash table introduces serious locking +complications: the entire hash table needs to be locked to +enlarge the hash table, and others might be holding locks. +Particularly insidious are insertions done under tdb_chainlock. + +Thus an expanding layered hash will be used: an array of hash +groups, with each hash group exploding into pointers to lower +hash groups once it fills, turning into a hash tree. This has +implications for locking: we must lock the entire group in case +we need to expand it, yet we don't know how deep the tree is at +that point. 
+ +Note that bits from the hash table entries should be stolen to +hold more hash bits to reduce the penalty of collisions. We can +use the otherwise-unused lower 3 bits. If we limit the size of +the database to 64 exabytes, we can use the top 8 bits of the +hash entry as well. These 11 bits would reduce false positives +down to 1 in 2000 which is more than we need: we can use one of +the bits to indicate that the extra hash bits are valid. This +means we can choose not to re-hash all entries when we expand a +hash group; simply use the next bits we need and mark them +invalid. + +3.4.2 Status + +Complete. + +3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended + +TDB uses a single linked list for the free list. Allocation +occurs as follows, using heuristics which have evolved over time: + +1. Get the free list lock for this whole operation. + +2. Multiply length by 1.25, so we always over-allocate by 25%. + +3. Set the slack multiplier to 1. + +4. Examine the current freelist entry: if it is > length but < +  the current best case, remember it as the best case. + +5. Multiply the slack multiplier by 1.05. + +6. If our best fit so far is less than length * slack multiplier, +  return it. The slack will be turned into a new free record if +  it's large enough. + +7. Otherwise, go onto the next freelist entry. + +Deleting a record occurs as follows: + +1. Lock the hash chain for this whole operation. + +2. Walk the chain to find the record, keeping the prev pointer +  offset. + +3. If max_dead is non-zero: + +  (a) Walk the hash chain again and count the dead records. + +  (b) If it's more than max_dead, bulk free all the dead ones +    (similar to steps 4 and below, but the lock is only obtained +    once). + +  (c) Simply mark this record as dead and return. + +4. Get the free list lock for the remainder of this operation. + +5. 
<right-merging>Examine the following block to see if it is +  free; if so, enlarge the current block and remove that block +  from the free list. This was disabled, as removal from the free +  list was O(entries-in-free-list). + +6. Examine the preceeding block to see if it is free: for this +  reason, each block has a 32-bit tailer which indicates its +  length. If it is free, expand it to cover our new block and +  return. + +7. Otherwise, prepend ourselves to the free list. + +Disabling right-merging (step [right-merging]) causes +fragmentation; the other heuristics proved insufficient to +address this, so the final answer to this was that when we expand +the TDB file inside a transaction commit, we repack the entire +tdb. + +The single list lock limits our allocation rate; due to the other +issues this is not currently seen as a bottleneck. + +3.5.1 Proposed Solution + +The first step is to remove all the current heuristics, as they +obviously interact, then examine them once the lock contention is +addressed. + +The free list must be split to reduce contention. Assuming +perfect free merging, we can at most have 1 free list entry for +each entry. This implies that the number of free lists is related +to the size of the hash table, but as it is rare to walk a large +number of free list entries we can use far fewer, say 1/32 of the +number of hash buckets. + +It seems tempting to try to reuse the hash implementation which +we use for records here, but we have two ways of searching for +free entries: for allocation we search by size (and possibly +zone) which produces too many clashes for our hash table to +handle well, and for coalescing we search by address. Thus an +array of doubly-linked free lists seems preferable. + +There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented] +) but it's not clear this would reduce contention in the common +case where all processes are allocating/freeing the same size. 
+Thus we almost certainly need to divide in other ways: the most +obvious is to divide the file into zones, and use a free list +(or table of free lists) for each. This approximates address +ordering. + +Unfortunately it is difficult to know what heuristics should be +used to determine zone sizes, and our transaction code relies on +being able to create a “recovery area” by simply appending to the +file (difficult if it would need to create a new zone header). +Thus we use a linked-list of free tables; currently we only ever +create one, but if there is more than one we choose one at random +to use. In future we may use heuristics to add new free tables on +contention. We only expand the file when all free tables are +exhausted. + +The basic algorithm is as follows. Freeing is simple: + +1. Identify the correct free list. + +2. Lock the corresponding list. + +3. Re-check the list (we didn't have a lock, sizes could have +  changed): relock if necessary. + +4. Place the freed entry in the list. + +Allocation is a little more complicated, as we perform delayed +coalescing at this point: + +1. Pick a free table; usually the previous one. + +2. Lock the corresponding list. + +3. If the top entry is large enough, remove it from the list and +  return it. + +4. Otherwise, coalesce entries in the list. If there was no entry +  large enough, unlock the list and try the next largest list. + +5. If no list has an entry which meets our needs, try the next +  free table. + +6. If no zone satisfies, expand the file. + +This optimizes rapid insert/delete of free list entries by not +coalescing them all the time. First-fit address +ordering seems to be fairly good for keeping fragmentation low +(see [sub:TDB-Becomes-Fragmented]). Note that address ordering +does not need a tailer to coalesce, though if we needed one we +could have one cheaply: see [sub:Records-Incur-A]. + +Each free entry has the free table number in the header: less +than 255.
It also contains a doubly-linked list for easy +deletion. + +3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented + +Much of this is a result of allocation strategy[footnote: +The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 +ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps +] and deliberate hobbling of coalescing; internal fragmentation +(aka overallocation) is deliberately set at 25%, and external +fragmentation is only cured by the decision to repack the entire +db when a transaction commit needs to enlarge the file. + +3.6.1 Proposed Solution + +The 25% overhead on allocation works in practice for ldb because +indexes tend to expand by one record at a time. This internal +fragmentation can be resolved by having an “expanded” bit in the +header to note entries that have previously expanded, and +allocating more space for them. + +There is a spectrum of possible solutions for external +fragmentation: one is to use a fragmentation-avoiding allocation +strategy such as best-fit address-order allocator. The other end +of the spectrum would be to use a bump allocator (very fast and +simple) and simply repack the file when we reach the end. + +There are three problems with efficient fragmentation-avoiding +allocators: they are non-trivial, they tend to use a single free +list for each size, and there's no evidence that tdb allocation +patterns will match those recorded for general allocators (though +it seems likely). + +Thus we don't spend too much effort on external fragmentation; we +will be no worse than the current code if we need to repack on +occasion. More effort is spent on reducing freelist contention, +and reducing overhead.
+ +3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead + +Each TDB record has a header as follows: + +struct tdb_record { + +        tdb_off_t next; /* offset of the next record in the list +*/ + +        tdb_len_t rec_len; /* total byte length of record */ + +        tdb_len_t key_len; /* byte length of key */ + +        tdb_len_t data_len; /* byte length of data */ + +        uint32_t full_hash; /* the full 32 bit hash of the key */ + +        uint32_t magic;   /* try to catch errors */ + +        /* the following union is implied: + +                union { + +                        char record[rec_len]; + +                        struct { + +                                char key[key_len]; + +                                char data[data_len]; + +                        } + +                        uint32_t totalsize; (tailer) + +                } + +        */ + +}; + +Naively, this would double to a 56-byte overhead on a 64 bit +implementation. + +3.7.1 Proposed Solution + +We can use various techniques to reduce this for an allocated +block: + +1. The 'next' pointer is not required, as we are using a flat +  hash table. + +2. 'rec_len' can instead be expressed as an addition to key_len +  and data_len (it accounts for wasted or overallocated length in +  the record). Since the record length is always a multiple of 8, +  we can conveniently fit it in 32 bits (representing up to 35 +  bits). + +3. 'key_len' and 'data_len' can be reduced. I'm unwilling to +  restrict 'data_len' to 32 bits, but instead we can combine the +  two into one 64-bit field and using a 5 bit value which +  indicates at what bit to divide the two. Keys are unlikely to +  scale as fast as data, so I'm assuming a maximum key size of 32 +  bits. + +4. 'full_hash' is used to avoid a memcmp on the “miss” case, but +  this is diminishing returns after a handful of bits (at 10 +  bits, it reduces 99.9% of false memcmp). 
As an aside, as the +  lower bits are already incorporated in the hash table +  resolution, the upper bits should be used here. Note that it's +  not clear that these bits will be a win, given the extra bits +  in the hash table itself (see [sub:Hash-Size-Solution]). + +5. 'magic' does not need to be enlarged: it currently reflects +  one of 5 values (used, free, dead, recovery, and +  unused_recovery). It is useful for quick sanity checking +  however, and should not be eliminated. + +6. 'tailer' is only used to coalesce free blocks (so a block to +  the right can find the header to check if this block is free). +  This can be replaced by a single 'free' bit in the header of +  the following block (and the tailer only exists in free +  blocks).[footnote: +This technique from Thomas Standish. Data Structure Techniques. +Addison-Wesley, Reading, Massachusetts, 1980. +] The current proposed coalescing algorithm doesn't need this, +  however. + +This produces a 16 byte used header like this: + +struct tdb_used_record { + +        uint32_t used_magic : 16, + + + +                 key_data_divide: 5, + +                 top_hash: 11; + +        uint32_t extra_octets; + +        uint64_t key_and_data_len; + +}; + +And a free record like this: + +struct tdb_free_record { + +        uint64_t free_magic: 8, + +                   prev : 56; + + + +        uint64_t free_table: 8, + +                 total_length : 56; + +        uint64_t next; + +}; + +Note that by limiting valid offsets to 56 bits, we can pack +everything we need into 3 64-bit words, meaning our minimum +record size is 8 bytes. + +3.7.2 Status + +Complete. + +3.8 Transaction Commit Requires 4 fdatasync + +The current transaction algorithm is: + +1. write_recovery_data(); + +2. sync(); + +3. write_recovery_header(); + +4. sync(); + +5. overwrite_with_new_data(); + +6. sync(); + +7. remove_recovery_header(); + +8.
sync(); + +On current ext3, each sync flushes all data to disk, so the next +3 syncs are relatively expensive. But this could become a +performance bottleneck on other filesystems such as ext4. + +3.8.1 Proposed Solution + +Neil Brown points out that this is overzealous, and only one sync +is needed: + +1. Bundle the recovery data, a transaction counter and a strong +  checksum of the new data. + +2. Strong checksum that whole bundle. + +3. Store the bundle in the database. + +4. Overwrite the oldest of the two recovery pointers in the +  header (identified using the transaction counter) with the +  offset of this bundle. + +5. sync. + +6. Write the new data to the file. + +Checking for recovery means identifying the latest bundle with a +valid checksum and using the new data checksum to ensure that it +has been applied. This is more expensive than the current check, +but need only be done at open. For running databases, a separate +header field can be used to indicate a transaction in progress; +we need only check for recovery if this is set. + +3.8.2 Status + +Deferred. + +3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support + +3.9.1 Proposed Solution + +None. At some point you say “use a real +database” (but see [replay-attribute]). + +But as a thought experiment, if we implemented transactions to +only overwrite free entries (this is tricky: there must not be a +header in each entry which indicates whether it is free, but use +of presence in metadata elsewhere), and a pointer to the hash +table, we could create an entirely new commit without destroying +existing data. Then it would be easy to implement snapshots in a +similar way. + +This would not allow arbitrary changes to the database, such as +tdb_repack does, and would require more space (since we have to +preserve the current and future entries at once). If we used hash +trees rather than one big hash table, we might only have to +rewrite some sections of the hash, too.
+ +We could then implement snapshots using a similar method, using +multiple different hash tables/free tables. + +3.9.2 Status + +Deferred. + +3.10 Transactions Cannot Operate in Parallel + +This would be useless for ldb, as it hits the index records with +just about every update. It would add significant complexity in +resolving clashes, and cause all the transaction callers to write +their code to loop in the case where the transactions spuriously +failed. + +3.10.1 Proposed Solution + +None (but see [replay-attribute]). We could solve a small part of +the problem by providing read-only transactions. These would +allow one write transaction to begin, but it could not commit +until all r/o transactions are done. This would require a new +RO_TRANSACTION_LOCK, which would be upgraded on commit. + +3.10.2 Status + +Deferred. + +3.11 Default Hash Function Is Suboptimal + +The Knuth-inspired multiplicative hash used by tdb is fairly slow +(especially if we expand it to 64 bits), and works best when the +hash bucket size is a prime number (which also means a slow +modulus). In addition, it is highly predictable which could +potentially lead to a Denial of Service attack in some TDB uses. + +3.11.1 Proposed Solution + +The Jenkins lookup3 hash[footnote: +http://burtleburtle.net/bob/c/lookup3.c +] is a fast and superbly-mixing hash. It's used by the Linux +kernel and almost everything else. This has the particular +properties that it takes an initial seed, and produces two 32 bit +hash numbers, which we can combine into a 64-bit hash. + +The seed should be created at tdb-creation time from some random +source, and placed in the header. This is far from foolproof, but +adds a little bit of protection against hash bombing. + +3.11.2 Status + +Complete. + +3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity + +We lock a record during traversal iteration, and try to grab that +lock in the delete code.
If that grab on delete fails, we simply +mark it deleted and continue onwards; traversal checks for this +condition and does the delete when it moves off the record. + +If traversal terminates, the dead record may be left +indefinitely. + +3.12.1 Proposed Solution + +Remove reliability guarantees; see [traverse-Proposed-Solution]. + +3.12.2 Status + +Complete. + +3.13 Fcntl Locking Adds Overhead + +Placing a fcntl lock means a system call, as does removing one. +This is actually one reason why transactions can be faster +(everything is locked once at transaction start). In the +uncontended case, this overhead can theoretically be eliminated. + +3.13.1 Proposed Solution + +None. + +We tried this before with spinlock support, in the early days of +TDB, and it didn't make much difference except in manufactured +benchmarks. + +We could use spinlocks (with futex kernel support under Linux), +but it means that we lose automatic cleanup when a process dies +with a lock. There is a method of auto-cleanup under Linux, but +it's not supported by other operating systems. We could +reintroduce a clear-if-first-style lock and sweep for dead +futexes on open, but that wouldn't help the normal case of one +concurrent opener dying. Increasingly elaborate repair schemes +could be considered, but they require an ABI change (everyone +must use them) anyway, so there's no need to do this at the same +time as everything else. + +3.14 Some Transactions Don't Require Durability + +Volker points out that gencache uses a CLEAR_IF_FIRST tdb for +normal (fast) usage, and occasionally empties the results into a +transactional TDB. This kind of usage prioritizes performance +over durability: as long as we are consistent, data can be lost. + +This would be more neatly implemented inside tdb: a “soft” +transaction commit (ie. syncless) which meant that data may be +reverted on a crash. + +3.14.1 Proposed Solution + +None. 
+ +Unfortunately any transaction scheme which overwrites old data +requires a sync before that overwrite to avoid the possibility of +corruption. + +It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not] +,where transactions are committed without overwriting existing +data, and an array of top-level pointers were available in the +header. If the transaction is “soft” then we would not need a +sync at all: existing processes would pick up the new hash table +and free list and work with that. + +At some later point, a sync would allow recovery of the old data +into the free lists (perhaps when the array of top-level pointers +filled). On crash, tdb_open() would examine the array of top +levels, and apply the transactions until it encountered an +invalid checksum. + +3.15 Tracing Is Fragile, Replay Is External + +The current TDB has compile-time-enabled tracing code, but it +often breaks as it is not enabled by default. In a similar way, +the ctdb code has an external wrapper which does replay tracing +so it can coordinate cluster-wide transactions. + +3.15.1 Proposed Solution<replay-attribute> + +Tridge points out that an attribute can be later added to +tdb_open (see [attributes]) to provide replay/trace hooks, which +could become the basis for this and future parallel transactions +and snapshot support. + +3.15.2 Status + +Deferred. diff --git a/lib/tdb2/free.c b/lib/tdb2/free.c new file mode 100644 index 0000000000..a770751dc0 --- /dev/null +++ b/lib/tdb2/free.c @@ -0,0 +1,968 @@ + /* +   Trivial Database 2: free list/block handling +   Copyright (C) Rusty Russell 2010 + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. 
+ +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ +#include "private.h" +#include <ccan/likely/likely.h> +#include <ccan/ilog/ilog.h> +#include <time.h> +#include <assert.h> +#include <limits.h> + +static unsigned fls64(uint64_t val) +{ +	return ilog64(val); +} + +/* In which bucket would we find a particular record size? (ignoring header) */ +unsigned int size_to_bucket(tdb_len_t data_len) +{ +	unsigned int bucket; + +	/* We can't have records smaller than this. */ +	assert(data_len >= TDB_MIN_DATA_LEN); + +	/* Ignoring the header... */ +	if (data_len - TDB_MIN_DATA_LEN <= 64) { +		/* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */ +		bucket = (data_len - TDB_MIN_DATA_LEN) / 8; +	} else { +		/* After that we go power of 2. */ +		bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2; +	} + +	if (unlikely(bucket >= TDB_FREE_BUCKETS)) +		bucket = TDB_FREE_BUCKETS - 1; +	return bucket; +} + +tdb_off_t first_ftable(struct tdb_context *tdb) +{ +	return tdb_read_off(tdb, offsetof(struct tdb_header, free_table)); +} + +tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable) +{ +	return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next)); +} + +enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb) +{ +	/* Use reservoir sampling algorithm to select a free list at random. 
*/ +	unsigned int rnd, max = 0, count = 0; +	tdb_off_t off; + +	tdb->ftable_off = off = first_ftable(tdb); +	tdb->ftable = 0; + +	while (off) { +		if (TDB_OFF_IS_ERR(off)) { +			return off; +		} + +		rnd = random(); +		if (rnd >= max) { +			tdb->ftable_off = off; +			tdb->ftable = count; +			max = rnd; +		} + +		off = next_ftable(tdb, off); +		count++; +	} +	return TDB_SUCCESS; +} + +/* Offset of a given bucket. */ +tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket) +{ +	return ftable_off + offsetof(struct tdb_freetable, buckets) +		+ bucket * sizeof(tdb_off_t); +} + +/* Returns free_buckets + 1, or list number to search, or -ve error. */ +static tdb_off_t find_free_head(struct tdb_context *tdb, +				tdb_off_t ftable_off, +				tdb_off_t bucket) +{ +	/* Speculatively search for a non-zero bucket. */ +	return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0), +				    bucket, TDB_FREE_BUCKETS); +} + +static void check_list(struct tdb_context *tdb, tdb_off_t b_off) +{ +#ifdef CCAN_TDB2_DEBUG +	tdb_off_t off, prev = 0, first; +	struct tdb_free_record r; + +	first = off = (tdb_read_off(tdb, b_off) & TDB_OFF_MASK); +	while (off != 0) { +		tdb_read_convert(tdb, off, &r, sizeof(r)); +		if (frec_magic(&r) != TDB_FREE_MAGIC) +			abort(); +		if (prev && frec_prev(&r) != prev) +			abort(); +		prev = off; +		off = r.next; +	} + +	if (first) { +		tdb_read_convert(tdb, first, &r, sizeof(r)); +		if (frec_prev(&r) != prev) +			abort(); +	} +#endif +} + +/* Remove from free bucket. */ +static enum TDB_ERROR remove_from_list(struct tdb_context *tdb, +				       tdb_off_t b_off, tdb_off_t r_off, +				       const struct tdb_free_record *r) +{ +	tdb_off_t off, prev_next, head; +	enum TDB_ERROR ecode; + +	/* Is this only element in list?  Zero out bucket, and we're done. 
*/ +	if (frec_prev(r) == r_off) +		return tdb_write_off(tdb, b_off, 0); + +	/* off = &r->prev->next */ +	off = frec_prev(r) + offsetof(struct tdb_free_record, next); + +	/* Get prev->next */ +	prev_next = tdb_read_off(tdb, off); +	if (TDB_OFF_IS_ERR(prev_next)) +		return prev_next; + +	/* If prev->next == 0, we were head: update bucket to point to next. */ +	if (prev_next == 0) { +		/* We must preserve upper bits. */ +		head = tdb_read_off(tdb, b_off); +		if (TDB_OFF_IS_ERR(head)) +			return head; + +		if ((head & TDB_OFF_MASK) != r_off) { +			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +					  "remove_from_list:" +					  " %llu head %llu on list %llu", +					  (long long)r_off, +					  (long long)head, +					  (long long)b_off); +		} +		head = ((head & ~TDB_OFF_MASK) | r->next); +		ecode = tdb_write_off(tdb, b_off, head); +		if (ecode != TDB_SUCCESS) +			return ecode; +	} else { +		/* r->prev->next = r->next */ +		ecode = tdb_write_off(tdb, off, r->next); +		if (ecode != TDB_SUCCESS) +			return ecode; +	} + +	/* If we were the tail, off = &head->prev. */ +	if (r->next == 0) { +		head = tdb_read_off(tdb, b_off); +		if (TDB_OFF_IS_ERR(head)) +			return head; +		head &= TDB_OFF_MASK; +		off = head + offsetof(struct tdb_free_record, magic_and_prev); +	} else { +		/* off = &r->next->prev */ +		off = r->next + offsetof(struct tdb_free_record, +					 magic_and_prev); +	} + +#ifdef CCAN_TDB2_DEBUG +	/* *off == r */ +	if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "remove_from_list:" +				  " %llu bad prev in list %llu", +				  (long long)r_off, (long long)b_off); +	} +#endif +	/* r->next->prev = r->prev */ +	return tdb_write_off(tdb, off, r->magic_and_prev); +} + +/* Enqueue in this free bucket: sets coalesce if we've added 128 + * entries to it. 
*/ +static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb, +				      tdb_off_t b_off, +				      tdb_off_t off, +				      tdb_len_t len, +				      bool *coalesce) +{ +	struct tdb_free_record new; +	enum TDB_ERROR ecode; +	tdb_off_t prev, head; +	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)); + +	head = tdb_read_off(tdb, b_off); +	if (TDB_OFF_IS_ERR(head)) +		return head; + +	/* We only need to set ftable_and_len; rest is set in enqueue_in_free */ +	new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL)) +		| len; + +	/* new->next = head. */ +	new.next = (head & TDB_OFF_MASK); + +	/* First element?  Prev points to ourselves. */ +	if (!new.next) { +		new.magic_and_prev = (magic | off); +	} else { +		/* new->prev = next->prev */ +		prev = tdb_read_off(tdb, +				    new.next + offsetof(struct tdb_free_record, +							magic_and_prev)); +		new.magic_and_prev = prev; +		if (frec_magic(&new) != TDB_FREE_MAGIC) { +			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +					  "enqueue_in_free: %llu bad head" +					  " prev %llu", +					  (long long)new.next, +					  (long long)prev); +		} +		/* next->prev = new. 
*/ +		ecode = tdb_write_off(tdb, new.next +				      + offsetof(struct tdb_free_record, +						 magic_and_prev), +				      off | magic); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} + +#ifdef CCAN_TDB2_DEBUG +		prev = tdb_read_off(tdb, frec_prev(&new) +				    + offsetof(struct tdb_free_record, next)); +		if (prev != 0) { +			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +					  "enqueue_in_free:" +					  " %llu bad tail next ptr %llu", +					  (long long)frec_prev(&new) +					  + offsetof(struct tdb_free_record, +						     next), +					  (long long)prev); +		} +#endif +	} + +	/* Update enqueue count, but don't set high bit: see TDB_OFF_IS_ERR */ +	if (*coalesce) +		head += (1ULL << (64 - TDB_OFF_UPPER_STEAL)); +	head &= ~(TDB_OFF_MASK | (1ULL << 63)); +	head |= off; + +	ecode = tdb_write_off(tdb, b_off, head); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	/* It's time to coalesce if counter wrapped. */ +	if (*coalesce) +		*coalesce = ((head & ~TDB_OFF_MASK) == 0); + +	return tdb_write_convert(tdb, off, &new, sizeof(new)); +} + +static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable) +{ +	tdb_off_t off; +	unsigned int i; + +	if (likely(tdb->ftable == ftable)) +		return tdb->ftable_off; + +	off = first_ftable(tdb); +	for (i = 0; i < ftable; i++) { +		if (TDB_OFF_IS_ERR(off)) { +			break; +		} +		off = next_ftable(tdb, off); +	} +	return off; +} + +/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and + * need to blatt the *protect record (which is set to an error). 
*/ +static tdb_len_t coalesce(struct tdb_context *tdb, +			  tdb_off_t off, tdb_off_t b_off, +			  tdb_len_t data_len, +			  tdb_off_t *protect) +{ +	tdb_off_t end; +	struct tdb_free_record rec; +	enum TDB_ERROR ecode; + +	tdb->stats.alloc_coalesce_tried++; +	end = off + sizeof(struct tdb_used_record) + data_len; + +	while (end < tdb->file->map_size) { +		const struct tdb_free_record *r; +		tdb_off_t nb_off; +		unsigned ftable, bucket; + +		r = tdb_access_read(tdb, end, sizeof(*r), true); +		if (TDB_PTR_IS_ERR(r)) { +			ecode = TDB_PTR_ERR(r); +			goto err; +		} + +		if (frec_magic(r) != TDB_FREE_MAGIC +		    || frec_ftable(r) == TDB_FTABLE_NONE) { +			tdb_access_release(tdb, r); +			break; +		} + +		ftable = frec_ftable(r); +		bucket = size_to_bucket(frec_len(r)); +		nb_off = ftable_offset(tdb, ftable); +		if (TDB_OFF_IS_ERR(nb_off)) { +			tdb_access_release(tdb, r); +			ecode = nb_off; +			goto err; +		} +		nb_off = bucket_off(nb_off, bucket); +		tdb_access_release(tdb, r); + +		/* We may be violating lock order here, so best effort. */ +		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) +		    != TDB_SUCCESS) { +			tdb->stats.alloc_coalesce_lockfail++; +			break; +		} + +		/* Now we have lock, re-check. */ +		ecode = tdb_read_convert(tdb, end, &rec, sizeof(rec)); +		if (ecode != TDB_SUCCESS) { +			tdb_unlock_free_bucket(tdb, nb_off); +			goto err; +		} + +		if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) { +			tdb->stats.alloc_coalesce_race++; +			tdb_unlock_free_bucket(tdb, nb_off); +			break; +		} + +		if (unlikely(frec_ftable(&rec) != ftable) +		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) { +			tdb->stats.alloc_coalesce_race++; +			tdb_unlock_free_bucket(tdb, nb_off); +			break; +		} + +		/* Did we just mess up a record you were hoping to use? 
*/ +		if (end == *protect) { +			tdb->stats.alloc_coalesce_iterate_clash++; +			*protect = TDB_ERR_NOEXIST; +		} + +		ecode = remove_from_list(tdb, nb_off, end, &rec); +		check_list(tdb, nb_off); +		if (ecode != TDB_SUCCESS) { +			tdb_unlock_free_bucket(tdb, nb_off); +			goto err; +		} + +		end += sizeof(struct tdb_used_record) + frec_len(&rec); +		tdb_unlock_free_bucket(tdb, nb_off); +		tdb->stats.alloc_coalesce_num_merged++; +	} + +	/* Didn't find any adjacent free? */ +	if (end == off + sizeof(struct tdb_used_record) + data_len) +		return 0; + +	/* Before we expand, check this isn't one you wanted protected? */ +	if (off == *protect) { +		*protect = TDB_ERR_EXISTS; +		tdb->stats.alloc_coalesce_iterate_clash++; +	} + +	/* OK, expand initial record */ +	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); +	if (ecode != TDB_SUCCESS) { +		goto err; +	} + +	if (frec_len(&rec) != data_len) { +		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				   "coalesce: expected data len %zu not %zu", +				   (size_t)data_len, (size_t)frec_len(&rec)); +		goto err; +	} + +	ecode = remove_from_list(tdb, b_off, off, &rec); +	check_list(tdb, b_off); +	if (ecode != TDB_SUCCESS) { +		goto err; +	} + +	/* Try locking violation first.  We don't allow coalesce recursion! */ +	ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false); +	if (ecode != TDB_SUCCESS) { +		/* Need to drop lock.  Can't rely on anything stable. */ +		tdb->stats.alloc_coalesce_lockfail++; +		*protect = TDB_ERR_CORRUPT; + +		/* We have to drop this to avoid deadlocks, so make sure record +		 * doesn't get coalesced by someone else! 
*/ +		rec.ftable_and_len = (TDB_FTABLE_NONE +				      << (64 - TDB_OFF_UPPER_STEAL)) +			| (end - off - sizeof(struct tdb_used_record)); +		ecode = tdb_write_off(tdb, +				      off + offsetof(struct tdb_free_record, +						     ftable_and_len), +				      rec.ftable_and_len); +		if (ecode != TDB_SUCCESS) { +			goto err; +		} + +		tdb_unlock_free_bucket(tdb, b_off); + +		ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT, +					false); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} +	} else if (TDB_OFF_IS_ERR(*protect)) { +		/* For simplicity, we always drop lock if they can't continue */ +		tdb_unlock_free_bucket(tdb, b_off); +	} +	tdb->stats.alloc_coalesce_succeeded++; + +	/* Return usable length. */ +	return end - off - sizeof(struct tdb_used_record); + +err: +	/* To unify error paths, we *always* unlock bucket on error. */ +	tdb_unlock_free_bucket(tdb, b_off); +	return ecode; +} + +/* List is locked: we unlock it. */ +static enum TDB_ERROR coalesce_list(struct tdb_context *tdb, +				    tdb_off_t ftable_off, +				    tdb_off_t b_off, +				    unsigned int limit) +{ +	enum TDB_ERROR ecode; +	tdb_off_t off; + +	off = tdb_read_off(tdb, b_off); +	if (TDB_OFF_IS_ERR(off)) { +		ecode = off; +		goto unlock_err; +	} +	/* A little bit of paranoia: counter should be 0. */ +	off &= TDB_OFF_MASK; + +	while (off && limit--) { +		struct tdb_free_record rec; +		tdb_len_t coal; +		tdb_off_t next; + +		ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		next = rec.next; +		coal = coalesce(tdb, off, b_off, frec_len(&rec), &next); +		if (TDB_OFF_IS_ERR(coal)) { +			/* This has already unlocked on error. */ +			return coal; +		} +		if (TDB_OFF_IS_ERR(next)) { +			/* Coalescing had to unlock, so stop. */ +			return TDB_SUCCESS; +		} +		/* Keep going if we're doing well... 
*/ +		limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN); +		off = next; +	} + +	/* Now, move those elements to the tail of the list so we get something +	 * else next time. */ +	if (off) { +		struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec; +		tdb_off_t oldhoff, oldtoff, newtoff; + +		/* The record we were up to is the new head. */ +		ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		/* Get the new tail. */ +		newtoff = frec_prev(&newhrec); +		ecode = tdb_read_convert(tdb, newtoff, &newtrec, +					 sizeof(newtrec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		/* Get the old head. */ +		oldhoff = tdb_read_off(tdb, b_off); +		if (TDB_OFF_IS_ERR(oldhoff)) { +			ecode = oldhoff; +			goto unlock_err; +		} + +		/* This could happen if they all coalesced away. */ +		if (oldhoff == off) +			goto out; + +		ecode = tdb_read_convert(tdb, oldhoff, &oldhrec, +					 sizeof(oldhrec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		/* Get the old tail. */ +		oldtoff = frec_prev(&oldhrec); +		ecode = tdb_read_convert(tdb, oldtoff, &oldtrec, +					 sizeof(oldtrec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		/* Old tail's next points to old head. */ +		oldtrec.next = oldhoff; + +		/* Old head's prev points to old tail. */ +		oldhrec.magic_and_prev +			= (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)) +			| oldtoff; + +		/* New tail's next is 0. */ +		newtrec.next = 0; + +		/* Write out the modified versions. */ +		ecode = tdb_write_convert(tdb, oldtoff, &oldtrec, +					  sizeof(oldtrec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		ecode = tdb_write_convert(tdb, oldhoff, &oldhrec, +					  sizeof(oldhrec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		ecode = tdb_write_convert(tdb, newtoff, &newtrec, +					  sizeof(newtrec)); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; + +		/* And finally link in new head. 
*/ +		ecode = tdb_write_off(tdb, b_off, off); +		if (ecode != TDB_SUCCESS) +			goto unlock_err; +	} +out: +	tdb_unlock_free_bucket(tdb, b_off); +	return TDB_SUCCESS; + +unlock_err: +	tdb_unlock_free_bucket(tdb, b_off); +	return ecode; +} + +/* List must not be locked if coalesce_ok is set. */ +enum TDB_ERROR add_free_record(struct tdb_context *tdb, +			       tdb_off_t off, tdb_len_t len_with_header, +			       enum tdb_lock_flags waitflag, +			       bool coalesce) +{ +	tdb_off_t b_off; +	tdb_len_t len; +	enum TDB_ERROR ecode; + +	assert(len_with_header >= sizeof(struct tdb_free_record)); + +	len = len_with_header - sizeof(struct tdb_used_record); + +	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len)); +	ecode = tdb_lock_free_bucket(tdb, b_off, waitflag); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	ecode = enqueue_in_free(tdb, b_off, off, len, &coalesce); +	check_list(tdb, b_off); + +	/* Coalescing unlocks free list. */ +	if (!ecode && coalesce) +		ecode = coalesce_list(tdb, tdb->ftable_off, b_off, 2); +	else +		tdb_unlock_free_bucket(tdb, b_off); +	return ecode; +} + +static size_t adjust_size(size_t keylen, size_t datalen) +{ +	size_t size = keylen + datalen; + +	if (size < TDB_MIN_DATA_LEN) +		size = TDB_MIN_DATA_LEN; + +	/* Round to next uint64_t boundary. */ +	return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL); +} + +/* If we have enough left over to be useful, split that off. */ +static size_t record_leftover(size_t keylen, size_t datalen, +			      bool want_extra, size_t total_len) +{ +	ssize_t leftover; + +	if (want_extra) +		datalen += datalen / 2; +	leftover = total_len - adjust_size(keylen, datalen); + +	if (leftover < (ssize_t)sizeof(struct tdb_free_record)) +		return 0; + +	return leftover; +} + +/* We need size bytes to put our key and data in. 
*/ +static tdb_off_t lock_and_alloc(struct tdb_context *tdb, +				tdb_off_t ftable_off, +				tdb_off_t bucket, +				size_t keylen, size_t datalen, +				bool want_extra, +				unsigned magic, +				unsigned hashlow) +{ +	tdb_off_t off, b_off,best_off; +	struct tdb_free_record best = { 0 }; +	double multiplier; +	size_t size = adjust_size(keylen, datalen); +	enum TDB_ERROR ecode; + +	tdb->stats.allocs++; +	b_off = bucket_off(ftable_off, bucket); + +	/* FIXME: Try non-blocking wait first, to measure contention. */ +	/* Lock this bucket. */ +	ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	best.ftable_and_len = -1ULL; +	best_off = 0; + +	/* Get slack if we're after extra. */ +	if (want_extra) +		multiplier = 1.5; +	else +		multiplier = 1.0; + +	/* Walk the list to see if any are large enough, getting less fussy +	 * as we go. */ +	off = tdb_read_off(tdb, b_off); +	if (TDB_OFF_IS_ERR(off)) { +		ecode = off; +		goto unlock_err; +	} +	off &= TDB_OFF_MASK; + +	while (off) { +		const struct tdb_free_record *r; +		tdb_len_t len; +		tdb_off_t next; + +		r = tdb_access_read(tdb, off, sizeof(*r), true); +		if (TDB_PTR_IS_ERR(r)) { +			ecode = TDB_PTR_ERR(r); +			goto unlock_err; +		} + +		if (frec_magic(r) != TDB_FREE_MAGIC) { +			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +					   "lock_and_alloc:" +					   " %llu non-free 0x%llx", +					   (long long)off, +					   (long long)r->magic_and_prev); +			tdb_access_release(tdb, r); +			goto unlock_err; +		} + +		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) { +			best_off = off; +			best = *r; +		} + +		if (frec_len(&best) <= size * multiplier && best_off) { +			tdb_access_release(tdb, r); +			break; +		} + +		multiplier *= 1.01; + +		next = r->next; +		len = frec_len(r); +		tdb_access_release(tdb, r); +		off = next; +	} + +	/* If we found anything at all, use it. 
*/ +	if (best_off) { +		struct tdb_used_record rec; +		size_t leftover; + +		/* We're happy with this size: take it. */ +		ecode = remove_from_list(tdb, b_off, best_off, &best); +		check_list(tdb, b_off); +		if (ecode != TDB_SUCCESS) { +			goto unlock_err; +		} + +		leftover = record_leftover(keylen, datalen, want_extra, +					   frec_len(&best)); + +		assert(keylen + datalen + leftover <= frec_len(&best)); +		/* We need to mark non-free before we drop lock, otherwise +		 * coalesce() could try to merge it! */ +		ecode = set_header(tdb, &rec, magic, keylen, datalen, +				   frec_len(&best) - leftover, hashlow); +		if (ecode != TDB_SUCCESS) { +			goto unlock_err; +		} + +		ecode = tdb_write_convert(tdb, best_off, &rec, sizeof(rec)); +		if (ecode != TDB_SUCCESS) { +			goto unlock_err; +		} + +		/* For futureproofing, we put a 0 in any unused space. */ +		if (rec_extra_padding(&rec)) { +			ecode = tdb->methods->twrite(tdb, best_off + sizeof(rec) +						     + keylen + datalen, "", 1); +			if (ecode != TDB_SUCCESS) { +				goto unlock_err; +			} +		} + +		/* Bucket of leftover will be <= current bucket, so nested +		 * locking is allowed. */ +		if (leftover) { +			tdb->stats.alloc_leftover++; +			ecode = add_free_record(tdb, +						best_off + sizeof(rec) +						+ frec_len(&best) - leftover, +						leftover, TDB_LOCK_WAIT, false); +			if (ecode != TDB_SUCCESS) { +				best_off = ecode; +			} +		} +		tdb_unlock_free_bucket(tdb, b_off); + +		return best_off; +	} + +	tdb_unlock_free_bucket(tdb, b_off); +	return 0; + +unlock_err: +	tdb_unlock_free_bucket(tdb, b_off); +	return ecode; +} + +/* Get a free block from current free list, or 0 if none, -ve on error. */ +static tdb_off_t get_free(struct tdb_context *tdb, +			  size_t keylen, size_t datalen, bool want_extra, +			  unsigned magic, unsigned hashlow) +{ +	tdb_off_t off, ftable_off; +	tdb_off_t start_b, b, ftable; +	bool wrapped = false; + +	/* If they are growing, add 50% to get to higher bucket. 
*/ +	if (want_extra) +		start_b = size_to_bucket(adjust_size(keylen, +						     datalen + datalen / 2)); +	else +		start_b = size_to_bucket(adjust_size(keylen, datalen)); + +	ftable_off = tdb->ftable_off; +	ftable = tdb->ftable; +	while (!wrapped || ftable_off != tdb->ftable_off) { +		/* Start at exact size bucket, and search up... */ +		for (b = find_free_head(tdb, ftable_off, start_b); +		     b < TDB_FREE_BUCKETS; +		     b = find_free_head(tdb, ftable_off, b + 1)) { +			/* Try getting one from list. */ +			off = lock_and_alloc(tdb, ftable_off, +					     b, keylen, datalen, want_extra, +					     magic, hashlow); +			if (TDB_OFF_IS_ERR(off)) +				return off; +			if (off != 0) { +				if (b == start_b) +					tdb->stats.alloc_bucket_exact++; +				if (b == TDB_FREE_BUCKETS - 1) +					tdb->stats.alloc_bucket_max++; +				/* Worked?  Stay using this list. */ +				tdb->ftable_off = ftable_off; +				tdb->ftable = ftable; +				return off; +			} +			/* Didn't work.  Try next bucket. */ +		} + +		if (TDB_OFF_IS_ERR(b)) { +			return b; +		} + +		/* Hmm, try next table. */ +		ftable_off = next_ftable(tdb, ftable_off); +		if (TDB_OFF_IS_ERR(ftable_off)) { +			return ftable_off; +		} +		ftable++; + +		if (ftable_off == 0) { +			wrapped = true; +			ftable_off = first_ftable(tdb); +			if (TDB_OFF_IS_ERR(ftable_off)) { +				return ftable_off; +			} +			ftable = 0; +		} +	} + +	return 0; +} + +enum TDB_ERROR set_header(struct tdb_context *tdb, +			  struct tdb_used_record *rec, +			  unsigned magic, uint64_t keylen, uint64_t datalen, +			  uint64_t actuallen, unsigned hashlow) +{ +	uint64_t keybits = (fls64(keylen) + 1) / 2; + +	/* Use bottom bits of hash, so it's independent of hash table size. */ +	rec->magic_and_meta = (hashlow & ((1 << 11)-1)) +		| ((actuallen - (keylen + datalen)) << 11) +		| (keybits << 43) +		| ((uint64_t)magic << 48); +	rec->key_and_data_len = (keylen | (datalen << (keybits*2))); + +	/* Encoding can fail on big values. 
*/ +	if (rec_key_length(rec) != keylen +	    || rec_data_length(rec) != datalen +	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) { +		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				  "Could not encode k=%llu,d=%llu,a=%llu", +				  (long long)keylen, (long long)datalen, +				  (long long)actuallen); +	} +	return TDB_SUCCESS; +} + +/* Expand the database. */ +static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size) +{ +	uint64_t old_size, rec_size, map_size; +	tdb_len_t wanted; +	enum TDB_ERROR ecode; + +	/* Need to hold a hash lock to expand DB: transactions rely on it. */ +	if (!(tdb->flags & TDB_NOLOCK) +	    && !tdb->file->allrecord_lock.count && !tdb_has_hash_locks(tdb)) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_expand: must hold lock during expand"); +	} + +	/* Only one person can expand file at a time. */ +	ecode = tdb_lock_expand(tdb, F_WRLCK); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	/* Someone else may have expanded the file, so retry. */ +	old_size = tdb->file->map_size; +	tdb->methods->oob(tdb, tdb->file->map_size + 1, true); +	if (tdb->file->map_size != old_size) { +		tdb_unlock_expand(tdb, F_WRLCK); +		return TDB_SUCCESS; +	} + +	/* limit size in order to avoid using up huge amounts of memory for +	 * in memory tdbs if an oddball huge record creeps in */ +	if (size > 100 * 1024) { +		rec_size = size * 2; +	} else { +		rec_size = size * 100; +	} + +	/* always make room for at least rec_size more records, and at +	   least 25% more space. if the DB is smaller than 100MiB, +	   otherwise grow it by 10% only. */ +	if (old_size > 100 * 1024 * 1024) { +		map_size = old_size / 10; +	} else { +		map_size = old_size / 4; +	} + +	if (map_size > rec_size) { +		wanted = map_size; +	} else { +		wanted = rec_size; +	} + +	/* We need room for the record header too. 
*/ +	wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted); + +	ecode = tdb->methods->expand_file(tdb, wanted); +	if (ecode != TDB_SUCCESS) { +		tdb_unlock_expand(tdb, F_WRLCK); +		return ecode; +	} + +	/* We need to drop this lock before adding free record. */ +	tdb_unlock_expand(tdb, F_WRLCK); + +	tdb->stats.expands++; +	return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT, true); +} + +/* This won't fail: it will expand the database if it has to. */ +tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, +		uint64_t hash, unsigned magic, bool growing) +{ +	tdb_off_t off; + +	/* We can't hold pointers during this: we could unmap! */ +	assert(!tdb->direct_access); + +	for (;;) { +		enum TDB_ERROR ecode; +		off = get_free(tdb, keylen, datalen, growing, magic, hash); +		if (likely(off != 0)) +			break; + +		ecode = tdb_expand(tdb, adjust_size(keylen, datalen)); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} +	} + +	return off; +} diff --git a/lib/tdb2/hash.c b/lib/tdb2/hash.c new file mode 100644 index 0000000000..1359cfecd6 --- /dev/null +++ b/lib/tdb2/hash.c @@ -0,0 +1,881 @@ + /* +   Trivial Database 2: hash handling +   Copyright (C) Rusty Russell 2010 + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. + +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+*/ +#include "private.h" +#include <assert.h> + +uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len) +{ +	return tdb->hash_fn(ptr, len, tdb->hash_seed, tdb->hash_data); +} + +uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off) +{ +	const struct tdb_used_record *r; +	const void *key; +	uint64_t klen, hash; + +	r = tdb_access_read(tdb, off, sizeof(*r), true); +	if (TDB_PTR_IS_ERR(r)) { +		/* FIXME */ +		return 0; +	} + +	klen = rec_key_length(r); +	tdb_access_release(tdb, r); + +	key = tdb_access_read(tdb, off + sizeof(*r), klen, false); +	if (TDB_PTR_IS_ERR(key)) { +		return 0; +	} + +	hash = tdb_hash(tdb, key, klen); +	tdb_access_release(tdb, key); +	return hash; +} + +/* Get bits from a value. */ +static uint32_t bits_from(uint64_t val, unsigned start, unsigned num) +{ +	assert(num <= 32); +	return (val >> start) & ((1U << num) - 1); +} + +/* We take bits from the top: that way we can lock whole sections of the hash + * by using lock ranges. */ +static uint32_t use_bits(struct hash_info *h, unsigned num) +{ +	h->hash_used += num; +	return bits_from(h->h, 64 - h->hash_used, num); +} + +static tdb_bool_err key_matches(struct tdb_context *tdb, +				const struct tdb_used_record *rec, +				tdb_off_t off, +				const struct tdb_data *key) +{ +	tdb_bool_err ret = false; +	const char *rkey; + +	if (rec_key_length(rec) != key->dsize) { +		tdb->stats.compare_wrong_keylen++; +		return ret; +	} + +	rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false); +	if (TDB_PTR_IS_ERR(rkey)) { +		return TDB_PTR_ERR(rkey); +	} +	if (memcmp(rkey, key->dptr, key->dsize) == 0) +		ret = true; +	else +		tdb->stats.compare_wrong_keycmp++; +	tdb_access_release(tdb, rkey); +	return ret; +} + +/* Does entry match? 
*/ +static tdb_bool_err match(struct tdb_context *tdb, +			  struct hash_info *h, +			  const struct tdb_data *key, +			  tdb_off_t val, +			  struct tdb_used_record *rec) +{ +	tdb_off_t off; +	enum TDB_ERROR ecode; + +	tdb->stats.compares++; +	/* Desired bucket must match. */ +	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) { +		tdb->stats.compare_wrong_bucket++; +		return false; +	} + +	/* Top bits of offset == next bits of hash. */ +	if (bits_from(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA) +	    != bits_from(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA, +		    TDB_OFF_UPPER_STEAL_EXTRA)) { +		tdb->stats.compare_wrong_offsetbits++; +		return false; +	} + +	off = val & TDB_OFF_MASK; +	ecode = tdb_read_convert(tdb, off, rec, sizeof(*rec)); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) { +		tdb->stats.compare_wrong_rechash++; +		return false; +	} + +	return key_matches(tdb, rec, off, key); +} + +static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket) +{ +	return group_start +		+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t); +} + +bool is_subhash(tdb_off_t val) +{ +	return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1; +} + +/* FIXME: Guess the depth, don't over-lock! */ +static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size) +{ +	*size = 1ULL << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); +	return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); +} + +static tdb_off_t COLD find_in_chain(struct tdb_context *tdb, +				    struct tdb_data key, +				    tdb_off_t chain, +				    struct hash_info *h, +				    struct tdb_used_record *rec, +				    struct traverse_info *tinfo) +{ +	tdb_off_t off, next; +	enum TDB_ERROR ecode; + +	/* In case nothing is free, we set these to zero. 
*/ +	h->home_bucket = h->found_bucket = 0; + +	for (off = chain; off; off = next) { +		unsigned int i; + +		h->group_start = off; +		ecode = tdb_read_convert(tdb, off, h->group, sizeof(h->group)); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} + +		for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { +			tdb_off_t recoff; +			if (!h->group[i]) { +				/* Remember this empty bucket. */ +				h->home_bucket = h->found_bucket = i; +				continue; +			} + +			/* We can insert extra bits via add_to_hash +			 * empty bucket logic. */ +			recoff = h->group[i] & TDB_OFF_MASK; +			ecode = tdb_read_convert(tdb, recoff, rec, +						 sizeof(*rec)); +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} + +			ecode = key_matches(tdb, rec, recoff, &key); +			if (ecode < 0) { +				return ecode; +			} +			if (ecode == 1) { +				h->home_bucket = h->found_bucket = i; + +				if (tinfo) { +					tinfo->levels[tinfo->num_levels] +						.hashtable = off; +					tinfo->levels[tinfo->num_levels] +						.total_buckets +						= 1 << TDB_HASH_GROUP_BITS; +					tinfo->levels[tinfo->num_levels].entry +						= i; +					tinfo->num_levels++; +				} +				return recoff; +			} +		} +		next = tdb_read_off(tdb, off +				    + offsetof(struct tdb_chain, next)); +		if (TDB_OFF_IS_ERR(next)) { +			return next; +		} +		if (next) +			next += sizeof(struct tdb_used_record); +	} +	return 0; +} + +/* This is the core routine which searches the hashtable for an entry. + * On error, no locks are held and -ve is returned. + * Otherwise, hinfo is filled in (and the optional tinfo). + * If not found, the return value is 0. + * If found, the return value is the offset, and *rec is the record. 
*/ +tdb_off_t find_and_lock(struct tdb_context *tdb, +			struct tdb_data key, +			int ltype, +			struct hash_info *h, +			struct tdb_used_record *rec, +			struct traverse_info *tinfo) +{ +	uint32_t i, group; +	tdb_off_t hashtable; +	enum TDB_ERROR ecode; + +	h->h = tdb_hash(tdb, key.dptr, key.dsize); +	h->hash_used = 0; +	group = use_bits(h, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS); +	h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS); + +	h->hlock_start = hlock_range(group, &h->hlock_range); +	ecode = tdb_lock_hashes(tdb, h->hlock_start, h->hlock_range, ltype, +				TDB_LOCK_WAIT); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	hashtable = offsetof(struct tdb_header, hashtable); +	if (tinfo) { +		tinfo->toplevel_group = group; +		tinfo->num_levels = 1; +		tinfo->levels[0].entry = 0; +		tinfo->levels[0].hashtable = hashtable +			+ (group << TDB_HASH_GROUP_BITS) * sizeof(tdb_off_t); +		tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS; +	} + +	while (h->hash_used <= 64) { +		/* Read in the hash group. */ +		h->group_start = hashtable +			+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); + +		ecode = tdb_read_convert(tdb, h->group_start, &h->group, +					 sizeof(h->group)); +		if (ecode != TDB_SUCCESS) { +			goto fail; +		} + +		/* Pointer to another hash table?  Go down... 
*/ +		if (is_subhash(h->group[h->home_bucket])) { +			hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK) +				+ sizeof(struct tdb_used_record); +			if (tinfo) { +				/* When we come back, use *next* bucket */ +				tinfo->levels[tinfo->num_levels-1].entry +					+= h->home_bucket + 1; +			} +			group = use_bits(h, TDB_SUBLEVEL_HASH_BITS +					 - TDB_HASH_GROUP_BITS); +			h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS); +			if (tinfo) { +				tinfo->levels[tinfo->num_levels].hashtable +					= hashtable; +				tinfo->levels[tinfo->num_levels].total_buckets +					= 1 << TDB_SUBLEVEL_HASH_BITS; +				tinfo->levels[tinfo->num_levels].entry +					= group << TDB_HASH_GROUP_BITS; +				tinfo->num_levels++; +			} +			continue; +		} + +		/* It's in this group: search (until 0 or all searched) */ +		for (i = 0, h->found_bucket = h->home_bucket; +		     i < (1 << TDB_HASH_GROUP_BITS); +		     i++, h->found_bucket = ((h->found_bucket+1) +					     % (1 << TDB_HASH_GROUP_BITS))) { +			tdb_bool_err berr; +			if (is_subhash(h->group[h->found_bucket])) +				continue; + +			if (!h->group[h->found_bucket]) +				break; + +			berr = match(tdb, h, &key, h->group[h->found_bucket], +				     rec); +			if (berr < 0) { +				ecode = berr; +				goto fail; +			} +			if (berr) { +				if (tinfo) { +					tinfo->levels[tinfo->num_levels-1].entry +						+= h->found_bucket; +				} +				return h->group[h->found_bucket] & TDB_OFF_MASK; +			} +		} +		/* Didn't find it: h indicates where it would go. */ +		return 0; +	} + +	return find_in_chain(tdb, key, hashtable, h, rec, tinfo); + +fail: +	tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype); +	return ecode; +} + +/* I wrote a simple test, expanding a hash to 2GB, for the following + * cases: + * 1) Expanding all the buckets at once, + * 2) Expanding the bucket we wanted to place the new entry into. + * 3) Expanding the most-populated bucket, + * + * I measured the worst/average/best density during this process. 
+ * 1) 3%/16%/30% + * 2) 4%/20%/38% + * 3) 6%/22%/41% + * + * So we figure out the busiest bucket for the moment. + */ +static unsigned fullest_bucket(struct tdb_context *tdb, +			       const tdb_off_t *group, +			       unsigned new_bucket) +{ +	unsigned counts[1 << TDB_HASH_GROUP_BITS] = { 0 }; +	unsigned int i, best_bucket; + +	/* Count the new entry. */ +	counts[new_bucket]++; +	best_bucket = new_bucket; + +	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { +		unsigned this_bucket; + +		if (is_subhash(group[i])) +			continue; +		this_bucket = group[i] & TDB_OFF_HASH_GROUP_MASK; +		if (++counts[this_bucket] > counts[best_bucket]) +			best_bucket = this_bucket; +	} + +	return best_bucket; +} + +static bool put_into_group(tdb_off_t *group, +			   unsigned bucket, tdb_off_t encoded) +{ +	unsigned int i; + +	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { +		unsigned b = (bucket + i) % (1 << TDB_HASH_GROUP_BITS); + +		if (group[b] == 0) { +			group[b] = encoded; +			return true; +		} +	} +	return false; +} + +static void force_into_group(tdb_off_t *group, +			     unsigned bucket, tdb_off_t encoded) +{ +	if (!put_into_group(group, bucket, encoded)) +		abort(); +} + +static tdb_off_t encode_offset(tdb_off_t new_off, struct hash_info *h) +{ +	return h->home_bucket +		| new_off +		| ((uint64_t)bits_from(h->h, +				  64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA, +				  TDB_OFF_UPPER_STEAL_EXTRA) +		   << TDB_OFF_HASH_EXTRA_BIT); +} + +/* Simply overwrite the hash entry we found before. */ +enum TDB_ERROR replace_in_hash(struct tdb_context *tdb, +			       struct hash_info *h, +			       tdb_off_t new_off) +{ +	return tdb_write_off(tdb, hbucket_off(h->group_start, h->found_bucket), +			     encode_offset(new_off, h)); +} + +/* We slot in anywhere that's empty in the chain. 
*/
+/* Walks the chain starting at subhash: the first zero slot among the
+ * 1 << TDB_HASH_GROUP_BITS entries of a link receives new_off.  When a link
+ * is full its next field is followed; if next is zero a fresh, zeroed
+ * struct tdb_chain is allocated and linked in first.
+ * Returns TDB_SUCCESS or the first error met.
+ *
+ * NOTE(review): a new link is zeroed at next + sizeof(struct tdb_used_record)
+ * and iterate_hash() adds that header size after reading next, yet here the
+ * walk continues at the bare next value.  Looks inconsistent - confirm
+ * against alloc() and struct tdb_chain before relying on multi-link chains. */
+static enum TDB_ERROR COLD add_to_chain(struct tdb_context *tdb,
+					tdb_off_t subhash,
+					tdb_off_t new_off)
+{
+	for (;;) {
+		tdb_off_t slot, link;
+		enum TDB_ERROR err;
+
+		slot = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
+		if (TDB_OFF_IS_ERR(slot))
+			return slot;
+
+		/* Free slot in this link: store and finish. */
+		if (slot != 1 << TDB_HASH_GROUP_BITS)
+			return tdb_write_off(tdb,
+					     subhash + slot * sizeof(tdb_off_t),
+					     new_off);
+
+		/* Link is full: move on to (or create) the next one. */
+		link = tdb_read_off(tdb, subhash
+				    + offsetof(struct tdb_chain, next));
+		if (TDB_OFF_IS_ERR(link))
+			return link;
+
+		if (!link) {
+			link = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
+				     TDB_CHAIN_MAGIC, false);
+			if (TDB_OFF_IS_ERR(link))
+				return link;
+
+			err = zero_out(tdb,
+				       link+sizeof(struct tdb_used_record),
+				       sizeof(struct tdb_chain));
+			if (err != TDB_SUCCESS)
+				return err;
+
+			err = tdb_write_off(tdb, subhash
+					    + offsetof(struct tdb_chain, next),
+					    link);
+			if (err != TDB_SUCCESS)
+				return err;
+		}
+		subhash = link;
+	}
+}
+
+/* Add into a newly created subhash.
*/ +static enum TDB_ERROR add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash, +				     unsigned hash_used, tdb_off_t val) +{ +	tdb_off_t off = (val & TDB_OFF_MASK), *group; +	struct hash_info h; +	unsigned int gnum; + +	h.hash_used = hash_used; + +	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64) +		return add_to_chain(tdb, subhash, off); + +	h.h = hash_record(tdb, off); +	gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS); +	h.group_start = subhash +		+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); +	h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS); + +	group = tdb_access_write(tdb, h.group_start, +				 sizeof(*group) << TDB_HASH_GROUP_BITS, true); +	if (TDB_PTR_IS_ERR(group)) { +		return TDB_PTR_ERR(group); +	} +	force_into_group(group, h.home_bucket, encode_offset(off, &h)); +	return tdb_access_commit(tdb, group); +} + +static enum TDB_ERROR expand_group(struct tdb_context *tdb, struct hash_info *h) +{ +	unsigned bucket, num_vals, i, magic; +	size_t subsize; +	tdb_off_t subhash; +	tdb_off_t vals[1 << TDB_HASH_GROUP_BITS]; +	enum TDB_ERROR ecode; + +	/* Attach new empty subhash under fullest bucket. */ +	bucket = fullest_bucket(tdb, h->group, h->home_bucket); + +	if (h->hash_used == 64) { +		tdb->stats.alloc_chain++; +		subsize = sizeof(struct tdb_chain); +		magic = TDB_CHAIN_MAGIC; +	} else { +		tdb->stats.alloc_subhash++; +		subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS); +		magic = TDB_HTABLE_MAGIC; +	} + +	subhash = alloc(tdb, 0, subsize, 0, magic, false); +	if (TDB_OFF_IS_ERR(subhash)) { +		return subhash; +	} + +	ecode = zero_out(tdb, subhash + sizeof(struct tdb_used_record), +			 subsize); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	/* Remove any which are destined for bucket or are in wrong place. 
*/ +	num_vals = 0; +	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { +		unsigned home_bucket = h->group[i] & TDB_OFF_HASH_GROUP_MASK; +		if (!h->group[i] || is_subhash(h->group[i])) +			continue; +		if (home_bucket == bucket || home_bucket != i) { +			vals[num_vals++] = h->group[i]; +			h->group[i] = 0; +		} +	} +	/* FIXME: This assert is valid, but we do this during unit test :( */ +	/* assert(num_vals); */ + +	/* Overwrite expanded bucket with subhash pointer. */ +	h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT); + +	/* Point to actual contents of record. */ +	subhash += sizeof(struct tdb_used_record); + +	/* Put values back. */ +	for (i = 0; i < num_vals; i++) { +		unsigned this_bucket = vals[i] & TDB_OFF_HASH_GROUP_MASK; + +		if (this_bucket == bucket) { +			ecode = add_to_subhash(tdb, subhash, h->hash_used, +					       vals[i]); +			if (ecode != TDB_SUCCESS) +				return ecode; +		} else { +			/* There should be room to put this back. */ +			force_into_group(h->group, this_bucket, vals[i]); +		} +	} +	return TDB_SUCCESS; +} + +enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h) +{ +	unsigned int i, num_movers = 0; +	tdb_off_t movers[1 << TDB_HASH_GROUP_BITS]; + +	h->group[h->found_bucket] = 0; +	for (i = 1; i < (1 << TDB_HASH_GROUP_BITS); i++) { +		unsigned this_bucket; + +		this_bucket = (h->found_bucket+i) % (1 << TDB_HASH_GROUP_BITS); +		/* Empty bucket?  We're done. */ +		if (!h->group[this_bucket]) +			break; + +		/* Ignore subhashes. */ +		if (is_subhash(h->group[this_bucket])) +			continue; + +		/* If this one is not happy where it is, we'll move it. */ +		if ((h->group[this_bucket] & TDB_OFF_HASH_GROUP_MASK) +		    != this_bucket) { +			movers[num_movers++] = h->group[this_bucket]; +			h->group[this_bucket] = 0; +		} +	} + +	/* Put back the ones we erased. 
*/ +	for (i = 0; i < num_movers; i++) { +		force_into_group(h->group, movers[i] & TDB_OFF_HASH_GROUP_MASK, +				 movers[i]); +	} + +	/* Now we write back the hash group */ +	return tdb_write_convert(tdb, h->group_start, +				 h->group, sizeof(h->group)); +} + +enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h, +			   tdb_off_t new_off) +{ +	enum TDB_ERROR ecode; + +	/* We hit an empty bucket during search?  That's where it goes. */ +	if (!h->group[h->found_bucket]) { +		h->group[h->found_bucket] = encode_offset(new_off, h); +		/* Write back the modified group. */ +		return tdb_write_convert(tdb, h->group_start, +					 h->group, sizeof(h->group)); +	} + +	if (h->hash_used > 64) +		return add_to_chain(tdb, h->group_start, new_off); + +	/* We're full.  Expand. */ +	ecode = expand_group(tdb, h); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (is_subhash(h->group[h->home_bucket])) { +		/* We were expanded! */ +		tdb_off_t hashtable; +		unsigned int gnum; + +		/* Write back the modified group. */ +		ecode = tdb_write_convert(tdb, h->group_start, h->group, +					  sizeof(h->group)); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} + +		/* Move hashinfo down a level. */ +		hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK) +			+ sizeof(struct tdb_used_record); +		gnum = use_bits(h,TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS); +		h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS); +		h->group_start = hashtable +			+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); +		ecode = tdb_read_convert(tdb, h->group_start, &h->group, +					 sizeof(h->group)); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} +	} + +	/* Expanding the group must have made room if it didn't choose this +	 * bucket. 
*/ +	if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){ +		return tdb_write_convert(tdb, h->group_start, +					 h->group, sizeof(h->group)); +	} + +	/* This can happen if all hashes in group (and us) dropped into same +	 * group in subhash. */ +	return add_to_hash(tdb, h, new_off); +} + +/* Traverse support: returns offset of record, or 0 or -ve error. */ +static tdb_off_t iterate_hash(struct tdb_context *tdb, +			      struct traverse_info *tinfo) +{ +	tdb_off_t off, val, i; +	struct traverse_level *tlevel; + +	tlevel = &tinfo->levels[tinfo->num_levels-1]; + +again: +	for (i = tdb_find_nonzero_off(tdb, tlevel->hashtable, +				      tlevel->entry, tlevel->total_buckets); +	     i != tlevel->total_buckets; +	     i = tdb_find_nonzero_off(tdb, tlevel->hashtable, +				      i+1, tlevel->total_buckets)) { +		if (TDB_OFF_IS_ERR(i)) { +			return i; +		} + +		val = tdb_read_off(tdb, tlevel->hashtable+sizeof(tdb_off_t)*i); +		if (TDB_OFF_IS_ERR(val)) { +			return val; +		} + +		off = val & TDB_OFF_MASK; + +		/* This makes the delete-all-in-traverse case work +		 * (and simplifies our logic a little). */ +		if (off == tinfo->prev) +			continue; + +		tlevel->entry = i; + +		if (!is_subhash(val)) { +			/* Found one. */ +			tinfo->prev = off; +			return off; +		} + +		/* When we come back, we want the next one */ +		tlevel->entry++; +		tinfo->num_levels++; +		tlevel++; +		tlevel->hashtable = off + sizeof(struct tdb_used_record); +		tlevel->entry = 0; +		/* Next level is a chain? */ +		if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) +			tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS); +		else +			tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS); +		goto again; +	} + +	/* Nothing there? */ +	if (tinfo->num_levels == 1) +		return 0; + +	/* Handle chained entries. 
*/ +	if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) { +		tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable +						 + offsetof(struct tdb_chain, +							    next)); +		if (TDB_OFF_IS_ERR(tlevel->hashtable)) { +			return tlevel->hashtable; +		} +		if (tlevel->hashtable) { +			tlevel->hashtable += sizeof(struct tdb_used_record); +			tlevel->entry = 0; +			goto again; +		} +	} + +	/* Go back up and keep searching. */ +	tinfo->num_levels--; +	tlevel--; +	goto again; +} + +/* Return success if we find something, TDB_ERR_NOEXIST if none. */ +enum TDB_ERROR next_in_hash(struct tdb_context *tdb, +			    struct traverse_info *tinfo, +			    TDB_DATA *kbuf, size_t *dlen) +{ +	const unsigned group_bits = TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS; +	tdb_off_t hl_start, hl_range, off; +	enum TDB_ERROR ecode; + +	while (tinfo->toplevel_group < (1 << group_bits)) { +		hl_start = (tdb_off_t)tinfo->toplevel_group +			<< (64 - group_bits); +		hl_range = 1ULL << group_bits; +		ecode = tdb_lock_hashes(tdb, hl_start, hl_range, F_RDLCK, +					TDB_LOCK_WAIT); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} + +		off = iterate_hash(tdb, tinfo); +		if (off) { +			struct tdb_used_record rec; + +			if (TDB_OFF_IS_ERR(off)) { +				ecode = off; +				goto fail; +			} + +			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); +			if (ecode != TDB_SUCCESS) { +				goto fail; +			} +			if (rec_magic(&rec) != TDB_USED_MAGIC) { +				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, +						   TDB_LOG_ERROR, +						   "next_in_hash:" +						   " corrupt record at %llu", +						   (long long)off); +				goto fail; +			} + +			kbuf->dsize = rec_key_length(&rec); + +			/* They want data as well? 
*/ +			if (dlen) { +				*dlen = rec_data_length(&rec); +				kbuf->dptr = tdb_alloc_read(tdb, +							    off + sizeof(rec), +							    kbuf->dsize +							    + *dlen); +			} else { +				kbuf->dptr = tdb_alloc_read(tdb, +							    off + sizeof(rec), +							    kbuf->dsize); +			} +			tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK); +			if (TDB_PTR_IS_ERR(kbuf->dptr)) { +				return TDB_PTR_ERR(kbuf->dptr); +			} +			return TDB_SUCCESS; +		} + +		tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK); + +		tinfo->toplevel_group++; +		tinfo->levels[0].hashtable +			+= (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); +		tinfo->levels[0].entry = 0; +	} +	return TDB_ERR_NOEXIST; + +fail: +	tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK); +	return ecode; + +} + +enum TDB_ERROR first_in_hash(struct tdb_context *tdb, +			     struct traverse_info *tinfo, +			     TDB_DATA *kbuf, size_t *dlen) +{ +	tinfo->prev = 0; +	tinfo->toplevel_group = 0; +	tinfo->num_levels = 1; +	tinfo->levels[0].hashtable = offsetof(struct tdb_header, hashtable); +	tinfo->levels[0].entry = 0; +	tinfo->levels[0].total_buckets = (1 << TDB_HASH_GROUP_BITS); + +	return next_in_hash(tdb, tinfo, kbuf, dlen); +} + +/* Even if the entry isn't in this hash bucket, you'd have to lock this + * bucket to find it. */ +static enum TDB_ERROR chainlock(struct tdb_context *tdb, const TDB_DATA *key, +				int ltype, enum tdb_lock_flags waitflag, +				const char *func) +{ +	enum TDB_ERROR ecode; +	uint64_t h = tdb_hash(tdb, key->dptr, key->dsize); +	tdb_off_t lockstart, locksize; +	unsigned int group, gbits; + +	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS; +	group = bits_from(h, 64 - gbits, gbits); + +	lockstart = hlock_range(group, &locksize); + +	ecode = tdb_lock_hashes(tdb, lockstart, locksize, ltype, waitflag); +	tdb_trace_1rec(tdb, func, *key); +	return ecode; +} + +/* lock/unlock one hash chain. 
This is meant to be used to reduce +   contention - it cannot guarantee how many records will be locked */ +enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key) +{ +	return tdb->last_error = chainlock(tdb, &key, F_WRLCK, TDB_LOCK_WAIT, +					   "tdb_chainlock"); +} + +void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key) +{ +	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); +	tdb_off_t lockstart, locksize; +	unsigned int group, gbits; + +	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS; +	group = bits_from(h, 64 - gbits, gbits); + +	lockstart = hlock_range(group, &locksize); + +	tdb_trace_1rec(tdb, "tdb_chainunlock", key); +	tdb_unlock_hashes(tdb, lockstart, locksize, F_WRLCK); +} + +enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key) +{ +	return tdb->last_error = chainlock(tdb, &key, F_RDLCK, TDB_LOCK_WAIT, +					   "tdb_chainlock_read"); +} + +void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key) +{ +	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); +	tdb_off_t lockstart, locksize; +	unsigned int group, gbits; + +	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS; +	group = bits_from(h, 64 - gbits, gbits); + +	lockstart = hlock_range(group, &locksize); + +	tdb_trace_1rec(tdb, "tdb_chainunlock_read", key); +	tdb_unlock_hashes(tdb, lockstart, locksize, F_RDLCK); +} diff --git a/lib/tdb2/io.c b/lib/tdb2/io.c new file mode 100644 index 0000000000..8c5f45f308 --- /dev/null +++ b/lib/tdb2/io.c @@ -0,0 +1,615 @@ + /* +   Unix SMB/CIFS implementation. + +   trivial database library + +   Copyright (C) Andrew Tridgell              1999-2005 +   Copyright (C) Paul `Rusty' Russell		   2000 +   Copyright (C) Jeremy Allison			   2000-2003 +   Copyright (C) Rusty Russell			   2010 + +     ** NOTE! The following LGPL license applies to the tdb +     ** library. 
This does NOT imply that all of Samba is released +     ** under the LGPL + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. + +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ +#include "private.h" +#include <assert.h> +#include <ccan/likely/likely.h> + +void tdb_munmap(struct tdb_file *file) +{ +	if (file->fd == -1) +		return; + +	if (file->map_ptr) { +		munmap(file->map_ptr, file->map_size); +		file->map_ptr = NULL; +	} +} + +void tdb_mmap(struct tdb_context *tdb) +{ +	if (tdb->flags & TDB_INTERNAL) +		return; + +	if (tdb->flags & TDB_NOMMAP) +		return; + +	/* size_t can be smaller than off_t. */ +	if ((size_t)tdb->file->map_size == tdb->file->map_size) { +		tdb->file->map_ptr = mmap(NULL, tdb->file->map_size, +					  tdb->mmap_flags, +					  MAP_SHARED, tdb->file->fd, 0); +	} else +		tdb->file->map_ptr = MAP_FAILED; + +	/* +	 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!! 
+	 */ +	if (tdb->file->map_ptr == MAP_FAILED) { +		tdb->file->map_ptr = NULL; +		tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, +			   "tdb_mmap failed for size %lld (%s)", +			   (long long)tdb->file->map_size, strerror(errno)); +	} +} + +/* check for an out of bounds access - if it is out of bounds then +   see if the database has been expanded by someone else and expand +   if necessary +   note that "len" is the minimum length needed for the db +*/ +static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len, +			      bool probe) +{ +	struct stat st; +	enum TDB_ERROR ecode; + +	/* We can't hold pointers during this: we could unmap! */ +	assert(!tdb->direct_access +	       || (tdb->flags & TDB_NOLOCK) +	       || tdb_has_expansion_lock(tdb)); + +	if (len <= tdb->file->map_size) +		return 0; +	if (tdb->flags & TDB_INTERNAL) { +		if (!probe) { +			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				 "tdb_oob len %lld beyond internal" +				 " malloc size %lld", +				 (long long)len, +				 (long long)tdb->file->map_size); +		} +		return TDB_ERR_IO; +	} + +	ecode = tdb_lock_expand(tdb, F_RDLCK); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (fstat(tdb->file->fd, &st) != 0) { +		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +			   "Failed to fstat file: %s", strerror(errno)); +		tdb_unlock_expand(tdb, F_RDLCK); +		return TDB_ERR_IO; +	} + +	tdb_unlock_expand(tdb, F_RDLCK); + +	if (st.st_size < (size_t)len) { +		if (!probe) { +			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				   "tdb_oob len %zu beyond eof at %zu", +				   (size_t)len, st.st_size); +		} +		return TDB_ERR_IO; +	} + +	/* Unmap, update size, remap */ +	tdb_munmap(tdb->file); + +	tdb->file->map_size = st.st_size; +	tdb_mmap(tdb); +	return TDB_SUCCESS; +} + +/* Endian conversion: we only ever deal with 8 byte quantities */ +void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size) +{ +	assert(size % 8 == 0); +	if (unlikely((tdb->flags & TDB_CONVERT)) && buf) { +		uint64_t i, 
*p = (uint64_t *)buf; +		for (i = 0; i < size / 8; i++) +			p[i] = bswap_64(p[i]); +	} +	return buf; +} + +/* Return first non-zero offset in offset array, or end, or -ve error. */ +/* FIXME: Return the off? */ +uint64_t tdb_find_nonzero_off(struct tdb_context *tdb, +			      tdb_off_t base, uint64_t start, uint64_t end) +{ +	uint64_t i; +	const uint64_t *val; + +	/* Zero vs non-zero is the same unconverted: minor optimization. */ +	val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t), +			      (end - start) * sizeof(tdb_off_t), false); +	if (TDB_PTR_IS_ERR(val)) { +		return TDB_PTR_ERR(val); +	} + +	for (i = 0; i < (end - start); i++) { +		if (val[i]) +			break; +	} +	tdb_access_release(tdb, val); +	return start + i; +} + +/* Return first zero offset in num offset array, or num, or -ve error. */ +uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off, +			   uint64_t num) +{ +	uint64_t i; +	const uint64_t *val; + +	/* Zero vs non-zero is the same unconverted: minor optimization. */ +	val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false); +	if (TDB_PTR_IS_ERR(val)) { +		return TDB_PTR_ERR(val); +	} + +	for (i = 0; i < num; i++) { +		if (!val[i]) +			break; +	} +	tdb_access_release(tdb, val); +	return i; +} + +enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len) +{ +	char buf[8192] = { 0 }; +	void *p = tdb->methods->direct(tdb, off, len, true); +	enum TDB_ERROR ecode = TDB_SUCCESS; + +	assert(!tdb->read_only); +	if (TDB_PTR_IS_ERR(p)) { +		return TDB_PTR_ERR(p); +	} +	if (p) { +		memset(p, 0, len); +		return ecode; +	} +	while (len) { +		unsigned todo = len < sizeof(buf) ? 
len : sizeof(buf); +		ecode = tdb->methods->twrite(tdb, off, buf, todo); +		if (ecode != TDB_SUCCESS) { +			break; +		} +		len -= todo; +		off += todo; +	} +	return ecode; +} + +tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off) +{ +	tdb_off_t ret; +	enum TDB_ERROR ecode; + +	if (likely(!(tdb->flags & TDB_CONVERT))) { +		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p), +						    false); +		if (TDB_PTR_IS_ERR(p)) { +			return TDB_PTR_ERR(p); +		} +		if (p) +			return *p; +	} + +	ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret)); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} +	return ret; +} + +/* write a lump of data at a specified offset */ +static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off, +				const void *buf, tdb_len_t len) +{ +	enum TDB_ERROR ecode; + +	if (tdb->read_only) { +		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, +				  "Write to read-only database"); +	} + +	ecode = tdb->methods->oob(tdb, off + len, 0); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (tdb->file->map_ptr) { +		memcpy(off + (char *)tdb->file->map_ptr, buf, len); +	} else { +		ssize_t ret; +		ret = pwrite(tdb->file->fd, buf, len, off); +		if (ret != len) { +			/* This shouldn't happen: we avoid sparse files. 
*/ +			if (ret >= 0) +				errno = ENOSPC; + +			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +					  "tdb_write: %zi at %zu len=%zu (%s)", +					  ret, (size_t)off, (size_t)len, +					  strerror(errno)); +		} +	} +	return TDB_SUCCESS; +} + +/* read a lump of data at a specified offset */ +static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off, +			       void *buf, tdb_len_t len) +{ +	enum TDB_ERROR ecode; + +	ecode = tdb->methods->oob(tdb, off + len, 0); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (tdb->file->map_ptr) { +		memcpy(buf, off + (char *)tdb->file->map_ptr, len); +	} else { +		ssize_t r = pread(tdb->file->fd, buf, len, off); +		if (r != len) { +			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +					  "tdb_read failed with %zi at %zu " +					  "len=%zu (%s) map_size=%zu", +					  r, (size_t)off, (size_t)len, +					  strerror(errno), +					  (size_t)tdb->file->map_size); +		} +	} +	return TDB_SUCCESS; +} + +enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, +				 const void *rec, size_t len) +{ +	enum TDB_ERROR ecode; + +	if (unlikely((tdb->flags & TDB_CONVERT))) { +		void *conv = malloc(len); +		if (!conv) { +			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +					  "tdb_write: no memory converting" +					  " %zu bytes", len); +		} +		memcpy(conv, rec, len); +		ecode = tdb->methods->twrite(tdb, off, +					   tdb_convert(tdb, conv, len), len); +		free(conv); +	} else { +		ecode = tdb->methods->twrite(tdb, off, rec, len); +	} +	return ecode; +} + +enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, +				void *rec, size_t len) +{ +	enum TDB_ERROR ecode = tdb->methods->tread(tdb, off, rec, len); +	tdb_convert(tdb, rec, len); +	return ecode; +} + +enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, +			     tdb_off_t off, tdb_off_t val) +{ +	if (tdb->read_only) { +		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, +				  "Write to read-only database"); +	} + +	if 
(likely(!(tdb->flags & TDB_CONVERT))) { +		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p), +						    true); +		if (TDB_PTR_IS_ERR(p)) { +			return TDB_PTR_ERR(p); +		} +		if (p) { +			*p = val; +			return TDB_SUCCESS; +		} +	} +	return tdb_write_convert(tdb, off, &val, sizeof(val)); +} + +static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, +			     tdb_len_t len, unsigned int prefix) +{ +	unsigned char *buf; +	enum TDB_ERROR ecode; + +	/* some systems don't like zero length malloc */ +	buf = malloc(prefix + len ? prefix + len : 1); +	if (!buf) { +		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR, +			   "tdb_alloc_read malloc failed len=%zu", +			   (size_t)(prefix + len)); +		return TDB_ERR_PTR(TDB_ERR_OOM); +	} else { +		ecode = tdb->methods->tread(tdb, offset, buf+prefix, len); +		if (unlikely(ecode != TDB_SUCCESS)) { +			free(buf); +			return TDB_ERR_PTR(ecode); +		} +	} +	return buf; +} + +/* read a lump of data, allocating the space for it */ +void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len) +{ +	return _tdb_alloc_read(tdb, offset, len, 0); +} + +static enum TDB_ERROR fill(struct tdb_context *tdb, +			   const void *buf, size_t size, +			   tdb_off_t off, tdb_len_t len) +{ +	while (len) { +		size_t n = len > size ? size : len; +		ssize_t ret = pwrite(tdb->file->fd, buf, n, off); +		if (ret != n) { +			if (ret >= 0) +				errno = ENOSPC; + +			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +					  "fill failed:" +					  " %zi at %zu len=%zu (%s)", +					  ret, (size_t)off, (size_t)len, +					  strerror(errno)); +		} +		len -= n; +		off += n; +	} +	return TDB_SUCCESS; +} + +/* expand a file.  
we prefer to use ftruncate, as that is what posix +  says to use for mmap expansion */ +static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb, +				      tdb_len_t addition) +{ +	char buf[8192]; +	enum TDB_ERROR ecode; + +	if (tdb->read_only) { +		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, +				  "Expand on read-only database"); +	} + +	if (tdb->flags & TDB_INTERNAL) { +		char *new = realloc(tdb->file->map_ptr, +				    tdb->file->map_size + addition); +		if (!new) { +			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +					  "No memory to expand database"); +		} +		tdb->file->map_ptr = new; +		tdb->file->map_size += addition; +	} else { +		/* Unmap before trying to write; old TDB claimed OpenBSD had +		 * problem with this otherwise. */ +		tdb_munmap(tdb->file); + +		/* If this fails, we try to fill anyway. */ +		if (ftruncate(tdb->file->fd, tdb->file->map_size + addition)) +			; + +		/* now fill the file with something. This ensures that the +		   file isn't sparse, which would be very bad if we ran out of +		   disk. 
This must be done with write, not via mmap */ +		memset(buf, 0x43, sizeof(buf)); +		ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size, +			     addition); +		if (ecode != TDB_SUCCESS) +			return ecode; +		tdb->file->map_size += addition; +		tdb_mmap(tdb); +	} +	return TDB_SUCCESS; +} + +const void *tdb_access_read(struct tdb_context *tdb, +			    tdb_off_t off, tdb_len_t len, bool convert) +{ +	void *ret = NULL; + +	if (likely(!(tdb->flags & TDB_CONVERT))) { +		ret = tdb->methods->direct(tdb, off, len, false); + +		if (TDB_PTR_IS_ERR(ret)) { +			return ret; +		} +	} +	if (!ret) { +		struct tdb_access_hdr *hdr; +		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); +		if (TDB_PTR_IS_ERR(hdr)) { +			return hdr; +		} +		hdr->next = tdb->access; +		tdb->access = hdr; +		ret = hdr + 1; +		if (convert) { +			tdb_convert(tdb, (void *)ret, len); +		} +	} else +		tdb->direct_access++; + +	return ret; +} + +void *tdb_access_write(struct tdb_context *tdb, +		       tdb_off_t off, tdb_len_t len, bool convert) +{ +	void *ret = NULL; + +	if (tdb->read_only) { +		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, +			   "Write to read-only database"); +		return TDB_ERR_PTR(TDB_ERR_RDONLY); +	} + +	if (likely(!(tdb->flags & TDB_CONVERT))) { +		ret = tdb->methods->direct(tdb, off, len, true); + +		if (TDB_PTR_IS_ERR(ret)) { +			return ret; +		} +	} + +	if (!ret) { +		struct tdb_access_hdr *hdr; +		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); +		if (TDB_PTR_IS_ERR(hdr)) { +			return hdr; +		} +		hdr->next = tdb->access; +		tdb->access = hdr; +		hdr->off = off; +		hdr->len = len; +		hdr->convert = convert; +		ret = hdr + 1; +		if (convert) +			tdb_convert(tdb, (void *)ret, len); +	} else +		tdb->direct_access++; + +	return ret; +} + +static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p) +{ +	struct tdb_access_hdr **hp; + +	for (hp = &tdb->access; *hp; hp = &(*hp)->next) { +		if (*hp + 1 == p) +			return hp; +	} +	return NULL; +} + +void 
tdb_access_release(struct tdb_context *tdb, const void *p) +{ +	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p); + +	if (hp) { +		hdr = *hp; +		*hp = hdr->next; +		free(hdr); +	} else +		tdb->direct_access--; +} + +enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p) +{ +	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p); +	enum TDB_ERROR ecode; + +	if (hp) { +		hdr = *hp; +		if (hdr->convert) +			ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len); +		else +			ecode = tdb_write(tdb, hdr->off, p, hdr->len); +		*hp = hdr->next; +		free(hdr); +	} else { +		tdb->direct_access--; +		ecode = TDB_SUCCESS; +	} + +	return ecode; +} + +static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len, +			bool write_mode) +{ +	enum TDB_ERROR ecode; + +	if (unlikely(!tdb->file->map_ptr)) +		return NULL; + +	ecode = tdb_oob(tdb, off + len, true); +	if (unlikely(ecode != TDB_SUCCESS)) +		return TDB_ERR_PTR(ecode); +	return (char *)tdb->file->map_ptr + off; +} + +void tdb_inc_seqnum(struct tdb_context *tdb) +{ +	tdb_off_t seq; + +	if (likely(!(tdb->flags & TDB_CONVERT))) { +		int64_t *direct; + +		direct = tdb->methods->direct(tdb, +					      offsetof(struct tdb_header, +						       seqnum), +					      sizeof(*direct), true); +		if (likely(direct)) { +			/* Don't let it go negative, even briefly */ +			if (unlikely((*direct) + 1) < 0) +				*direct = 0; +			(*direct)++; +			return; +		} +	} + +	seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum)); +	if (!TDB_OFF_IS_ERR(seq)) { +		seq++; +		if (unlikely((int64_t)seq < 0)) +			seq = 0; +		tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq); +	} +} + +static const struct tdb_methods io_methods = { +	tdb_read, +	tdb_write, +	tdb_oob, +	tdb_expand_file, +	tdb_direct, +}; + +/* +  initialise the default methods table +*/ +void tdb_io_init(struct tdb_context *tdb) +{ +	tdb->methods = &io_methods; +} diff --git a/lib/tdb2/lock.c b/lib/tdb2/lock.c new file mode 100644 index 
0000000000..76b8bc3157 --- /dev/null +++ b/lib/tdb2/lock.c @@ -0,0 +1,875 @@ + /* +   Unix SMB/CIFS implementation. + +   trivial database library + +   Copyright (C) Andrew Tridgell              1999-2005 +   Copyright (C) Paul `Rusty' Russell		   2000 +   Copyright (C) Jeremy Allison			   2000-2003 + +     ** NOTE! The following LGPL license applies to the tdb +     ** library. This does NOT imply that all of Samba is released +     ** under the LGPL + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. + +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "private.h" +#include <assert.h> +#include <ccan/build_assert/build_assert.h> + +/* If we were threaded, we could wait for unlock, but we're not, so fail. */ +static enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call) +{ +	return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, +			  "%s: lock owned by another tdb in this process.", +			  call); +} + +/* If we fork, we no longer really own locks. */ +static bool check_lock_pid(struct tdb_context *tdb, +			   const char *call, bool log) +{ +	/* No locks?  No problem! */ +	if (tdb->file->allrecord_lock.count == 0 +	    && tdb->file->num_lockrecs == 0) { +		return true; +	} + +	/* No fork?  No problem! 
*/ +	if (tdb->file->locker == getpid()) { +		return true; +	} + +	if (log) { +		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, +			   "%s: fork() detected after lock acquisition!" +			   " (%u vs %u)", call, tdb->file->locker, getpid()); +	} +	return false; +} + +int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, +		   void *unused) +{ +	struct flock fl; +	int ret; + +	do { +		fl.l_type = rw; +		fl.l_whence = SEEK_SET; +		fl.l_start = off; +		fl.l_len = len; + +		if (waitflag) +			ret = fcntl(fd, F_SETLKW, &fl); +		else +			ret = fcntl(fd, F_SETLK, &fl); +	} while (ret != 0 && errno == EINTR); +	return ret; +} + +int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused) +{ +	struct flock fl; +	int ret; + +	do { +		fl.l_type = F_UNLCK; +		fl.l_whence = SEEK_SET; +		fl.l_start = off; +		fl.l_len = len; + +		ret = fcntl(fd, F_SETLKW, &fl); +	} while (ret != 0 && errno == EINTR); +	return ret; +} + +static int lock(struct tdb_context *tdb, +		      int rw, off_t off, off_t len, bool waitflag) +{ +	int ret; +	if (tdb->file->allrecord_lock.count == 0 +	    && tdb->file->num_lockrecs == 0) { +		tdb->file->locker = getpid(); +	} + +	tdb->stats.lock_lowlevel++; +	ret = tdb->lock_fn(tdb->file->fd, rw, off, len, waitflag, +			   tdb->lock_data); +	if (!waitflag) { +		tdb->stats.lock_nonblock++; +		if (ret != 0) +			tdb->stats.lock_nonblock_fail++; +	} +	return ret; +} + +static int unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) +{ +#if 0 /* Check they matched up locks and unlocks correctly. */ +	char line[80]; +	FILE *locks; +	bool found = false; + +	locks = fopen("/proc/locks", "r"); + +	while (fgets(line, 80, locks)) { +		char *p; +		int type, start, l; + +		/* eg. 
1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */ +		p = strchr(line, ':') + 1; +		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  "))) +			continue; +		p += strlen(" FLOCK  ADVISORY  "); +		if (strncmp(p, "READ  ", strlen("READ  ")) == 0) +			type = F_RDLCK; +		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0) +			type = F_WRLCK; +		else +			abort(); +		p += 6; +		if (atoi(p) != getpid()) +			continue; +		p = strchr(strchr(p, ' ') + 1, ' ') + 1; +		start = atoi(p); +		p = strchr(p, ' ') + 1; +		if (strncmp(p, "EOF", 3) == 0) +			l = 0; +		else +			l = atoi(p) - start + 1; + +		if (off == start) { +			if (len != l) { +				fprintf(stderr, "Len %u should be %u: %s", +					(int)len, l, line); +				abort(); +			} +			if (type != rw) { +				fprintf(stderr, "Type %s wrong: %s", +					rw == F_RDLCK ? "READ" : "WRITE", line); +				abort(); +			} +			found = true; +			break; +		} +	} + +	if (!found) { +		fprintf(stderr, "Unlock on %u@%u not found!", +			(int)off, (int)len); +		abort(); +	} + +	fclose(locks); +#endif + +	return tdb->unlock_fn(tdb->file->fd, rw, off, len, tdb->lock_data); +} + +/* a byte range locking function - return 0 on success +   this functions locks len bytes at the specified offset. + +   note that a len of zero means lock to end of file +*/ +static enum TDB_ERROR tdb_brlock(struct tdb_context *tdb, +				 int rw_type, tdb_off_t offset, tdb_off_t len, +				 enum tdb_lock_flags flags) +{ +	int ret; + +	if (tdb->flags & TDB_NOLOCK) { +		return TDB_SUCCESS; +	} + +	if (rw_type == F_WRLCK && tdb->read_only) { +		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, +				  "Write lock attempted on read-only database"); +	} + +	/* A 32 bit system cannot open a 64-bit file, but it could have +	 * expanded since then: check here. 
*/ +	if ((size_t)(offset + len) != offset + len) { +		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				  "tdb_brlock: lock on giant offset %llu", +				  (long long)(offset + len)); +	} + +	ret = lock(tdb, rw_type, offset, len, flags & TDB_LOCK_WAIT); +	if (ret != 0) { +		/* Generic lock error. errno set by fcntl. +		 * EAGAIN is an expected return from non-blocking +		 * locks. */ +		if (!(flags & TDB_LOCK_PROBE) +		    && (errno != EAGAIN && errno != EINTR)) { +			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				   "tdb_brlock failed (fd=%d) at" +				   " offset %zu rw_type=%d flags=%d len=%zu:" +				   " %s", +				   tdb->file->fd, (size_t)offset, rw_type, +				   flags, (size_t)len, strerror(errno)); +		} +		return TDB_ERR_LOCK; +	} +	return TDB_SUCCESS; +} + +static enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb, +				   int rw_type, tdb_off_t offset, size_t len) +{ +	if (tdb->flags & TDB_NOLOCK) { +		return TDB_SUCCESS; +	} + +	if (!check_lock_pid(tdb, "tdb_brunlock", true)) +		return TDB_ERR_LOCK; + +	if (unlock(tdb, rw_type, offset, len) == -1) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_brunlock failed (fd=%d) at offset %zu" +				  " rw_type=%d len=%zu: %s", +				  tdb->file->fd, (size_t)offset, rw_type, +				  (size_t)len, strerror(errno)); +	} +	return TDB_SUCCESS; +} + +/* +  upgrade a read lock to a write lock. This needs to be handled in a +  special way as some OSes (such as solaris) have too conservative +  deadlock detection and claim a deadlock when progress can be +  made. For those OSes we may loop for a while. 
+*/ +enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb) +{ +	int count = 1000; + +	if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true)) +		return TDB_ERR_LOCK; + +	if (tdb->file->allrecord_lock.count != 1) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_allrecord_upgrade failed:" +				  " count %u too high", +				  tdb->file->allrecord_lock.count); +	} + +	if (tdb->file->allrecord_lock.off != 1) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_allrecord_upgrade failed:" +				  " already upgraded?"); +	} + +	if (tdb->file->allrecord_lock.owner != tdb) { +		return owner_conflict(tdb, "tdb_allrecord_upgrade"); +	} + +	while (count--) { +		struct timeval tv; +		if (tdb_brlock(tdb, F_WRLCK, +			       TDB_HASH_LOCK_START, 0, +			       TDB_LOCK_WAIT|TDB_LOCK_PROBE) == TDB_SUCCESS) { +			tdb->file->allrecord_lock.ltype = F_WRLCK; +			tdb->file->allrecord_lock.off = 0; +			return TDB_SUCCESS; +		} +		if (errno != EDEADLK) { +			break; +		} +		/* sleep for as short a time as we can - more portable than usleep() */ +		tv.tv_sec = 0; +		tv.tv_usec = 1; +		select(0, NULL, NULL, NULL, &tv); +	} + +	if (errno != EAGAIN && errno != EINTR) +		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +			   "tdb_allrecord_upgrade failed"); +	return TDB_ERR_LOCK; +} + +static struct tdb_lock *find_nestlock(struct tdb_context *tdb, tdb_off_t offset, +				      const struct tdb_context *owner) +{ +	unsigned int i; + +	for (i=0; i<tdb->file->num_lockrecs; i++) { +		if (tdb->file->lockrecs[i].off == offset) { +			if (owner && tdb->file->lockrecs[i].owner != owner) +				return NULL; +			return &tdb->file->lockrecs[i]; +		} +	} +	return NULL; +} + +enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb) +{ +	enum TDB_ERROR ecode; + +	if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true)) +		return TDB_ERR_LOCK; + +	ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK, +				   false); +	if (ecode != 
TDB_SUCCESS) { +		return ecode; +	} + +	ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); +	if (ecode != TDB_SUCCESS) { +		tdb_allrecord_unlock(tdb, F_WRLCK); +		return ecode; +	} +	ecode = tdb_transaction_recover(tdb); +	tdb_unlock_open(tdb, F_WRLCK); +	tdb_allrecord_unlock(tdb, F_WRLCK); + +	return ecode; +} + +/* lock an offset in the database. */ +static enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb, +				    tdb_off_t offset, int ltype, +				    enum tdb_lock_flags flags) +{ +	struct tdb_lock *new_lck; +	enum TDB_ERROR ecode; + +	if (offset > (TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE +		      + tdb->file->map_size / 8)) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_nest_lock: invalid offset %zu ltype=%d", +				  (size_t)offset, ltype); +	} + +	if (tdb->flags & TDB_NOLOCK) +		return TDB_SUCCESS; + +	if (!check_lock_pid(tdb, "tdb_nest_lock", true)) { +		return TDB_ERR_LOCK; +	} + +	tdb->stats.locks++; + +	new_lck = find_nestlock(tdb, offset, NULL); +	if (new_lck) { +		if (new_lck->owner != tdb) { +			return owner_conflict(tdb, "tdb_nest_lock"); +		} + +		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) { +			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +					  "tdb_nest_lock:" +					  " offset %zu has read lock", +					  (size_t)offset); +		} +		/* Just increment the struct, posix locks don't stack. 
*/ +		new_lck->count++; +		return TDB_SUCCESS; +	} + +#if 0 +	if (tdb->file->num_lockrecs +	    && offset >= TDB_HASH_LOCK_START +	    && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_nest_lock: already have a hash lock?"); +	} +#endif + +	new_lck = (struct tdb_lock *)realloc( +		tdb->file->lockrecs, +		sizeof(*tdb->file->lockrecs) * (tdb->file->num_lockrecs+1)); +	if (new_lck == NULL) { +		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +				  "tdb_nest_lock:" +				  " unable to allocate %zu lock struct", +				  tdb->file->num_lockrecs + 1); +	} +	tdb->file->lockrecs = new_lck; + +	/* Since fcntl locks don't nest, we do a lock for the first one, +	   and simply bump the count for future ones */ +	ecode = tdb_brlock(tdb, ltype, offset, 1, flags); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	/* First time we grab a lock, perhaps someone died in commit? */ +	if (!(flags & TDB_LOCK_NOCHECK) +	    && tdb->file->num_lockrecs == 0) { +		tdb_bool_err berr = tdb_needs_recovery(tdb); +		if (berr != false) { +			tdb_brunlock(tdb, ltype, offset, 1); + +			if (berr < 0) +				return berr; +			ecode = tdb_lock_and_recover(tdb); +			if (ecode == TDB_SUCCESS) { +				ecode = tdb_brlock(tdb, ltype, offset, 1, +						   flags); +			} +			if (ecode != TDB_SUCCESS) { +				return ecode; +			} +		} +	} + +	tdb->file->lockrecs[tdb->file->num_lockrecs].owner = tdb; +	tdb->file->lockrecs[tdb->file->num_lockrecs].off = offset; +	tdb->file->lockrecs[tdb->file->num_lockrecs].count = 1; +	tdb->file->lockrecs[tdb->file->num_lockrecs].ltype = ltype; +	tdb->file->num_lockrecs++; + +	return TDB_SUCCESS; +} + +static enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb, +				      tdb_off_t off, int ltype) +{ +	struct tdb_lock *lck; +	enum TDB_ERROR ecode; + +	if (tdb->flags & TDB_NOLOCK) +		return TDB_SUCCESS; + +	lck = find_nestlock(tdb, off, tdb); +	if ((lck == NULL) || (lck->count == 0)) { +		return 
tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_nest_unlock: no lock for %zu", +				  (size_t)off); +	} + +	if (lck->count > 1) { +		lck->count--; +		return TDB_SUCCESS; +	} + +	/* +	 * This lock has count==1 left, so we need to unlock it in the +	 * kernel. We don't bother with decrementing the in-memory array +	 * element, we're about to overwrite it with the last array element +	 * anyway. +	 */ +	ecode = tdb_brunlock(tdb, ltype, off, 1); + +	/* +	 * Shrink the array by overwriting the element just unlocked with the +	 * last array element. +	 */ +	*lck = tdb->file->lockrecs[--tdb->file->num_lockrecs]; + +	return ecode; +} + +/* +  get the transaction lock + */ +enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype) +{ +	return tdb_nest_lock(tdb, TDB_TRANSACTION_LOCK, ltype, TDB_LOCK_WAIT); +} + +/* +  release the transaction lock + */ +void tdb_transaction_unlock(struct tdb_context *tdb, int ltype) +{ +	tdb_nest_unlock(tdb, TDB_TRANSACTION_LOCK, ltype); +} + +/* We only need to lock individual bytes, but Linux merges consecutive locks + * so we lock in contiguous ranges. */ +static enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb, +				       int ltype, enum tdb_lock_flags flags, +				       tdb_off_t off, tdb_off_t len) +{ +	enum TDB_ERROR ecode; +	enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT); + +	if (len <= 1) { +		/* 0 would mean to end-of-file... */ +		assert(len != 0); +		/* Single hash.  Just do blocking lock. */ +		return tdb_brlock(tdb, ltype, off, len, flags); +	} + +	/* First we try non-blocking. */ +	if (tdb_brlock(tdb, ltype, off, len, nb_flags) == TDB_SUCCESS) { +		return TDB_SUCCESS; +	} + +	/* Try locking first half, then second. 
*/ +	ecode = tdb_lock_gradual(tdb, ltype, flags, off, len / 2); +	if (ecode != TDB_SUCCESS) +		return ecode; + +	ecode = tdb_lock_gradual(tdb, ltype, flags, +				 off + len / 2, len - len / 2); +	if (ecode != TDB_SUCCESS) { +		tdb_brunlock(tdb, ltype, off, len / 2); +	} +	return ecode; +} + +/* lock/unlock entire database.  It can only be upgradable if you have some + * other way of guaranteeing exclusivity (ie. transaction write lock). */ +enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype, +				  enum tdb_lock_flags flags, bool upgradable) +{ +	enum TDB_ERROR ecode; +	tdb_bool_err berr; + +	if (tdb->flags & TDB_NOLOCK) +		return TDB_SUCCESS; + +	if (!check_lock_pid(tdb, "tdb_allrecord_lock", true)) { +		return TDB_ERR_LOCK; +	} + +	if (tdb->file->allrecord_lock.count) { +		if (tdb->file->allrecord_lock.owner != tdb) { +			return owner_conflict(tdb, "tdb_allrecord_lock"); +		} + +		if (ltype == F_RDLCK +		    || tdb->file->allrecord_lock.ltype == F_WRLCK) { +			tdb->file->allrecord_lock.count++; +			return TDB_SUCCESS; +		} + +		/* a global lock of a different type exists */ +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, +				  "tdb_allrecord_lock: already have %s lock", +				  tdb->file->allrecord_lock.ltype == F_RDLCK +				  ? "read" : "write"); +	} + +	if (tdb_has_hash_locks(tdb)) { +		/* can't combine global and chain locks */ +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, +				  "tdb_allrecord_lock:" +				  " already have chain lock"); +	} + +	if (upgradable && ltype != F_RDLCK) { +		/* tdb error: you can't upgrade a write lock! */ +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_allrecord_lock:" +				  " can't upgrade a write lock"); +	} + +	tdb->stats.locks++; +again: +	/* Lock hashes, gradually. */ +	ecode = tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START, +				 TDB_HASH_LOCK_RANGE); +	if (ecode != TDB_SUCCESS) +		return ecode; + +	/* Lock free tables: there to end of file. 
*/ +	ecode = tdb_brlock(tdb, ltype, +			   TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE, +			   0, flags); +	if (ecode != TDB_SUCCESS) { +		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, +			     TDB_HASH_LOCK_RANGE); +		return ecode; +	} + +	tdb->file->allrecord_lock.owner = tdb; +	tdb->file->allrecord_lock.count = 1; +	/* If it's upgradable, it's actually exclusive so we can treat +	 * it as a write lock. */ +	tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype; +	tdb->file->allrecord_lock.off = upgradable; + +	/* Now check for needing recovery. */ +	if (flags & TDB_LOCK_NOCHECK) +		return TDB_SUCCESS; + +	berr = tdb_needs_recovery(tdb); +	if (likely(berr == false)) +		return TDB_SUCCESS; + +	tdb_allrecord_unlock(tdb, ltype); +	if (berr < 0) +		return berr; +	ecode = tdb_lock_and_recover(tdb); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} +	goto again; +} + +enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb, +			     int ltype, enum tdb_lock_flags flags) +{ +	return tdb_nest_lock(tdb, TDB_OPEN_LOCK, ltype, flags); +} + +void tdb_unlock_open(struct tdb_context *tdb, int ltype) +{ +	tdb_nest_unlock(tdb, TDB_OPEN_LOCK, ltype); +} + +bool tdb_has_open_lock(struct tdb_context *tdb) +{ +	return !(tdb->flags & TDB_NOLOCK) +		&& find_nestlock(tdb, TDB_OPEN_LOCK, tdb) != NULL; +} + +enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype) +{ +	/* Lock doesn't protect data, so don't check (we recurse if we do!) 
*/ +	return tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype, +			     TDB_LOCK_WAIT | TDB_LOCK_NOCHECK); +} + +void tdb_unlock_expand(struct tdb_context *tdb, int ltype) +{ +	tdb_nest_unlock(tdb, TDB_EXPANSION_LOCK, ltype); +} + +/* unlock entire db */ +void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype) +{ +	if (tdb->flags & TDB_NOLOCK) +		return; + +	if (tdb->file->allrecord_lock.count == 0) { +		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, +			   "tdb_allrecord_unlock: not locked!"); +		return; +	} + +	if (tdb->file->allrecord_lock.owner != tdb) { +		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, +			   "tdb_allrecord_unlock: not locked by us!"); +		return; +	} + +	/* Upgradable locks are marked as write locks. */ +	if (tdb->file->allrecord_lock.ltype != ltype +	    && (!tdb->file->allrecord_lock.off || ltype != F_RDLCK)) { +		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +			   "tdb_allrecord_unlock: have %s lock", +			   tdb->file->allrecord_lock.ltype == F_RDLCK +			   ? 
"read" : "write"); +		return; +	} + +	if (tdb->file->allrecord_lock.count > 1) { +		tdb->file->allrecord_lock.count--; +		return; +	} + +	tdb->file->allrecord_lock.count = 0; +	tdb->file->allrecord_lock.ltype = 0; + +	tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 0); +} + +bool tdb_has_expansion_lock(struct tdb_context *tdb) +{ +	return find_nestlock(tdb, TDB_EXPANSION_LOCK, tdb) != NULL; +} + +bool tdb_has_hash_locks(struct tdb_context *tdb) +{ +	unsigned int i; + +	for (i=0; i<tdb->file->num_lockrecs; i++) { +		if (tdb->file->lockrecs[i].off >= TDB_HASH_LOCK_START +		    && tdb->file->lockrecs[i].off < (TDB_HASH_LOCK_START +						     + TDB_HASH_LOCK_RANGE)) +			return true; +	} +	return false; +} + +static bool tdb_has_free_lock(struct tdb_context *tdb) +{ +	unsigned int i; + +	if (tdb->flags & TDB_NOLOCK) +		return false; + +	for (i=0; i<tdb->file->num_lockrecs; i++) { +		if (tdb->file->lockrecs[i].off +		    > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) +			return true; +	} +	return false; +} + +enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb, +			       tdb_off_t hash_lock, +			       tdb_len_t hash_range, +			       int ltype, enum tdb_lock_flags waitflag) +{ +	/* FIXME: Do this properly, using hlock_range */ +	unsigned l = TDB_HASH_LOCK_START +		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS)); + +	/* a allrecord lock allows us to avoid per chain locks */ +	if (tdb->file->allrecord_lock.count) { +		if (!check_lock_pid(tdb, "tdb_lock_hashes", true)) +			return TDB_ERR_LOCK; + +		if (tdb->file->allrecord_lock.owner != tdb) +			return owner_conflict(tdb, "tdb_lock_hashes"); +		if (ltype == tdb->file->allrecord_lock.ltype +		    || ltype == F_RDLCK) { +			return TDB_SUCCESS; +		} + +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, +				  "tdb_lock_hashes:" +				  " already have %s allrecordlock", +				  tdb->file->allrecord_lock.ltype == F_RDLCK +				  ? 
"read" : "write"); +	} + +	if (tdb_has_free_lock(tdb)) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_lock_hashes: already have free lock"); +	} + +	if (tdb_has_expansion_lock(tdb)) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_lock_hashes:" +				  " already have expansion lock"); +	} + +	return tdb_nest_lock(tdb, l, ltype, waitflag); +} + +enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb, +				 tdb_off_t hash_lock, +				 tdb_len_t hash_range, int ltype) +{ +	unsigned l = TDB_HASH_LOCK_START +		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS)); + +	if (tdb->flags & TDB_NOLOCK) +		return 0; + +	/* a allrecord lock allows us to avoid per chain locks */ +	if (tdb->file->allrecord_lock.count) { +		if (tdb->file->allrecord_lock.ltype == F_RDLCK +		    && ltype == F_WRLCK) { +			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +					  "tdb_unlock_hashes RO allrecord!"); +		} +		return TDB_SUCCESS; +	} + +	return tdb_nest_unlock(tdb, l, ltype); +} + +/* Hash locks use TDB_HASH_LOCK_START + the next 30 bits. + * Then we begin; bucket offsets are sizeof(tdb_len_t) apart, so we divide. + * The result is that on 32 bit systems we don't use lock values > 2^31 on + * files that are less than 4GB. 
+ */ +static tdb_off_t free_lock_off(tdb_off_t b_off) +{ +	return TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE +		+ b_off / sizeof(tdb_off_t); +} + +enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off, +				    enum tdb_lock_flags waitflag) +{ +	assert(b_off >= sizeof(struct tdb_header)); + +	if (tdb->flags & TDB_NOLOCK) +		return 0; + +	/* a allrecord lock allows us to avoid per chain locks */ +	if (tdb->file->allrecord_lock.count) { +		if (!check_lock_pid(tdb, "tdb_lock_free_bucket", true)) +			return TDB_ERR_LOCK; + +		if (tdb->file->allrecord_lock.ltype == F_WRLCK) +			return 0; +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_lock_free_bucket with" +				  " read-only allrecordlock!"); +	} + +#if 0 /* FIXME */ +	if (tdb_has_expansion_lock(tdb)) { +		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, +				  "tdb_lock_free_bucket:" +				  " already have expansion lock"); +	} +#endif + +	return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag); +} + +void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off) +{ +	if (tdb->file->allrecord_lock.count) +		return; + +	tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK); +} + +enum TDB_ERROR tdb_lockall(struct tdb_context *tdb) +{ +	return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false); +} + +void tdb_unlockall(struct tdb_context *tdb) +{ +	tdb_allrecord_unlock(tdb, F_WRLCK); +} + +enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb) +{ +	return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false); +} + +void tdb_unlockall_read(struct tdb_context *tdb) +{ +	tdb_allrecord_unlock(tdb, F_RDLCK); +} + +void tdb_lock_cleanup(struct tdb_context *tdb) +{ +	unsigned int i; + +	/* We don't want to warn: they're allowed to close tdb after fork. 
*/ +	if (!check_lock_pid(tdb, "tdb_close", false)) +		return; + +	while (tdb->file->allrecord_lock.count +	       && tdb->file->allrecord_lock.owner == tdb) { +		tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype); +	} + +	for (i=0; i<tdb->file->num_lockrecs; i++) { +		if (tdb->file->lockrecs[i].owner == tdb) { +			tdb_nest_unlock(tdb, +					tdb->file->lockrecs[i].off, +					tdb->file->lockrecs[i].ltype); +			i--; +		} +	} +} diff --git a/lib/tdb2/open.c b/lib/tdb2/open.c new file mode 100644 index 0000000000..c35598cdcc --- /dev/null +++ b/lib/tdb2/open.c @@ -0,0 +1,661 @@ +#include "private.h" +#include <ccan/hash/hash.h> +#include <assert.h> + +/* all lock info, to detect double-opens (fcntl file don't nest!) */ +static struct tdb_file *files = NULL; + +static struct tdb_file *find_file(dev_t device, ino_t ino) +{ +	struct tdb_file *i; + +	for (i = files; i; i = i->next) { +		if (i->device == device && i->inode == ino) { +			i->refcnt++; +			break; +		} +	} +	return i; +} + +static bool read_all(int fd, void *buf, size_t len) +{ +	while (len) { +		ssize_t ret; +		ret = read(fd, buf, len); +		if (ret < 0) +			return false; +		if (ret == 0) { +			/* ETOOSHORT? */ +			errno = EWOULDBLOCK; +			return false; +		} +		buf = (char *)buf + ret; +		len -= ret; +	} +	return true; +} + +static uint64_t random_number(struct tdb_context *tdb) +{ +	int fd; +	uint64_t ret = 0; +	struct timeval now; + +	fd = open("/dev/urandom", O_RDONLY); +	if (fd >= 0) { +		if (read_all(fd, &ret, sizeof(ret))) { +			close(fd); +			return ret; +		} +		close(fd); +	} +	/* FIXME: Untested!  Based on Wikipedia protocol description! */ +	fd = open("/dev/egd-pool", O_RDWR); +	if (fd >= 0) { +		/* Command is 1, next byte is size we want to read. */ +		char cmd[2] = { 1, sizeof(uint64_t) }; +		if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) { +			char reply[1 + sizeof(uint64_t)]; +			int r = read(fd, reply, sizeof(reply)); +			if (r > 1) { +				/* Copy at least some bytes. 
*/ +				memcpy(&ret, reply+1, r - 1); +				if (reply[0] == sizeof(uint64_t) +				    && r == sizeof(reply)) { +					close(fd); +					return ret; +				} +			} +		} +		close(fd); +	} + +	/* Fallback: pid and time. */ +	gettimeofday(&now, NULL); +	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec; +	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, +		   "tdb_open: random from getpid and time"); +	return ret; +} + +struct new_database { +	struct tdb_header hdr; +	struct tdb_freetable ftable; +}; + +/* initialise a new database */ +static enum TDB_ERROR tdb_new_database(struct tdb_context *tdb, +				       struct tdb_attribute_seed *seed, +				       struct tdb_header *hdr) +{ +	/* We make it up in memory, then write it out if not internal */ +	struct new_database newdb; +	unsigned int magic_len; +	ssize_t rlen; +	enum TDB_ERROR ecode; + +	/* Fill in the header */ +	newdb.hdr.version = TDB_VERSION; +	if (seed) +		newdb.hdr.hash_seed = seed->seed; +	else +		newdb.hdr.hash_seed = random_number(tdb); +	newdb.hdr.hash_test = TDB_HASH_MAGIC; +	newdb.hdr.hash_test = tdb->hash_fn(&newdb.hdr.hash_test, +					   sizeof(newdb.hdr.hash_test), +					   newdb.hdr.hash_seed, +					   tdb->hash_data); +	newdb.hdr.recovery = 0; +	newdb.hdr.features_used = newdb.hdr.features_offered = TDB_FEATURE_MASK; +	newdb.hdr.seqnum = 0; +	memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved)); +	/* Initial hashes are empty. */ +	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable)); + +	/* Free is empty. 
*/ +	newdb.hdr.free_table = offsetof(struct new_database, ftable); +	memset(&newdb.ftable, 0, sizeof(newdb.ftable)); +	ecode = set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0, +			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), +			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), +			   0); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	/* Magic food */ +	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food)); +	strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD); + +	/* This creates an endian-converted database, as if read from disk */ +	magic_len = sizeof(newdb.hdr.magic_food); +	tdb_convert(tdb, +		    (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len); + +	*hdr = newdb.hdr; + +	if (tdb->flags & TDB_INTERNAL) { +		tdb->file->map_size = sizeof(newdb); +		tdb->file->map_ptr = malloc(tdb->file->map_size); +		if (!tdb->file->map_ptr) { +			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +					  "tdb_new_database:" +					  " failed to allocate"); +		} +		memcpy(tdb->file->map_ptr, &newdb, tdb->file->map_size); +		return TDB_SUCCESS; +	} +	if (lseek(tdb->file->fd, 0, SEEK_SET) == -1) { +		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				  "tdb_new_database:" +				  " failed to seek: %s", strerror(errno)); +	} + +	if (ftruncate(tdb->file->fd, 0) == -1) { +		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				  "tdb_new_database:" +				  " failed to truncate: %s", strerror(errno)); +	} + +	rlen = write(tdb->file->fd, &newdb, sizeof(newdb)); +	if (rlen != sizeof(newdb)) { +		if (rlen >= 0) +			errno = ENOSPC; +		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				  "tdb_new_database: %zi writing header: %s", +				  rlen, strerror(errno)); +	} +	return TDB_SUCCESS; +} + +static enum TDB_ERROR tdb_new_file(struct tdb_context *tdb) +{ +	tdb->file = malloc(sizeof(*tdb->file)); +	if (!tdb->file) +		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +				  "tdb_open: cannot alloc tdb_file structure"); +	tdb->file->num_lockrecs = 0; 
+	tdb->file->lockrecs = NULL; +	tdb->file->allrecord_lock.count = 0; +	tdb->file->refcnt = 1; +	return TDB_SUCCESS; +} + +enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb, +				 const union tdb_attribute *attr) +{ +	switch (attr->base.attr) { +	case TDB_ATTRIBUTE_LOG: +		tdb->log_fn = attr->log.fn; +		tdb->log_data = attr->log.data; +		break; +	case TDB_ATTRIBUTE_HASH: +	case TDB_ATTRIBUTE_SEED: +	case TDB_ATTRIBUTE_OPENHOOK: +		return tdb->last_error +			= tdb_logerr(tdb, TDB_ERR_EINVAL, +				     TDB_LOG_USE_ERROR, +				     "tdb_set_attribute:" +				     " cannot set %s after opening", +				     attr->base.attr == TDB_ATTRIBUTE_HASH +				     ? "TDB_ATTRIBUTE_HASH" +				     : attr->base.attr == TDB_ATTRIBUTE_SEED +				     ? "TDB_ATTRIBUTE_SEED" +				     : "TDB_ATTRIBUTE_OPENHOOK"); +	case TDB_ATTRIBUTE_STATS: +		return tdb->last_error +			= tdb_logerr(tdb, TDB_ERR_EINVAL, +				     TDB_LOG_USE_ERROR, +				     "tdb_set_attribute:" +				     " cannot set TDB_ATTRIBUTE_STATS"); +	case TDB_ATTRIBUTE_FLOCK: +		tdb->lock_fn = attr->flock.lock; +		tdb->unlock_fn = attr->flock.unlock; +		tdb->lock_data = attr->flock.data; +		break; +	default: +		return tdb->last_error +			= tdb_logerr(tdb, TDB_ERR_EINVAL, +				     TDB_LOG_USE_ERROR, +				     "tdb_set_attribute:" +				     " unknown attribute type %u", +				     attr->base.attr); +	} +	return TDB_SUCCESS; +} + +static uint64_t jenkins_hash(const void *key, size_t length, uint64_t seed, +			     void *unused) +{ +	uint64_t ret; +	/* hash64_stable assumes lower bits are more important; they are a +	 * slightly better hash.  We use the upper bits first, so swap them. 
*/ +	ret = hash64_stable((const unsigned char *)key, length, seed); +	return (ret >> 32) | (ret << 32); +} + +enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb, +				 union tdb_attribute *attr) +{ +	switch (attr->base.attr) { +	case TDB_ATTRIBUTE_LOG: +		if (!tdb->log_fn) +			return tdb->last_error = TDB_ERR_NOEXIST; +		attr->log.fn = tdb->log_fn; +		attr->log.data = tdb->log_data; +		break; +	case TDB_ATTRIBUTE_HASH: +		attr->hash.fn = tdb->hash_fn; +		attr->hash.data = tdb->hash_data; +		break; +	case TDB_ATTRIBUTE_SEED: +		attr->seed.seed = tdb->hash_seed; +		break; +	case TDB_ATTRIBUTE_OPENHOOK: +		return tdb->last_error +			= tdb_logerr(tdb, TDB_ERR_EINVAL, +				     TDB_LOG_USE_ERROR, +				     "tdb_get_attribute:" +				     " cannot get TDB_ATTRIBUTE_OPENHOOK"); +	case TDB_ATTRIBUTE_STATS: { +		size_t size = attr->stats.size; +		if (size > tdb->stats.size) +			size = tdb->stats.size; +		memcpy(&attr->stats, &tdb->stats, size); +		break; +	} +	case TDB_ATTRIBUTE_FLOCK: +		attr->flock.lock = tdb->lock_fn; +		attr->flock.unlock = tdb->unlock_fn; +		attr->flock.data = tdb->lock_data; +		break; +	default: +		return tdb->last_error +			= tdb_logerr(tdb, TDB_ERR_EINVAL, +				     TDB_LOG_USE_ERROR, +				     "tdb_get_attribute:" +				     " unknown attribute type %u", +				     attr->base.attr); +	} +	attr->base.next = NULL; +	return TDB_SUCCESS; +} + +void tdb_unset_attribute(struct tdb_context *tdb, +			 enum tdb_attribute_type type) +{ +	switch (type) { +	case TDB_ATTRIBUTE_LOG: +		tdb->log_fn = NULL; +		break; +	case TDB_ATTRIBUTE_HASH: +	case TDB_ATTRIBUTE_SEED: +	case TDB_ATTRIBUTE_OPENHOOK: +		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, +			   "tdb_unset_attribute: cannot unset %s after opening", +			   type == TDB_ATTRIBUTE_HASH +			   ? "TDB_ATTRIBUTE_HASH" +			   : type == TDB_ATTRIBUTE_SEED +			   ? 
"TDB_ATTRIBUTE_SEED" +			   : "TDB_ATTRIBUTE_OPENHOOK"); +		break; +	case TDB_ATTRIBUTE_STATS: +		tdb_logerr(tdb, TDB_ERR_EINVAL, +			   TDB_LOG_USE_ERROR, +			   "tdb_unset_attribute:" +			   "cannot unset TDB_ATTRIBUTE_STATS"); +		break; +	case TDB_ATTRIBUTE_FLOCK: +		tdb->lock_fn = tdb_fcntl_lock; +		tdb->unlock_fn = tdb_fcntl_unlock; +		break; +	default: +		tdb_logerr(tdb, TDB_ERR_EINVAL, +			   TDB_LOG_USE_ERROR, +			   "tdb_unset_attribute: unknown attribute type %u", +			   type); +	} +} + +struct tdb_context *tdb_open(const char *name, int tdb_flags, +			     int open_flags, mode_t mode, +			     union tdb_attribute *attr) +{ +	struct tdb_context *tdb; +	struct stat st; +	int saved_errno = 0; +	uint64_t hash_test; +	unsigned v; +	ssize_t rlen; +	struct tdb_header hdr; +	struct tdb_attribute_seed *seed = NULL; +	struct tdb_attribute_openhook *openhook = NULL; +	tdb_bool_err berr; +	enum TDB_ERROR ecode; +	int openlock; + +	tdb = malloc(sizeof(*tdb) + (name ? strlen(name) + 1 : 0)); +	if (!tdb) { +		/* Can't log this */ +		errno = ENOMEM; +		return NULL; +	} +	/* Set name immediately for logging functions. 
*/ +	if (name) { +		tdb->name = strcpy((char *)(tdb + 1), name); +	} else { +		tdb->name = NULL; +	} +	tdb->direct_access = 0; +	tdb->flags = tdb_flags; +	tdb->log_fn = NULL; +	tdb->transaction = NULL; +	tdb->access = NULL; +	tdb->last_error = TDB_SUCCESS; +	tdb->file = NULL; +	tdb->lock_fn = tdb_fcntl_lock; +	tdb->unlock_fn = tdb_fcntl_unlock; +	tdb->hash_fn = jenkins_hash; +	memset(&tdb->stats, 0, sizeof(tdb->stats)); +	tdb->stats.base.attr = TDB_ATTRIBUTE_STATS; +	tdb->stats.size = sizeof(tdb->stats); +	tdb_io_init(tdb); + +	while (attr) { +		switch (attr->base.attr) { +		case TDB_ATTRIBUTE_HASH: +			tdb->hash_fn = attr->hash.fn; +			tdb->hash_data = attr->hash.data; +			break; +		case TDB_ATTRIBUTE_SEED: +			seed = &attr->seed; +			break; +		case TDB_ATTRIBUTE_OPENHOOK: +			openhook = &attr->openhook; +			break; +		default: +			/* These are set as normal. */ +			ecode = tdb_set_attribute(tdb, attr); +			if (ecode != TDB_SUCCESS) +				goto fail; +		} +		attr = attr->base.next; +	} + +	if (tdb_flags & ~(TDB_INTERNAL | TDB_NOLOCK | TDB_NOMMAP | TDB_CONVERT +			  | TDB_NOSYNC | TDB_SEQNUM | TDB_ALLOW_NESTING)) { +		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, +				   "tdb_open: unknown flags %u", tdb_flags); +		goto fail; +	} + +	if ((open_flags & O_ACCMODE) == O_WRONLY) { +		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, +				   "tdb_open: can't open tdb %s write-only", +				   name); +		goto fail; +	} + +	if ((open_flags & O_ACCMODE) == O_RDONLY) { +		tdb->read_only = true; +		tdb->mmap_flags = PROT_READ; +		openlock = F_RDLCK; +	} else { +		tdb->read_only = false; +		tdb->mmap_flags = PROT_READ | PROT_WRITE; +		openlock = F_WRLCK; +	} + +	/* internal databases don't need any of the rest. 
*/ +	if (tdb->flags & TDB_INTERNAL) { +		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP); +		ecode = tdb_new_file(tdb); +		if (ecode != TDB_SUCCESS) { +			goto fail; +		} +		tdb->file->fd = -1; +		ecode = tdb_new_database(tdb, seed, &hdr); +		if (ecode != TDB_SUCCESS) { +			goto fail; +		} +		tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed)); +		tdb->hash_seed = hdr.hash_seed; +		tdb_ftable_init(tdb); +		return tdb; +	} + +	if (stat(name, &st) != -1) +		tdb->file = find_file(st.st_dev, st.st_ino); + +	if (!tdb->file) { +		int fd; + +		if ((fd = open(name, open_flags, mode)) == -1) { +			/* errno set by open(2) */ +			saved_errno = errno; +			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				   "tdb_open: could not open file %s: %s", +				   name, strerror(errno)); +			goto fail_errno; +		} + +		/* on exec, don't inherit the fd */ +		v = fcntl(fd, F_GETFD, 0); +		fcntl(fd, F_SETFD, v | FD_CLOEXEC); + +		if (fstat(fd, &st) == -1) { +			saved_errno = errno; +			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				   "tdb_open: could not stat open %s: %s", +				   name, strerror(errno)); +			close(fd); +			goto fail_errno; +		} + +		ecode = tdb_new_file(tdb); +		if (ecode != TDB_SUCCESS) { +			close(fd); +			goto fail; +		} + +		tdb->file->next = files; +		tdb->file->fd = fd; +		tdb->file->device = st.st_dev; +		tdb->file->inode = st.st_ino; +		tdb->file->map_ptr = NULL; +		tdb->file->map_size = sizeof(struct tdb_header); +	} + +	/* ensure there is only one process initialising at once */ +	ecode = tdb_lock_open(tdb, openlock, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); +	if (ecode != TDB_SUCCESS) { +		saved_errno = errno; +		goto fail_errno; +	} + +	/* call their open hook if they gave us one. */ +	if (openhook) { +		ecode = openhook->fn(tdb->file->fd, openhook->data); +		if (ecode != TDB_SUCCESS) { +			tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				   "tdb_open: open hook failed"); +			goto fail; +		} +		open_flags |= O_CREAT; +	} + +	/* If they used O_TRUNC, read will return 0. 
*/ +	rlen = pread(tdb->file->fd, &hdr, sizeof(hdr), 0); +	if (rlen == 0 && (open_flags & O_CREAT)) { +		ecode = tdb_new_database(tdb, seed, &hdr); +		if (ecode != TDB_SUCCESS) { +			goto fail; +		} +	} else if (rlen < 0) { +		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				   "tdb_open: error %s reading %s", +				   strerror(errno), name); +		goto fail; +	} else if (rlen < sizeof(hdr) +		   || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { +		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				   "tdb_open: %s is not a tdb file", name); +		goto fail; +	} + +	if (hdr.version != TDB_VERSION) { +		if (hdr.version == bswap_64(TDB_VERSION)) +			tdb->flags |= TDB_CONVERT; +		else { +			/* wrong version */ +			ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +					   "tdb_open:" +					   " %s is unknown version 0x%llx", +					   name, (long long)hdr.version); +			goto fail; +		} +	} + +	tdb_convert(tdb, &hdr, sizeof(hdr)); +	tdb->hash_seed = hdr.hash_seed; +	hash_test = TDB_HASH_MAGIC; +	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); +	if (hdr.hash_test != hash_test) { +		/* wrong hash variant */ +		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				   "tdb_open:" +				   " %s uses a different hash function", +				   name); +		goto fail; +	} + +	/* Clear any features we don't understand. */ +	if ((open_flags & O_ACCMODE) != O_RDONLY) { +		hdr.features_used &= TDB_FEATURE_MASK; +		if (tdb_write_convert(tdb, offsetof(struct tdb_header, +						    features_used), +				      &hdr.features_used, +				      sizeof(hdr.features_used)) == -1) +			goto fail; +	} + +	tdb_unlock_open(tdb, openlock); + +	/* This make sure we have current map_size and mmap. */ +	tdb->methods->oob(tdb, tdb->file->map_size + 1, true); + +	/* Now it's fully formed, recover if necessary. 
*/ +	berr = tdb_needs_recovery(tdb); +	if (unlikely(berr != false)) { +		if (berr < 0) { +			ecode = berr; +			goto fail; +		} +		ecode = tdb_lock_and_recover(tdb); +		if (ecode != TDB_SUCCESS) { +			goto fail; +		} +	} + +	ecode = tdb_ftable_init(tdb); +	if (ecode != TDB_SUCCESS) { +		goto fail; +	} + +	/* Add to linked list if we're new. */ +	if (tdb->file->refcnt == 1) +		files = tdb->file; +	return tdb; + + fail: +	/* Map ecode to some logical errno. */ +	switch (ecode) { +	case TDB_ERR_CORRUPT: +	case TDB_ERR_IO: +		saved_errno = EIO; +		break; +	case TDB_ERR_LOCK: +		saved_errno = EWOULDBLOCK; +		break; +	case TDB_ERR_OOM: +		saved_errno = ENOMEM; +		break; +	case TDB_ERR_EINVAL: +		saved_errno = EINVAL; +		break; +	default: +		saved_errno = EINVAL; +		break; +	} + +fail_errno: +#ifdef TDB_TRACE +	close(tdb->tracefd); +#endif +	if (tdb->file) { +		tdb_lock_cleanup(tdb); +		if (--tdb->file->refcnt == 0) { +			assert(tdb->file->num_lockrecs == 0); +			if (tdb->file->map_ptr) { +				if (tdb->flags & TDB_INTERNAL) { +					free(tdb->file->map_ptr); +				} else +					tdb_munmap(tdb->file); +			} +			if (close(tdb->file->fd) != 0) +				tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +					   "tdb_open: failed to close tdb fd" +					   " on error: %s", strerror(errno)); +			free(tdb->file->lockrecs); +			free(tdb->file); +		} +	} + +	free(tdb); +	errno = saved_errno; +	return NULL; +} + +int tdb_close(struct tdb_context *tdb) +{ +	int ret = 0; + +	tdb_trace(tdb, "tdb_close"); + +	if (tdb->transaction) { +		tdb_transaction_cancel(tdb); +	} + +	if (tdb->file->map_ptr) { +		if (tdb->flags & TDB_INTERNAL) +			free(tdb->file->map_ptr); +		else +			tdb_munmap(tdb->file); +	} +	if (tdb->file) { +		struct tdb_file **i; + +		tdb_lock_cleanup(tdb); +		if (--tdb->file->refcnt == 0) { +			ret = close(tdb->file->fd); + +			/* Remove from files list */ +			for (i = &files; *i; i = &(*i)->next) { +				if (*i == tdb->file) { +					*i = tdb->file->next; +					break; +				} +			} +			
free(tdb->file->lockrecs); +			free(tdb->file); +		} +	} + +#ifdef TDB_TRACE +	close(tdb->tracefd); +#endif +	free(tdb); + +	return ret; +} diff --git a/lib/tdb2/private.h b/lib/tdb2/private.h new file mode 100644 index 0000000000..135e3df936 --- /dev/null +++ b/lib/tdb2/private.h @@ -0,0 +1,614 @@ +#ifndef TDB_PRIVATE_H +#define TDB_PRIVATE_H + /* +   Trivial Database 2: private types and prototypes +   Copyright (C) Rusty Russell 2010 + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. + +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. 
/* Hook for the test suite; expands to nothing in normal builds. */
#ifndef TEST_IT
#define TEST_IT(cond)
#endif

/* #define TDB_TRACE 1 */

#ifndef __STRING
#define __STRING(x)    #x
#endif

#ifndef __STRINGSTRING
#define __STRINGSTRING(x) __STRING(x)
#endif

/* "file:line" of the point of use, as a string literal. */
#ifndef __location__
#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
#endif

typedef uint64_t tdb_len_t;
typedef uint64_t tdb_off_t;

#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
#define TDB_USED_MAGIC ((uint64_t)0x1999)
#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
#define TDB_FREE_MAGIC ((uint64_t)0xFE)
#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)

/* Errors are negative enum values, so as unsigned offsets they sit at the
 * very top of the range.  The argument is parenthesised so that an
 * expression such as "a ? b : c" is compared as a whole. */
#define TDB_OFF_IS_ERR(off) unlikely((off) >= (tdb_off_t)TDB_ERR_LAST)

/* Packing errors into pointers and v.v. */
#define TDB_PTR_IS_ERR(ptr) \
	unlikely((unsigned long)(ptr) >= (unsigned long)TDB_ERR_LAST)
#define TDB_PTR_ERR(p) ((enum TDB_ERROR)(long)(p))
#define TDB_ERR_PTR(err) ((void *)(long)(err))

/* Common case of returning true, false or -ve error. */
typedef int tdb_bool_err;

/* Prevent others from opening the file. */
#define TDB_OPEN_LOCK 0
/* Doing a transaction. */
#define TDB_TRANSACTION_LOCK 1
/* Expanding file. */
#define TDB_EXPANSION_LOCK 2
/* Hash chain locks. */
#define TDB_HASH_LOCK_START 64

/* Range for hash locks. */
#define TDB_HASH_LOCK_RANGE_BITS 30
#define TDB_HASH_LOCK_RANGE (1 << TDB_HASH_LOCK_RANGE_BITS)

/* We have 1024 entries in the top level. */
#define TDB_TOPLEVEL_HASH_BITS 10
/* And 64 entries in each sub-level: thus 64 bits exactly after 9 levels. */
#define TDB_SUBLEVEL_HASH_BITS 6
/* And 8 entries in each group, ie 8 groups per sublevel. */
#define TDB_HASH_GROUP_BITS 3
/* This is currently 10: beyond this we chain. */
#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)

/* Extend file by at least 100 times more than needed. */
#define TDB_EXTENSION_FACTOR 100

/* We steal bits from the offsets to store hash info. */
#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
/* We steal this many upper bits, giving a maximum offset of 64 petabytes
 * (2^56 bytes). */
#define TDB_OFF_UPPER_STEAL 8
#define TDB_OFF_UPPER_STEAL_EXTRA 7
/* The bit number where we store extra hash bits. */
#define TDB_OFF_HASH_EXTRA_BIT 57
#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56

/* Additional features we understand.  Currently: none. */
#define TDB_FEATURE_MASK ((uint64_t)0)

/* Convenience mask to get actual offset. */
#define TDB_OFF_MASK \
	(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)

/* How many buckets in a free list: see size_to_bucket(). */
#define TDB_FREE_BUCKETS (64 - TDB_OFF_UPPER_STEAL)
/* We have to be able to fit a free record here. */
#define TDB_MIN_DATA_LEN	\
	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))

/* Indicates this entry is not on an flist (can happen during coalescing) */
#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)

/*
 * Header in front of every used record (and of hash tables, free tables
 * and chains, which reuse it with a different magic).
 */
struct tdb_used_record {
	/* For on-disk compatibility, we avoid bitfields:
	   magic: 16,        (highest)
	   key_len_bits: 5,
	   extra_padding: 32
	   hash_bits: 11
	*/
        uint64_t magic_and_meta;
	/* The bottom key_len_bits*2 are key length, rest is data length. */
        uint64_t key_and_data_len;
};

/*
 * Field widths and bit positions inside magic_and_meta, lowest field
 * first.  Each shift is derived from the widths below it, so the layout
 * is described in exactly one place.
 */
enum {
	TDB_REC_HASH_BITS = 11,
	TDB_REC_PADDING_BITS = 32,
	TDB_REC_KEYLEN_BITS = 5,

	TDB_REC_PADDING_SHIFT = TDB_REC_HASH_BITS,
	TDB_REC_KEYLEN_SHIFT = TDB_REC_PADDING_SHIFT + TDB_REC_PADDING_BITS,
	TDB_REC_MAGIC_SHIFT = TDB_REC_KEYLEN_SHIFT + TDB_REC_KEYLEN_BITS
};

/* Number of low bits of key_and_data_len that hold the key length.
 * The stored 5-bit field counts pairs of bits, so the result is even
 * and at most 62. */
static inline unsigned rec_key_bits(const struct tdb_used_record *r)
{
	return ((r->magic_and_meta >> TDB_REC_KEYLEN_SHIFT)
		& ((1 << TDB_REC_KEYLEN_BITS) - 1)) * 2;
}

/* Key length: the bottom rec_key_bits() bits of key_and_data_len. */
static inline uint64_t rec_key_length(const struct tdb_used_record *r)
{
	return r->key_and_data_len & ((1ULL << rec_key_bits(r)) - 1);
}

/* Data length: everything above the key length bits. */
static inline uint64_t rec_data_length(const struct tdb_used_record *r)
{
	return r->key_and_data_len >> rec_key_bits(r);
}

/* The 32-bit extra_padding field. */
static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
{
	return (r->magic_and_meta >> TDB_REC_PADDING_SHIFT)
		& ((1ULL << TDB_REC_PADDING_BITS) - 1);
}

/* The 11 hash bits stored in the header. */
static inline uint32_t rec_hash(const struct tdb_used_record *r)
{
	return r->magic_and_meta & ((1 << TDB_REC_HASH_BITS) - 1);
}

/* The 16-bit magic from the top of the header. */
static inline uint16_t rec_magic(const struct tdb_used_record *r)
{
	return (r->magic_and_meta >> TDB_REC_MAGIC_SHIFT);
}
*/ +	uint64_t next; +}; + +static inline uint64_t frec_prev(const struct tdb_free_record *f) +{ +	return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1); +} + +static inline uint64_t frec_magic(const struct tdb_free_record *f) +{ +	return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL); +} + +static inline uint64_t frec_len(const struct tdb_free_record *f) +{ +	return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1); +} + +static inline unsigned frec_ftable(const struct tdb_free_record *f) +{ +	return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL); +} + +struct tdb_recovery_record { +	uint64_t magic; +	/* Length of record (add this header to get total length). */ +	uint64_t max_len; +	/* Length used. */ +	uint64_t len; +	/* Old length of file before transaction. */ +	uint64_t eof; +}; + +/* If we bottom out of the subhashes, we chain. */ +struct tdb_chain { +	tdb_off_t rec[1 << TDB_HASH_GROUP_BITS]; +	tdb_off_t next; +}; + +/* this is stored at the front of every database */ +struct tdb_header { +	char magic_food[64]; /* for /etc/magic */ +	/* FIXME: Make me 32 bit? */ +	uint64_t version; /* version of the code */ +	uint64_t hash_test; /* result of hashing HASH_MAGIC. */ +	uint64_t hash_seed; /* "random" seed written at creation time. */ +	tdb_off_t free_table; /* (First) free table. */ +	tdb_off_t recovery; /* Transaction recovery area. */ + +	uint64_t features_used; /* Features all writers understand */ +	uint64_t features_offered; /* Features offered */ + +	uint64_t seqnum; /* Sequence number for TDB_SEQNUM */ + +	tdb_off_t reserved[23]; + +	/* Top level hash table. */ +	tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS]; +}; + +struct tdb_freetable { +	struct tdb_used_record hdr; +	tdb_off_t next; +	tdb_off_t buckets[TDB_FREE_BUCKETS]; +}; + +/* Information about a particular (locked) hash entry. */ +struct hash_info { +	/* Full hash value of entry. */ +	uint64_t h; +	/* Start and length of lock acquired. 
/* Information about a particular (locked) hash entry. */
struct hash_info {
	/* Full hash value of entry. */
	uint64_t h;
	/* Start and length of lock acquired. */
	tdb_off_t hlock_start;
	tdb_len_t hlock_range;
	/* Start of hash group. */
	tdb_off_t group_start;
	/* Bucket we belong in. */
	unsigned int home_bucket;
	/* Bucket we (or an empty space) were found in. */
	unsigned int found_bucket;
	/* How many bits of the hash are already used. */
	unsigned int hash_used;
	/* Current working group. */
	tdb_off_t group[1 << TDB_HASH_GROUP_BITS];
};

/* Position of a traversal: one entry per hash level being walked. */
struct traverse_info {
	struct traverse_level {
		tdb_off_t hashtable;
		/* We ignore groups here, and treat it as a big array. */
		unsigned entry;
		unsigned int total_buckets;
	} levels[TDB_MAX_LEVELS + 1];
	unsigned int num_levels;
	unsigned int toplevel_group;
	/* This makes delete-everything-inside-traverse work as expected. */
	tdb_off_t prev;
};

/* Bit flags accepted by the locking functions. */
enum tdb_lock_flags {
	/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
	TDB_LOCK_NOWAIT = 0,
	TDB_LOCK_WAIT = 1,
	/* If set, don't log an error on failure. */
	TDB_LOCK_PROBE = 2,
	/* If set, don't check for recovery (used by recovery code). */
	TDB_LOCK_NOCHECK = 4,
};

/* One lock record.
 * NOTE(review): off/count/ltype presumably hold the locked offset, a
 * nesting count and the fcntl lock type - confirm against lock.c. */
struct tdb_lock {
	struct tdb_context *owner;
	uint32_t off;
	uint32_t count;
	uint32_t ltype;
};

/* This is only needed for tdb_access_commit, but used everywhere to
 * simplify. */
struct tdb_access_hdr {
	struct tdb_access_hdr *next;
	tdb_off_t off;
	tdb_len_t len;
	bool convert;
};

/* Per-file state; may be shared by several tdb_context (see refcnt). */
struct tdb_file {
	/* Single list of all TDBs, to detect multiple opens. */
	struct tdb_file *next;

	/* How many are sharing us? */
	unsigned int refcnt;

	/* Mmap (if any), or malloc (for TDB_INTERNAL). */
	void *map_ptr;

	/* How much space has been mapped (<= current file size) */
	tdb_len_t map_size;

	/* The file descriptor (-1 for TDB_INTERNAL). */
	int fd;

	/* Lock information */
	pid_t locker;
	struct tdb_lock allrecord_lock;
	size_t num_lockrecs;
	struct tdb_lock *lockrecs;

	/* Identity of this file. */
	dev_t device;
	ino_t inode;
};

/* Per-open state: what tdb_open() returns and tdb_close() frees. */
struct tdb_context {
	/* Filename of the database. */
	const char *name;

	/* Are we accessing directly? (debugging check). */
	int direct_access;

	/* Operating read-only? (Opened O_RDONLY, or in traverse_read) */
	bool read_only;

	/* mmap read only? */
	int mmap_flags;

	/* the flags passed to tdb_open, for tdb_reopen. */
	uint32_t flags;

	/* Logging function */
	void (*log_fn)(struct tdb_context *tdb,
		       enum tdb_log_level level,
		       const char *message,
		       void *data);
	void *log_data;

	/* Hash function. */
	uint64_t (*hash_fn)(const void *key, size_t len, uint64_t seed, void *);
	void *hash_data;
	uint64_t hash_seed;

	/* low level (fcntl) lock functions. */
	int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *);
	int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *);
	void *lock_data;

	/* Set if we are in a transaction. */
	struct tdb_transaction *transaction;

	/* What free table are we using? */
	tdb_off_t ftable_off;
	unsigned int ftable;

	/* IO methods: changes for transactions. */
	const struct tdb_methods *methods;

	/* Our statistics. */
	struct tdb_attribute_stats stats;

	/* Direct access information */
	struct tdb_access_hdr *access;

	/* Last error we returned. */
	enum TDB_ERROR last_error;

	/* The actual file information */
	struct tdb_file *file;
};

/* Table of low-level IO operations, reached through tdb->methods. */
struct tdb_methods {
	enum TDB_ERROR (*tread)(struct tdb_context *, tdb_off_t, void *,
				tdb_len_t);
	enum TDB_ERROR (*twrite)(struct tdb_context *, tdb_off_t, const void *,
				 tdb_len_t);
	enum TDB_ERROR (*oob)(struct tdb_context *, tdb_off_t, bool);
	enum TDB_ERROR (*expand_file)(struct tdb_context *, tdb_len_t);
	void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
};

/*
  internal prototypes
*/
/* hash.c: */
tdb_bool_err first_in_hash(struct tdb_context *tdb,
			   struct traverse_info *tinfo,
			   TDB_DATA *kbuf, size_t *dlen);

tdb_bool_err next_in_hash(struct tdb_context *tdb,
			  struct traverse_info *tinfo,
			  TDB_DATA *kbuf, size_t *dlen);

/* Hash random memory. */
uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len);

/* Hash on disk. */
uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off);

/* Find and lock a hash entry (or where it would be). */
tdb_off_t find_and_lock(struct tdb_context *tdb,
			struct tdb_data key,
			int ltype,
			struct hash_info *h,
			struct tdb_used_record *rec,
			struct traverse_info *tinfo);

enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
			       struct hash_info *h,
			       tdb_off_t new_off);

enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
			   tdb_off_t new_off);

enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h);

/* For tdb_check */
bool is_subhash(tdb_off_t val);

/* free.c: */
enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb);

/* check.c needs these to iterate through free lists. */
tdb_off_t first_ftable(struct tdb_context *tdb);
tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
/* This returns space or -ve error number. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
		uint64_t hash, unsigned magic, bool growing);

/* Put this record in a free list. */
enum TDB_ERROR add_free_record(struct tdb_context *tdb,
			       tdb_off_t off, tdb_len_t len_with_header,
			       enum tdb_lock_flags waitflag,
			       bool coalesce_ok);

/* Set up header for a used/ftable/htable/chain record. */
enum TDB_ERROR set_header(struct tdb_context *tdb,
			  struct tdb_used_record *rec,
			  unsigned magic, uint64_t keylen, uint64_t datalen,
			  uint64_t actuallen, unsigned hashlow);

/* Used by tdb_check to verify. */
unsigned int size_to_bucket(tdb_len_t data_len);
tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);

/* Used by tdb_summary */
tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off);

/* io.c: */
/* Initialize tdb->methods. */
void tdb_io_init(struct tdb_context *tdb);

/* Convert endian of the buffer if required. */
void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);

/* Unmap and try to map the tdb. */
void tdb_munmap(struct tdb_file *file);
void tdb_mmap(struct tdb_context *tdb);

/* Either alloc a copy, or give direct access.  Release frees or noop. */
const void *tdb_access_read(struct tdb_context *tdb,
			    tdb_off_t off, tdb_len_t len, bool convert);
void *tdb_access_write(struct tdb_context *tdb,
		       tdb_off_t off, tdb_len_t len, bool convert);

/* Release result of tdb_access_read/write. */
void tdb_access_release(struct tdb_context *tdb, const void *p);
/* Commit result of tdb_access_write. */
enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p);

/* Convenience routine to get an offset. */
tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off);

/* Write an offset at an offset. */
enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, tdb_off_t off,
			     tdb_off_t val);

/* Clear an ondisk area. */
enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len);

/* Return a non-zero offset at an index >= start and < end in this array
 * (or end). */
tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb,
			       tdb_off_t base,
			       uint64_t start,
			       uint64_t end);

/* Return a zero offset in this array, or num. */
tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
			    uint64_t num);

/* Allocate and make a copy of some offset. */
void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);

/* Writes a converted copy of a record. */
enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
				 const void *rec, size_t len);

/* Reads record and converts it */
enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
				void *rec, size_t len);

/* Bump the seqnum (caller checks for tdb->flags & TDB_SEQNUM) */
void tdb_inc_seqnum(struct tdb_context *tdb);

/* lock.c: */
/* Lock/unlock a range of hashes. */
enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
			       tdb_off_t hash_lock, tdb_len_t hash_range,
			       int ltype, enum tdb_lock_flags waitflag);
enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
				 tdb_off_t hash_lock,
				 tdb_len_t hash_range, int ltype);

/* For closing the file. */
void tdb_lock_cleanup(struct tdb_context *tdb);

/* Lock/unlock a particular free bucket. */
enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
				    enum tdb_lock_flags waitflag);
void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off);

/* Serialize transaction start. */
enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype);
void tdb_transaction_unlock(struct tdb_context *tdb, int ltype);

/* Do we have any hash locks (ie. via tdb_chainlock) ? */
bool tdb_has_hash_locks(struct tdb_context *tdb);

/* Lock entire database. */
enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
				  enum tdb_lock_flags flags, bool upgradable);
void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb);

/* Serialize db open. */
enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
			     int ltype, enum tdb_lock_flags flags);
void tdb_unlock_open(struct tdb_context *tdb, int ltype);
bool tdb_has_open_lock(struct tdb_context *tdb);

/* Serialize db expand. */
enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype);
void tdb_unlock_expand(struct tdb_context *tdb, int ltype);
bool tdb_has_expansion_lock(struct tdb_context *tdb);

/* If it needs recovery, grab all the locks and do it. */
enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb);

/* Default lock and unlock functions. */
int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *);
int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *);

/* transaction.c: */
enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb);
tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb);

/* tdb.c: */
enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb,
			       enum TDB_ERROR ecode,
			       enum tdb_log_level level,
			       const char *fmt, ...);

/* Tracing hooks: real functions when TDB_TRACE is defined, otherwise
 * macros that expand to nothing so call sites need no #ifdef. */
#ifdef TDB_TRACE
void tdb_trace(struct tdb_context *tdb, const char *op);
void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
void tdb_trace_open(struct tdb_context *tdb, const char *op,
		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
		    TDB_DATA rec);
void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
			TDB_DATA rec, int ret);
void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
			   TDB_DATA rec, TDB_DATA ret);
void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
			     TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
			     int ret);
void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
			   TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
#else
#define tdb_trace(tdb, op)
#define tdb_trace_seqnum(tdb, seqnum, op)
#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
#define tdb_trace_ret(tdb, op, ret)
#define tdb_trace_retrec(tdb, op, ret)
#define tdb_trace_1rec(tdb, op, rec)
#define tdb_trace_1rec_ret(tdb, op, rec, ret)
#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
#endif /* !TDB_TRACE */
+*/ +#include "private.h" +#include <assert.h> +#include <ccan/tally/tally.h> + +static tdb_off_t count_hash(struct tdb_context *tdb, +			    tdb_off_t hash_off, unsigned bits) +{ +	const tdb_off_t *h; +	tdb_off_t count = 0; +	unsigned int i; + +	h = tdb_access_read(tdb, hash_off, sizeof(*h) << bits, true); +	if (TDB_PTR_IS_ERR(h)) { +		return TDB_PTR_ERR(h); +	} +	for (i = 0; i < (1 << bits); i++) +		count += (h[i] != 0); + +	tdb_access_release(tdb, h); +	return count; +} + +static enum TDB_ERROR summarize(struct tdb_context *tdb, +				struct tally *hashes, +				struct tally *ftables, +				struct tally *fr, +				struct tally *keys, +				struct tally *data, +				struct tally *extra, +				struct tally *uncoal, +				struct tally *chains) +{ +	tdb_off_t off; +	tdb_len_t len; +	tdb_len_t unc = 0; + +	for (off = sizeof(struct tdb_header); +	     off < tdb->file->map_size; +	     off += len) { +		const union { +			struct tdb_used_record u; +			struct tdb_free_record f; +			struct tdb_recovery_record r; +		} *p; +		/* We might not be able to get the whole thing. 
*/ +		p = tdb_access_read(tdb, off, sizeof(p->f), true); +		if (TDB_PTR_IS_ERR(p)) { +			return TDB_PTR_ERR(p); +		} +		if (frec_magic(&p->f) != TDB_FREE_MAGIC) { +			if (unc > 1) { +				tally_add(uncoal, unc); +				unc = 0; +			} +		} + +		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC +		    || p->r.magic == TDB_RECOVERY_MAGIC) { +			len = sizeof(p->r) + p->r.max_len; +		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) { +			len = frec_len(&p->f); +			tally_add(fr, len); +			len += sizeof(p->u); +			unc++; +		} else if (rec_magic(&p->u) == TDB_USED_MAGIC) { +			len = sizeof(p->u) +				+ rec_key_length(&p->u) +				+ rec_data_length(&p->u) +				+ rec_extra_padding(&p->u); + +			tally_add(keys, rec_key_length(&p->u)); +			tally_add(data, rec_data_length(&p->u)); +			tally_add(extra, rec_extra_padding(&p->u)); +		} else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) { +			tdb_off_t count = count_hash(tdb, +						     off + sizeof(p->u), +						     TDB_SUBLEVEL_HASH_BITS); +			if (TDB_OFF_IS_ERR(count)) { +				return count; +			} +			tally_add(hashes, count); +			tally_add(extra, rec_extra_padding(&p->u)); +			len = sizeof(p->u) +				+ rec_data_length(&p->u) +				+ rec_extra_padding(&p->u); +		} else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) { +			len = sizeof(p->u) +				+ rec_data_length(&p->u) +				+ rec_extra_padding(&p->u); +			tally_add(ftables, rec_data_length(&p->u)); +			tally_add(extra, rec_extra_padding(&p->u)); +		} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) { +			len = sizeof(p->u) +				+ rec_data_length(&p->u) +				+ rec_extra_padding(&p->u); +			tally_add(chains, 1); +			tally_add(extra, rec_extra_padding(&p->u)); +		} else { +			len = dead_space(tdb, off); +			if (TDB_OFF_IS_ERR(len)) { +				return len; +			} +		} +		tdb_access_release(tdb, p); +	} +	if (unc) +		tally_add(uncoal, unc); +	return TDB_SUCCESS; +} + +#define SUMMARY_FORMAT \ +	"Size of file/data: %zu/%zu\n" \ +	"Number of records: %zu\n" \ +	"Smallest/average/largest keys: %zu/%zu/%zu\n%s" \ +	
"Smallest/average/largest data: %zu/%zu/%zu\n%s" \ +	"Smallest/average/largest padding: %zu/%zu/%zu\n%s" \ +	"Number of free records: %zu\n" \ +	"Smallest/average/largest free records: %zu/%zu/%zu\n%s" \ +	"Number of uncoalesced records: %zu\n" \ +	"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \ +	"Toplevel hash used: %u of %u\n" \ +	"Number of chains: %zu\n" \ +	"Number of subhashes: %zu\n" \ +	"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \ +	"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n" + +#define BUCKET_SUMMARY_FORMAT_A					\ +	"Free bucket %zu: total entries %zu.\n"			\ +	"Smallest/average/largest length: %zu/%zu/%zu\n%s" +#define BUCKET_SUMMARY_FORMAT_B					\ +	"Free bucket %zu-%zu: total entries %zu.\n"		\ +	"Smallest/average/largest length: %zu/%zu/%zu\n%s" + +#define HISTO_WIDTH 70 +#define HISTO_HEIGHT 20 + +enum TDB_ERROR tdb_summary(struct tdb_context *tdb, +			   enum tdb_summary_flags flags, +			   char **summary) +{ +	tdb_len_t len; +	struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal, +		*chains; +	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg; +	enum TDB_ERROR ecode; + +	hashesg = freeg = keysg = datag = extrag = uncoalg = NULL; + +	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false); +	if (ecode != TDB_SUCCESS) { +		return tdb->last_error = ecode; +	} + +	ecode = tdb_lock_expand(tdb, F_RDLCK); +	if (ecode != TDB_SUCCESS) { +		tdb_allrecord_unlock(tdb, F_RDLCK); +		return tdb->last_error = ecode; +	} + +	/* Start stats off empty. 
*/ +	ftables = tally_new(HISTO_HEIGHT); +	hashes = tally_new(HISTO_HEIGHT); +	freet = tally_new(HISTO_HEIGHT); +	keys = tally_new(HISTO_HEIGHT); +	data = tally_new(HISTO_HEIGHT); +	extra = tally_new(HISTO_HEIGHT); +	uncoal = tally_new(HISTO_HEIGHT); +	chains = tally_new(HISTO_HEIGHT); +	if (!ftables || !hashes || !freet || !keys || !data || !extra +	    || !uncoal || !chains) { +		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +				   "tdb_summary: failed to allocate" +				   " tally structures"); +		goto unlock; +	} + +	ecode = summarize(tdb, hashes, ftables, freet, keys, data, extra, +			  uncoal, chains); +	if (ecode != TDB_SUCCESS) { +		goto unlock; +	} + +	if (flags & TDB_SUMMARY_HISTOGRAMS) { +		hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT); +		freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT); +		keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT); +		datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT); +		extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT); +		uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT); +	} + +	/* 20 is max length of a %llu. */ +	len = strlen(SUMMARY_FORMAT) + 33*20 + 1 +		+ (hashesg ? strlen(hashesg) : 0) +		+ (freeg ? strlen(freeg) : 0) +		+ (keysg ? strlen(keysg) : 0) +		+ (datag ? strlen(datag) : 0) +		+ (extrag ? strlen(extrag) : 0) +		+ (uncoalg ? strlen(uncoalg) : 0); + +	*summary = malloc(len); +	if (!*summary) { +		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +				   "tdb_summary: failed to allocate string"); +		goto unlock; +	} + +	sprintf(*summary, SUMMARY_FORMAT, +		(size_t)tdb->file->map_size, +		tally_total(keys, NULL) + tally_total(data, NULL), +		tally_num(keys), +		tally_min(keys), tally_mean(keys), tally_max(keys), +		keysg ? keysg : "", +		tally_min(data), tally_mean(data), tally_max(data), +		datag ? datag : "", +		tally_min(extra), tally_mean(extra), tally_max(extra), +		extrag ? 
extrag : "", +		tally_num(freet), +		tally_min(freet), tally_mean(freet), tally_max(freet), +		freeg ? freeg : "", +		tally_total(uncoal, NULL), +		tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal), +		uncoalg ? uncoalg : "", +		(unsigned)count_hash(tdb, offsetof(struct tdb_header, +						   hashtable), +				     TDB_TOPLEVEL_HASH_BITS), +		1 << TDB_TOPLEVEL_HASH_BITS, +		tally_num(chains), +		tally_num(hashes), +		tally_min(hashes), tally_mean(hashes), tally_max(hashes), +		hashesg ? hashesg : "", +		tally_total(keys, NULL) * 100.0 / tdb->file->map_size, +		tally_total(data, NULL) * 100.0 / tdb->file->map_size, +		tally_total(extra, NULL) * 100.0 / tdb->file->map_size, +		tally_total(freet, NULL) * 100.0 / tdb->file->map_size, +		(tally_num(keys) + tally_num(freet) + tally_num(hashes)) +		* sizeof(struct tdb_used_record) * 100.0 / tdb->file->map_size, +		tally_num(ftables) * sizeof(struct tdb_freetable) +		* 100.0 / tdb->file->map_size, +		(tally_num(hashes) +		 * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) +		 + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS) +		 + sizeof(struct tdb_chain) * tally_num(chains)) +		* 100.0 / tdb->file->map_size); + +unlock: +	free(hashesg); +	free(freeg); +	free(keysg); +	free(datag); +	free(extrag); +	free(uncoalg); +	free(hashes); +	free(freet); +	free(keys); +	free(data); +	free(extra); +	free(uncoal); +	free(ftables); +	free(chains); + +	tdb_allrecord_unlock(tdb, F_RDLCK); +	tdb_unlock_expand(tdb, F_RDLCK); +	return tdb->last_error = ecode; +} diff --git a/lib/tdb2/tdb.c b/lib/tdb2/tdb.c new file mode 100644 index 0000000000..b8b5aac128 --- /dev/null +++ b/lib/tdb2/tdb.c @@ -0,0 +1,484 @@ +#include "private.h" +#include <ccan/asprintf/asprintf.h> +#include <stdarg.h> + +static enum TDB_ERROR update_rec_hdr(struct tdb_context *tdb, +				     tdb_off_t off, +				     tdb_len_t keylen, +				     tdb_len_t datalen, +				     struct tdb_used_record *rec, +				     uint64_t h) +{ +	uint64_t dataroom = rec_data_length(rec) + 
rec_extra_padding(rec); +	enum TDB_ERROR ecode; + +	ecode = set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen, +			   keylen + dataroom, h); +	if (ecode == TDB_SUCCESS) { +		ecode = tdb_write_convert(tdb, off, rec, sizeof(*rec)); +	} +	return ecode; +} + +static enum TDB_ERROR replace_data(struct tdb_context *tdb, +				   struct hash_info *h, +				   struct tdb_data key, struct tdb_data dbuf, +				   tdb_off_t old_off, tdb_len_t old_room, +				   bool growing) +{ +	tdb_off_t new_off; +	enum TDB_ERROR ecode; + +	/* Allocate a new record. */ +	new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC, +			growing); +	if (TDB_OFF_IS_ERR(new_off)) { +		return new_off; +	} + +	/* We didn't like the existing one: remove it. */ +	if (old_off) { +		tdb->stats.frees++; +		ecode = add_free_record(tdb, old_off, +					sizeof(struct tdb_used_record) +					+ key.dsize + old_room, +					TDB_LOCK_WAIT, true); +		if (ecode == TDB_SUCCESS) +			ecode = replace_in_hash(tdb, h, new_off); +	} else { +		ecode = add_to_hash(tdb, h, new_off); +	} +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	new_off += sizeof(struct tdb_used_record); +	ecode = tdb->methods->twrite(tdb, new_off, key.dptr, key.dsize); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	new_off += key.dsize; +	ecode = tdb->methods->twrite(tdb, new_off, dbuf.dptr, dbuf.dsize); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	if (tdb->flags & TDB_SEQNUM) +		tdb_inc_seqnum(tdb); + +	return TDB_SUCCESS; +} + +static enum TDB_ERROR update_data(struct tdb_context *tdb, +				  tdb_off_t off, +				  struct tdb_data dbuf, +				  tdb_len_t extra) +{ +	enum TDB_ERROR ecode; + +	ecode = tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize); +	if (ecode == TDB_SUCCESS && extra) { +		/* Put a zero in; future versions may append other data. 
*/ +		ecode = tdb->methods->twrite(tdb, off + dbuf.dsize, "", 1); +	} +	if (tdb->flags & TDB_SEQNUM) +		tdb_inc_seqnum(tdb); + +	return ecode; +} + +enum TDB_ERROR tdb_store(struct tdb_context *tdb, +			 struct tdb_data key, struct tdb_data dbuf, int flag) +{ +	struct hash_info h; +	tdb_off_t off; +	tdb_len_t old_room = 0; +	struct tdb_used_record rec; +	enum TDB_ERROR ecode; + +	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL); +	if (TDB_OFF_IS_ERR(off)) { +		return tdb->last_error = off; +	} + +	/* Now we have lock on this hash bucket. */ +	if (flag == TDB_INSERT) { +		if (off) { +			ecode = TDB_ERR_EXISTS; +			goto out; +		} +	} else { +		if (off) { +			old_room = rec_data_length(&rec) +				+ rec_extra_padding(&rec); +			if (old_room >= dbuf.dsize) { +				/* Can modify in-place.  Easy! */ +				ecode = update_rec_hdr(tdb, off, +						       key.dsize, dbuf.dsize, +						       &rec, h.h); +				if (ecode != TDB_SUCCESS) { +					goto out; +				} +				ecode = update_data(tdb, +						    off + sizeof(rec) +						    + key.dsize, dbuf, +						    old_room - dbuf.dsize); +				if (ecode != TDB_SUCCESS) { +					goto out; +				} +				tdb_unlock_hashes(tdb, h.hlock_start, +						  h.hlock_range, F_WRLCK); +				return tdb->last_error = TDB_SUCCESS; +			} +		} else { +			if (flag == TDB_MODIFY) { +				/* if the record doesn't exist and we +				   are in TDB_MODIFY mode then we should fail +				   the store */ +				ecode = TDB_ERR_NOEXIST; +				goto out; +			} +		} +	} + +	/* If we didn't use the old record, this implies we're growing. 
*/ +	ecode = replace_data(tdb, &h, key, dbuf, off, old_room, off); +out: +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK); +	return tdb->last_error = ecode; +} + +enum TDB_ERROR tdb_append(struct tdb_context *tdb, +			  struct tdb_data key, struct tdb_data dbuf) +{ +	struct hash_info h; +	tdb_off_t off; +	struct tdb_used_record rec; +	tdb_len_t old_room = 0, old_dlen; +	unsigned char *newdata; +	struct tdb_data new_dbuf; +	enum TDB_ERROR ecode; + +	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL); +	if (TDB_OFF_IS_ERR(off)) { +		return tdb->last_error = off; +	} + +	if (off) { +		old_dlen = rec_data_length(&rec); +		old_room = old_dlen + rec_extra_padding(&rec); + +		/* Fast path: can append in place. */ +		if (rec_extra_padding(&rec) >= dbuf.dsize) { +			ecode = update_rec_hdr(tdb, off, key.dsize, +					       old_dlen + dbuf.dsize, &rec, +					       h.h); +			if (ecode != TDB_SUCCESS) { +				goto out; +			} + +			off += sizeof(rec) + key.dsize + old_dlen; +			ecode = update_data(tdb, off, dbuf, +					    rec_extra_padding(&rec)); +			goto out; +		} + +		/* Slow path. */ +		newdata = malloc(key.dsize + old_dlen + dbuf.dsize); +		if (!newdata) { +			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +					   "tdb_append:" +					   " failed to allocate %zu bytes", +					   (size_t)(key.dsize + old_dlen +						    + dbuf.dsize)); +			goto out; +		} +		ecode = tdb->methods->tread(tdb, off + sizeof(rec) + key.dsize, +					    newdata, old_dlen); +		if (ecode != TDB_SUCCESS) { +			goto out_free_newdata; +		} +		memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize); +		new_dbuf.dptr = newdata; +		new_dbuf.dsize = old_dlen + dbuf.dsize; +	} else { +		newdata = NULL; +		new_dbuf = dbuf; +	} + +	/* If they're using tdb_append(), it implies they're growing record. 
*/ +	ecode = replace_data(tdb, &h, key, new_dbuf, off, old_room, true); + +out_free_newdata: +	free(newdata); +out: +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK); +	return tdb->last_error = ecode; +} + +enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key, +			 struct tdb_data *data) +{ +	tdb_off_t off; +	struct tdb_used_record rec; +	struct hash_info h; +	enum TDB_ERROR ecode; + +	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); +	if (TDB_OFF_IS_ERR(off)) { +		return tdb->last_error = off; +	} + +	if (!off) { +		ecode = TDB_ERR_NOEXIST; +	} else { +		data->dsize = rec_data_length(&rec); +		data->dptr = tdb_alloc_read(tdb, off + sizeof(rec) + key.dsize, +					    data->dsize); +		if (TDB_PTR_IS_ERR(data->dptr)) { +			ecode = TDB_PTR_ERR(data->dptr); +		} else +			ecode = TDB_SUCCESS; +	} + +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); +	return tdb->last_error = ecode; +} + +bool tdb_exists(struct tdb_context *tdb, TDB_DATA key) +{ +	tdb_off_t off; +	struct tdb_used_record rec; +	struct hash_info h; + +	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); +	if (TDB_OFF_IS_ERR(off)) { +		tdb->last_error = off; +		return false; +	} +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); + +	tdb->last_error = TDB_SUCCESS; +	return off ? true : false; +} + +enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key) +{ +	tdb_off_t off; +	struct tdb_used_record rec; +	struct hash_info h; +	enum TDB_ERROR ecode; + +	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL); +	if (TDB_OFF_IS_ERR(off)) { +		return tdb->last_error = off; +	} + +	if (!off) { +		ecode = TDB_ERR_NOEXIST; +		goto unlock; +	} + +	ecode = delete_from_hash(tdb, &h); +	if (ecode != TDB_SUCCESS) { +		goto unlock; +	} + +	/* Free the deleted entry. 
*/ +	tdb->stats.frees++; +	ecode = add_free_record(tdb, off, +				sizeof(struct tdb_used_record) +				+ rec_key_length(&rec) +				+ rec_data_length(&rec) +				+ rec_extra_padding(&rec), +				TDB_LOCK_WAIT, true); + +	if (tdb->flags & TDB_SEQNUM) +		tdb_inc_seqnum(tdb); + +unlock: +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK); +	return tdb->last_error = ecode; +} + +unsigned int tdb_get_flags(struct tdb_context *tdb) +{ +	return tdb->flags; +} + +void tdb_add_flag(struct tdb_context *tdb, unsigned flag) +{ +	if (tdb->flags & TDB_INTERNAL) { +		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, +					     TDB_LOG_USE_ERROR, +					     "tdb_add_flag: internal db"); +		return; +	} +	switch (flag) { +	case TDB_NOLOCK: +		tdb->flags |= TDB_NOLOCK; +		break; +	case TDB_NOMMAP: +		tdb->flags |= TDB_NOMMAP; +		tdb_munmap(tdb->file); +		break; +	case TDB_NOSYNC: +		tdb->flags |= TDB_NOSYNC; +		break; +	case TDB_SEQNUM: +		tdb->flags |= TDB_SEQNUM; +		break; +	case TDB_ALLOW_NESTING: +		tdb->flags |= TDB_ALLOW_NESTING; +		break; +	default: +		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, +					     TDB_LOG_USE_ERROR, +					     "tdb_add_flag: Unknown flag %u", +					     flag); +	} +} + +void tdb_remove_flag(struct tdb_context *tdb, unsigned flag) +{ +	if (tdb->flags & TDB_INTERNAL) { +		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, +					     TDB_LOG_USE_ERROR, +					     "tdb_remove_flag: internal db"); +		return; +	} +	switch (flag) { +	case TDB_NOLOCK: +		tdb->flags &= ~TDB_NOLOCK; +		break; +	case TDB_NOMMAP: +		tdb->flags &= ~TDB_NOMMAP; +		tdb_mmap(tdb); +		break; +	case TDB_NOSYNC: +		tdb->flags &= ~TDB_NOSYNC; +		break; +	case TDB_SEQNUM: +		tdb->flags &= ~TDB_SEQNUM; +		break; +	case TDB_ALLOW_NESTING: +		tdb->flags &= ~TDB_ALLOW_NESTING; +		break; +	default: +		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, +					     TDB_LOG_USE_ERROR, +					     "tdb_remove_flag: Unknown flag %u", +					     flag); +	} +} + +const char 
*tdb_errorstr(enum TDB_ERROR ecode) +{ +	/* Gcc warns if you miss a case in the switch, so use that. */ +	switch (ecode) { +	case TDB_SUCCESS: return "Success"; +	case TDB_ERR_CORRUPT: return "Corrupt database"; +	case TDB_ERR_IO: return "IO Error"; +	case TDB_ERR_LOCK: return "Locking error"; +	case TDB_ERR_OOM: return "Out of memory"; +	case TDB_ERR_EXISTS: return "Record exists"; +	case TDB_ERR_EINVAL: return "Invalid parameter"; +	case TDB_ERR_NOEXIST: return "Record does not exist"; +	case TDB_ERR_RDONLY: return "write not permitted"; +	} +	return "Invalid error code"; +} + +enum TDB_ERROR tdb_error(struct tdb_context *tdb) +{ +	return tdb->last_error; +} + +enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb, +			       enum TDB_ERROR ecode, +			       enum tdb_log_level level, +			       const char *fmt, ...) +{ +	char *message; +	va_list ap; +	int len; /* Signed: vasprintf() reports failure as a negative int. */ +	/* tdb_open paths care about errno, so save it. */ +	int saved_errno = errno; + +	if (!tdb->log_fn) +		return ecode; + +	va_start(ap, fmt); +	len = vasprintf(&message, fmt, ap); +	va_end(ap); + +	if (len < 0) { +		tdb->log_fn(tdb, TDB_LOG_ERROR, +			    "out of memory formatting message:", tdb->log_data); +		tdb->log_fn(tdb, level, fmt, tdb->log_data); +	} else { +		tdb->log_fn(tdb, level, message, tdb->log_data); +		free(message); +	} +	errno = saved_errno; +	return ecode; +} + +enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb, +				 TDB_DATA key, +				 enum TDB_ERROR (*parse)(TDB_DATA k, +							 TDB_DATA d, +							 void *data), +				 void *data) +{ +	tdb_off_t off; +	struct tdb_used_record rec; +	struct hash_info h; +	enum TDB_ERROR ecode; + +	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); +	if (TDB_OFF_IS_ERR(off)) { +		return tdb->last_error = off; +	} + +	if (!off) { +		ecode = TDB_ERR_NOEXIST; +	} else { +		const void *dptr; +		dptr = tdb_access_read(tdb, off + sizeof(rec) + key.dsize, +				       rec_data_length(&rec), false); +		if (TDB_PTR_IS_ERR(dptr)) { +			ecode = 
TDB_PTR_ERR(dptr); +		} else { +			TDB_DATA d = tdb_mkdata(dptr, rec_data_length(&rec)); + +			ecode = parse(key, d, data); +			tdb_access_release(tdb, dptr); +		} +	} + +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); +	return tdb->last_error = ecode; +} + +const char *tdb_name(const struct tdb_context *tdb) +{ +	return tdb->name; +} + +int64_t tdb_get_seqnum(struct tdb_context *tdb) +{ +	tdb_off_t off = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum)); +	if (TDB_OFF_IS_ERR(off)) +		tdb->last_error = off; +	else +		tdb->last_error = TDB_SUCCESS; +	return off; +} + + +int tdb_fd(const struct tdb_context *tdb) +{ +	return tdb->file->fd; +} diff --git a/lib/tdb2/tdb2.h b/lib/tdb2/tdb2.h new file mode 100644 index 0000000000..c6e09e9f16 --- /dev/null +++ b/lib/tdb2/tdb2.h @@ -0,0 +1,846 @@ +#ifndef CCAN_TDB2_H +#define CCAN_TDB2_H + +/* +   TDB version 2: trivial database library + +   Copyright (C) Andrew Tridgell 1999-2004 +   Copyright (C) Rusty Russell 2010-2011 + +     ** NOTE! The following LGPL license applies to the tdb +     ** library. This does NOT imply that all of Samba is released +     ** under the LGPL + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. + +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ + +#ifdef  __cplusplus +extern "C" { +#endif + +#ifndef _SAMBA_BUILD_ +/* For mode_t */ +#include <sys/types.h> +/* For O_* flags. 
*/ +#include <sys/stat.h> +/* For sig_atomic_t. */ +#include <signal.h> +/* For uint64_t */ +#include <stdint.h> +/* For bool */ +#include <stdbool.h> +/* For memcmp */ +#include <string.h> +#endif +#include <ccan/compiler/compiler.h> +#include <ccan/typesafe_cb/typesafe_cb.h> +#include <ccan/cast/cast.h> + +union tdb_attribute; +struct tdb_context; + +/** + * tdb_open - open a database file + * @name: the file name (can be NULL if flags contains TDB_INTERNAL) + * @tdb_flags: options for this database + * @open_flags: flags argument for tdb's open() call. + * @mode: mode argument for tdb's open() call. + * @attributes: linked list of extra attributes for this tdb. + * + * This call opens (and potentially creates) a database file. + * Multiple processes can have the TDB file open at once. + * + * On failure it will return NULL, and set errno: it may also call + * any log attribute found in @attributes. + * + * See also: + *	union tdb_attribute + */ +struct tdb_context *tdb_open(const char *name, int tdb_flags, +			     int open_flags, mode_t mode, +			     union tdb_attribute *attributes); + + +/* flags for tdb_open() */ +#define TDB_DEFAULT 0 /* just a readability place holder */ +#define TDB_INTERNAL 2 /* don't store on disk */ +#define TDB_NOLOCK   4 /* don't do any locking */ +#define TDB_NOMMAP   8 /* don't use mmap */ +#define TDB_CONVERT 16 /* convert endian */ +#define TDB_NOSYNC   64 /* don't use synchronous transactions */ +#define TDB_SEQNUM   128 /* maintain a sequence number */ +#define TDB_ALLOW_NESTING   256 /* fake nested transactions */ + +/** + * tdb_close - close and free a tdb. + * @tdb: the tdb context returned from tdb_open() + * + * This always succeeds, in that @tdb is unusable after this call.  But if + * some unexpected error occurred while closing, it will return non-zero + * (the only clue as to cause will be via the log attribute). 
+ */ +int tdb_close(struct tdb_context *tdb); + +/** + * struct tdb_data - representation of keys or values. + * @dptr: the data pointer + * @dsize: the size of the data pointed to by dptr. + * + * This is the "blob" representation of keys and data used by TDB. + */ +typedef struct tdb_data { +	unsigned char *dptr; +	size_t dsize; +} TDB_DATA; + +/** + * enum TDB_ERROR - error returns for TDB + * + * See Also: + *	tdb_errorstr() + */ +enum TDB_ERROR { +	TDB_SUCCESS	= 0,	/* No error. */ +	TDB_ERR_CORRUPT = -1,	/* We read the db, and it was bogus. */ +	TDB_ERR_IO	= -2,	/* We couldn't read/write the db. */ +	TDB_ERR_LOCK	= -3,	/* Locking failed. */ +	TDB_ERR_OOM	= -4,	/* Out of Memory. */ +	TDB_ERR_EXISTS	= -5,	/* The key already exists. */ +	TDB_ERR_NOEXIST	= -6,	/* The key does not exist. */ +	TDB_ERR_EINVAL	= -7,	/* You're using it wrong. */ +	TDB_ERR_RDONLY	= -8,	/* The database is read-only. */ +	TDB_ERR_LAST = TDB_ERR_RDONLY +}; + +/** + * tdb_store - store a key/value pair in a tdb. + * @tdb: the tdb context returned from tdb_open() + * @key: the key + * @dbuf: the data to associate with the key. + * @flag: TDB_REPLACE, TDB_INSERT or TDB_MODIFY. + * + * This inserts (or overwrites) a key/value pair in the TDB.  If flag + * is TDB_REPLACE, it doesn't matter whether the key exists or not; + * TDB_INSERT means it must not exist (returns TDB_ERR_EXISTS otherwise), + * and TDB_MODIFY means it must exist (returns TDB_ERR_NOEXIST otherwise). + * + * On success, this returns TDB_SUCCESS. + * + * See also: + *	tdb_fetch, tdb_transaction_start, tdb_append, tdb_delete. + */ +enum TDB_ERROR tdb_store(struct tdb_context *tdb, +			 struct tdb_data key, +			 struct tdb_data dbuf, +			 int flag); + +/* flags to tdb_store() */ +#define TDB_REPLACE 1		/* A readability place holder */ +#define TDB_INSERT 2 		/* Don't overwrite an existing entry */ +#define TDB_MODIFY 3		/* Don't create a new entry */ + +/** + * tdb_fetch - fetch a value from a tdb.
+ * @tdb: the tdb context returned from tdb_open() + * @key: the key + * @data: pointer to data. + * + * This looks up a key in the database and sets it in @data. + * + * If it returns TDB_SUCCESS, the key was found: it is your + * responsibility to call free() on @data->dptr. + * + * Otherwise, it returns an error (usually, TDB_ERR_NOEXIST) and @data is + * undefined. + */ +enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key, +			 struct tdb_data *data); + +/** + * tdb_errorstr - map the tdb error onto a constant readable string + * @ecode: the enum TDB_ERROR to map. + * + * This is useful for displaying errors to users. + */ +const char *tdb_errorstr(enum TDB_ERROR ecode); + +/** + * tdb_append - append a value to a key/value pair in a tdb. + * @tdb: the tdb context returned from tdb_open() + * @key: the key + * @dbuf: the data to append. + * + * This is equivalent to fetching a record, reallocating .dptr to add the + * data, and writing it back, only it's much more efficient.  If the key + * doesn't exist, it's equivalent to tdb_store (with an additional hint that + * you expect to expand the record in future). + * + * See Also: + *	tdb_fetch(), tdb_store() + */ +enum TDB_ERROR tdb_append(struct tdb_context *tdb, +			  struct tdb_data key, struct tdb_data dbuf); + +/** + * tdb_delete - delete a key from a tdb. + * @tdb: the tdb context returned from tdb_open() + * @key: the key to delete. + * + * Returns TDB_SUCCESS on success, or an error (usually TDB_ERR_NOEXIST). + * + * See Also: + *	tdb_fetch(), tdb_store() + */ +enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key); + +/** + * tdb_exists - does a key exist in the database? + * @tdb: the tdb context returned from tdb_open() + * @key: the key to search for. + * + * Returns true if it exists, or false if it doesn't or any other error. + */ +bool tdb_exists(struct tdb_context *tdb, TDB_DATA key); + +/** + * tdb_deq - are struct tdb_data equal? 
+ * @a: one struct tdb_data + * @b: another struct tdb_data + */ +static inline bool tdb_deq(struct tdb_data a, struct tdb_data b) +{ +	return a.dsize == b.dsize && memcmp(a.dptr, b.dptr, a.dsize) == 0; +} + +/** + * tdb_mkdata - make a struct tdb_data from const data + * @p: the constant pointer + * @len: the length + * + * As the dptr member of struct tdb_data is not constant, you need to + * cast it.  This function keeps those casts in one place, as well as + * suppressing the warning some compilers give when casting away a + * qualifier (eg. gcc with -Wcast-qual) + */ +static inline struct tdb_data tdb_mkdata(const void *p, size_t len) +{ +	struct tdb_data d; +	d.dptr = cast_const(void *, p); +	d.dsize = len; +	return d; +} + +/** + * tdb_transaction_start - start a transaction + * @tdb: the tdb context returned from tdb_open() + * + * This begins a series of atomic operations.  Other processes will be able + * to read the tdb, but not alter it (they will block), nor will they see + * any changes until tdb_transaction_commit() is called. + * + * Note that if the TDB_ALLOW_NESTING flag is set, a tdb_transaction_start() + * within a transaction will succeed, but it's not a real transaction: + * (1) An inner transaction which is committed is not actually committed until + *     the outer transaction is; if the outer transaction is cancelled, the + *     inner ones are discarded. + * (2) tdb_transaction_cancel() marks the outer transaction as having an error, + *     so the final tdb_transaction_commit() will fail. + * (3) the outer transaction will see the results of the inner transaction. + * + * See Also: + *	tdb_transaction_cancel, tdb_transaction_commit. + */ +enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb); + +/** + * tdb_transaction_cancel - abandon a transaction + * @tdb: the tdb context returned from tdb_open() + * + * This aborts a transaction, discarding any changes which were made. + * tdb_close() does this implicitly.
+ */ +void tdb_transaction_cancel(struct tdb_context *tdb); + +/** + * tdb_transaction_commit - commit a transaction + * @tdb: the tdb context returned from tdb_open() + * + * This completes a transaction, writing any changes which were made. + * + * fsync() is used to commit the transaction (unless TDB_NOSYNC is set), + * making it robust against machine crashes, but very slow compared to + * other TDB operations. + * + * A failure can only be caused by unexpected errors (eg. I/O or + * memory); there is no point looping on transaction failure. + * + * See Also: + *	tdb_transaction_prepare_commit() + */ +enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb); + +/** + * tdb_transaction_prepare_commit - prepare to commit a transaction + * @tdb: the tdb context returned from tdb_open() + * + * This ensures we have the resources to commit a transaction (using + * tdb_transaction_commit): if this succeeds then a transaction will only + * fail if the write() or fsync() calls fail. + * + * If this fails you must still call tdb_transaction_cancel() to cancel + * the transaction. + * + * See Also: + *	tdb_transaction_commit() + */ +enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb); + +/** + * tdb_traverse - traverse a TDB + * @tdb: the tdb context returned from tdb_open() + * @fn: the function to call for every key/value pair (or NULL) + * @p: the pointer to hand to @fn + * + * This walks the TDB until all the keys have been traversed, or @fn + * returns non-zero.  If the traverse function or other processes are + * changing data or adding or deleting keys, the traverse may be + * unreliable: keys may be skipped or (rarely) visited twice. + * + * There is one specific exception: the special case of deleting the + * current key does not undermine the reliability of the traversal. + * + * On success, returns the number of keys iterated.  On error returns + * a negative enum TDB_ERROR value.
+ */ +#define tdb_traverse(tdb, fn, p)					\ +	tdb_traverse_(tdb, typesafe_cb_preargs(int, void *, (fn), (p),	\ +					       struct tdb_context *,	\ +					       TDB_DATA, TDB_DATA), (p)) + +int64_t tdb_traverse_(struct tdb_context *tdb, +		      int (*fn)(struct tdb_context *, +				TDB_DATA, TDB_DATA, void *), void *p); + +/** + * tdb_parse_record - operate directly on data in the database. + * @tdb: the tdb context returned from tdb_open() + * @key: the key whose record we should hand to @parse + * @parse: the function to call for the data + * @data: the private pointer to hand to @parse (types must match). + * + * This avoids a copy for many cases, by handing you a pointer into + * the memory-mapped database.  It also locks the record to prevent + * other accesses at the same time. + * + * Do not alter the data handed to parse()! + */ +#define tdb_parse_record(tdb, key, parse, data)				\ +	tdb_parse_record_((tdb), (key),					\ +			  typesafe_cb_preargs(enum TDB_ERROR, void *,	\ +					      (parse), (data),		\ +					      TDB_DATA, TDB_DATA), (data)) + +enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb, +				 TDB_DATA key, +				 enum TDB_ERROR (*parse)(TDB_DATA k, +							 TDB_DATA d, +							 void *data), +				 void *data); + +/** + * tdb_get_seqnum - get a database sequence number + * @tdb: the tdb context returned from tdb_open() + * + * This returns a sequence number: any change to the database from a + * tdb context opened with the TDB_SEQNUM flag will cause that number + * to increment.  Note that the incrementing is unreliable (it is done + * without locking), so this is only useful as an optimization. + * + * For example, you may have a regular database backup routine which + * does not operate if the sequence number is unchanged.  In the + * unlikely event of a failed increment, it will be backed up next + * time any way. + * + * Returns an enum TDB_ERROR (ie. negative) on error. 
+ */ +int64_t tdb_get_seqnum(struct tdb_context *tdb); + +/** + * tdb_firstkey - get the "first" key in a TDB + * @tdb: the tdb context returned from tdb_open() + * @key: pointer to key. + * + * This returns an arbitrary key in the database; with tdb_nextkey() it allows + * open-coded traversal of the database, though it is slightly less efficient + * than tdb_traverse. + * + * It is your responsibility to free @key->dptr on success. + * + * Returns TDB_ERR_NOEXIST if the database is empty. + */ +enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key); + +/** + * tdb_nextkey - get the "next" key in a TDB + * @tdb: the tdb context returned from tdb_open() + * @key: a key returned by tdb_firstkey() or tdb_nextkey(). + * + * This returns another key in the database; it will free @key.dptr for + * your convenience. + * + * Returns TDB_ERR_NOEXIST if there are no more keys. + */ +enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key); + +/** + * tdb_chainlock - lock a record in the TDB + * @tdb: the tdb context returned from tdb_open() + * @key: the key to lock. + * + * This prevents any access occurring to a group of keys including @key, + * even if @key does not exist.  This allows primitive atomic updates of + * records without using transactions. + * + * You cannot begin a transaction while holding a tdb_chainlock(), nor can + * you do any operations on any other keys in the database.  This also means + * that you cannot hold more than one tdb_chainlock() at a time. + * + * See Also: + *	tdb_chainunlock() + */ +enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key); + +/** + * tdb_chainunlock - unlock a record in the TDB + * @tdb: the tdb context returned from tdb_open() + * @key: the key to unlock. + * + * The key must have previously been locked by tdb_chainlock(). 
+ */ +void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key); + +/** + * tdb_chainlock_read - lock a record in the TDB, for reading + * @tdb: the tdb context returned from tdb_open() + * @key: the key to lock. + * + * This prevents any changes from occurring to a group of keys including @key, + * even if @key does not exist.  This allows primitive atomic updates of + * records without using transactions. + * + * You cannot begin a transaction while holding a tdb_chainlock_read(), nor can + * you do any operations on any other keys in the database.  This also means + * that you cannot hold more than one tdb_chainlock()/read() at a time. + * + * See Also: + *	tdb_chainlock() + */ +enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key); + +/** + * tdb_chainunlock_read - unlock a record in the TDB for reading + * @tdb: the tdb context returned from tdb_open() + * @key: the key to unlock. + * + * The key must have previously been locked by tdb_chainlock_read(). + */ +void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key); + +/** + * tdb_lockall - lock the entire TDB + * @tdb: the tdb context returned from tdb_open() + * + * You cannot hold a tdb_chainlock while calling this.  It nests, so you + * must call tdb_unlockall as many times as you call tdb_lockall. + */ +enum TDB_ERROR tdb_lockall(struct tdb_context *tdb); + +/** + * tdb_unlockall - unlock the entire TDB + * @tdb: the tdb context returned from tdb_open() + */ +void tdb_unlockall(struct tdb_context *tdb); + +/** + * tdb_lockall_read - lock the entire TDB for reading + * @tdb: the tdb context returned from tdb_open() + * + * This prevents others writing to the database, eg. tdb_delete, tdb_store, + * tdb_append, but not tdb_fetch. + * + * You cannot hold a tdb_chainlock while calling this.  It nests, so you + * must call tdb_unlockall_read as many times as you call tdb_lockall_read. 
+ */ +enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb); + +/** + * tdb_unlockall_read - unlock the entire TDB for reading + * @tdb: the tdb context returned from tdb_open() + */ +void tdb_unlockall_read(struct tdb_context *tdb); + +/** + * tdb_wipe_all - wipe the database clean + * @tdb: the tdb context returned from tdb_open() + * + * Completely erase the database.  This is faster than iterating through + * each key and doing tdb_delete. + */ +enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb); + +/** + * tdb_check - check a TDB for consistency + * @tdb: the tdb context returned from tdb_open() + * @check: function to check each key/data pair (or NULL) + * @data: argument for @check, must match type. + * + * This performs a consistency check of the open database, optionally calling + * a check() function on each record so you can do your own data consistency + * checks as well.  If check() returns an error, that is returned from + * tdb_check(). + * + * Returns TDB_SUCCESS or an error. + */ +#define tdb_check(tdb, check, data)					\ +	tdb_check_((tdb), typesafe_cb_preargs(enum TDB_ERROR, void *,	\ +					      (check), (data),		\ +					      struct tdb_data,		\ +					      struct tdb_data),		\ +		   (data)) + +enum TDB_ERROR tdb_check_(struct tdb_context *tdb, +			  enum TDB_ERROR (*check)(struct tdb_data k, +						  struct tdb_data d, +						  void *data), +			  void *data); + +/** + * tdb_error - get the last error (not threadsafe) + * @tdb: the tdb context returned from tdb_open() + * + * Returns the last error returned by a TDB function. + * + * This makes porting from TDB1 easier, but note that the last error is not + * reliable in threaded programs. + */ +enum TDB_ERROR tdb_error(struct tdb_context *tdb); + +/** + * enum tdb_summary_flags - flags for tdb_summary. + */ +enum tdb_summary_flags { +	TDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. 
*/ +}; + +/** + * tdb_summary - return a string describing the TDB state + * @tdb: the tdb context returned from tdb_open() + * @flags: flags to control the summary output. + * @summary: pointer to string to allocate. + * + * This returns a developer-readable string describing the overall + * state of the tdb, such as the percentage used and sizes of records. + * It is designed to provide information about the tdb at a glance + * without displaying any keys or data in the database. + * + * On success, sets @summary to point to a malloc()'ed nul-terminated + * multi-line string.  It is your responsibility to free() it. + */ +enum TDB_ERROR tdb_summary(struct tdb_context *tdb, +			   enum tdb_summary_flags flags, +			   char **summary); + + +/** + * tdb_get_flags - return the flags for a tdb + * @tdb: the tdb context returned from tdb_open() + * + * This returns the flags on the current tdb.  Some of these are caused by + * the flags argument to tdb_open(), others (such as TDB_CONVERT) are + * intuited. + */ +unsigned int tdb_get_flags(struct tdb_context *tdb); + +/** + * tdb_add_flag - set a flag for a tdb + * @tdb: the tdb context returned from tdb_open() + * @flag: TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or TDB_ALLOW_NESTING. + * + * You can use this to set a flag on the TDB.  You cannot set these flags + * on a TDB_INTERNAL tdb. + */ +void tdb_add_flag(struct tdb_context *tdb, unsigned flag); + +/** + * tdb_remove_flag - unset a flag for a tdb + * @tdb: the tdb context returned from tdb_open() + * @flag: TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or TDB_ALLOW_NESTING. + * + * You can use this to clear a flag on the TDB.  You cannot clear flags + * on a TDB_INTERNAL tdb. + */ +void tdb_remove_flag(struct tdb_context *tdb, unsigned flag); + +/** + * enum tdb_attribute_type - discriminator for union tdb_attribute.
*/ +enum tdb_attribute_type { +	TDB_ATTRIBUTE_LOG = 0, +	TDB_ATTRIBUTE_HASH = 1, +	TDB_ATTRIBUTE_SEED = 2, +	TDB_ATTRIBUTE_STATS = 3, +	TDB_ATTRIBUTE_OPENHOOK = 4, +	TDB_ATTRIBUTE_FLOCK = 5 +}; + +/** + * tdb_get_attribute - get an attribute for an existing tdb + * @tdb: the tdb context returned from tdb_open() + * @attr: the union tdb_attribute to fill in. + * + * This gets an attribute from a TDB which has previously been set (or + * may return the default values).  Set @attr.base.attr to the + * attribute type you want to get. + * + * Currently this does not work for TDB_ATTRIBUTE_OPENHOOK. + */ +enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb, +				 union tdb_attribute *attr); + +/** + * tdb_set_attribute - set an attribute for an existing tdb + * @tdb: the tdb context returned from tdb_open() + * @attr: the union tdb_attribute to set. + * + * This sets an attribute on a TDB, overriding any previous attribute + * of the same type.  It returns TDB_ERR_EINVAL if the attribute is + * unknown or invalid. + * + * Note that TDB_ATTRIBUTE_HASH, TDB_ATTRIBUTE_SEED and + * TDB_ATTRIBUTE_OPENHOOK cannot currently be set after tdb_open. + */ +enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb, +				 const union tdb_attribute *attr); + +/** + * tdb_unset_attribute - reset an attribute for an existing tdb + * @tdb: the tdb context returned from tdb_open() + * @type: the attribute type to unset. + * + * This unsets an attribute on a TDB, returning it to the defaults + * (where applicable). + * + * Note that it only makes sense for TDB_ATTRIBUTE_LOG and TDB_ATTRIBUTE_FLOCK + * to be unset. + */ +void tdb_unset_attribute(struct tdb_context *tdb, +			 enum tdb_attribute_type type); + +/** + * tdb_name - get the name of a tdb + * @tdb: the tdb context returned from tdb_open() + * + * This returns a copy of the name string, made at tdb_open() time.  If that + * argument was NULL (possible for a TDB_INTERNAL db) this will return NULL. 
+ * + * This is mostly useful for logging. + */ +const char *tdb_name(const struct tdb_context *tdb); + +/** + * tdb_fd - get the file descriptor of a tdb + * @tdb: the tdb context returned from tdb_open() + * + * This returns the file descriptor for the underlying database file, or -1 + * for TDB_INTERNAL. + */ +int tdb_fd(const struct tdb_context *tdb); + +/** + * struct tdb_attribute_base - common fields for all tdb attributes. + */ +struct tdb_attribute_base { +	enum tdb_attribute_type attr; +	union tdb_attribute *next; +}; + +/** + * enum tdb_log_level - log levels for tdb_attribute_log + * @TDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors + *		   or internal consistency failures. + * @TDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters + *		   or writing to a read-only database. + * @TDB_LOG_WARNING: used for informational messages on issues which + *		     are unusual but handled by TDB internally, such + *		     as a failure to mmap or failure to open /dev/urandom. + */ +enum tdb_log_level { +	TDB_LOG_ERROR, +	TDB_LOG_USE_ERROR, +	TDB_LOG_WARNING +}; + +/** + * struct tdb_attribute_log - log function attribute + * + * This attribute provides a hook for you to log errors. + */ +struct tdb_attribute_log { +	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ +	void (*fn)(struct tdb_context *tdb, +		   enum tdb_log_level level, +		   const char *message, +		   void *data); +	void *data; +}; + +/** + * struct tdb_attribute_hash - hash function attribute + * + * This attribute allows you to provide an alternative hash function. + * This hash function will be handed keys from the database; it will also + * be handed the 8-byte TDB_HASH_MAGIC value for checking the header (the + * tdb_open() will fail if the hash value doesn't match the header). + * + * Note that if your hash function gives different results on + * different machine endians, your tdb will no longer work across + * different architectures! 
+ */ +struct tdb_attribute_hash { +	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ +	uint64_t (*fn)(const void *key, size_t len, uint64_t seed, +		       void *data); +	void *data; +}; + +/** + * struct tdb_attribute_seed - hash function seed attribute + * + * The hash function seed is normally taken from /dev/urandom (or equivalent) + * but can be set manually here.  This is mainly for testing purposes. + */ +struct tdb_attribute_seed { +	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_SEED */ +	uint64_t seed; +}; + +/** + * struct tdb_attribute_stats - tdb operational statistics + * + * This attribute records statistics of various low-level TDB operations. + * This can be used to assist performance evaluation.  This is only + * useful for tdb_get_attribute(). + * + * New fields will be added at the end, hence the "size" argument which + * indicates how large your structure is: it must be filled in before + * calling tdb_get_attribute(), which will overwrite it with the size + * tdb knows about. 
+ */ +struct tdb_attribute_stats { +	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */ +	size_t size; /* = sizeof(struct tdb_attribute_stats) */ +	uint64_t allocs; +	uint64_t   alloc_subhash; +	uint64_t   alloc_chain; +	uint64_t   alloc_bucket_exact; +	uint64_t   alloc_bucket_max; +	uint64_t   alloc_leftover; +	uint64_t   alloc_coalesce_tried; +	uint64_t     alloc_coalesce_iterate_clash; +	uint64_t     alloc_coalesce_lockfail; +	uint64_t     alloc_coalesce_race; +	uint64_t     alloc_coalesce_succeeded; +	uint64_t       alloc_coalesce_num_merged; +	uint64_t compares; +	uint64_t   compare_wrong_bucket; +	uint64_t   compare_wrong_offsetbits; +	uint64_t   compare_wrong_keylen; +	uint64_t   compare_wrong_rechash; +	uint64_t   compare_wrong_keycmp; +	uint64_t transactions; +	uint64_t   transaction_cancel; +	uint64_t   transaction_nest; +	uint64_t   transaction_expand_file; +	uint64_t   transaction_read_direct; +	uint64_t      transaction_read_direct_fail; +	uint64_t   transaction_write_direct; +	uint64_t      transaction_write_direct_fail; +	uint64_t expands; +	uint64_t frees; +	uint64_t locks; +	uint64_t   lock_lowlevel; +	uint64_t   lock_nonblock; +	uint64_t     lock_nonblock_fail; +}; + +/** + * struct tdb_attribute_openhook - tdb special effects hook for open + * + * This attribute contains a function to call once we have the OPEN_LOCK + * for the tdb, but before we've examined its contents.  If this succeeds, + * the tdb will be populated if it's then zero-length. + * + * This is a hack to allow support for TDB1-style TDB_CLEAR_IF_FIRST + * behaviour. + */ +struct tdb_attribute_openhook { +	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_OPENHOOK */ +	enum TDB_ERROR (*fn)(int fd, void *data); +	void *data; +}; + +/** + * struct tdb_attribute_flock - tdb special effects hook for file locking + * + * This attribute contains function to call to place locks on a file; it can + * be used to support non-blocking operations or lock proxying. 
+ * + * They should return 0 on success, -1 on failure and set errno. + * + * An error will be logged on error if errno is neither EAGAIN nor EINTR + * (normally it would only return EAGAIN if waitflag is false, and + * loop internally on EINTR). + */ +struct tdb_attribute_flock { +	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */ +	int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *); +	int (*unlock)(int fd, int rw, off_t off, off_t len, void *); +	void *data; +}; + +/** + * union tdb_attribute - tdb attributes. + * + * This represents all the known attributes. + * + * See also: + *	struct tdb_attribute_log, struct tdb_attribute_hash, + *	struct tdb_attribute_seed, struct tdb_attribute_stats, + *	struct tdb_attribute_openhook, struct tdb_attribute_flock. + */ +union tdb_attribute { +	struct tdb_attribute_base base; +	struct tdb_attribute_log log; +	struct tdb_attribute_hash hash; +	struct tdb_attribute_seed seed; +	struct tdb_attribute_stats stats; +	struct tdb_attribute_openhook openhook; +	struct tdb_attribute_flock flock; +}; + +#ifdef  __cplusplus +} +#endif + +#endif /* tdb2.h */ diff --git a/lib/tdb2/test/external-agent.c b/lib/tdb2/test/external-agent.c new file mode 100644 index 0000000000..055b5de736 --- /dev/null +++ b/lib/tdb2/test/external-agent.c @@ -0,0 +1,250 @@ +#include "external-agent.h" +#include "logging.h" +#include "lock-tracking.h" +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <err.h> +#include <fcntl.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include <errno.h> +#include <ccan/tdb2/private.h> +#include <ccan/tap/tap.h> +#include <stdio.h> +#include <stdarg.h> + +static struct tdb_context *tdb; + +static enum TDB_ERROR clear_if_first(int fd, void *arg) +{ +/* We hold a lock offset 63 always, so we can tell if anyone is holding it. 
*/ +	struct flock fl; + +	fl.l_type = F_WRLCK; +	fl.l_whence = SEEK_SET; +	fl.l_start = 63; +	fl.l_len = 1; + +	if (fcntl(fd, F_SETLK, &fl) == 0) { +		/* We must be first ones to open it! */ +		diag("agent truncating file!"); +		if (ftruncate(fd, 0) != 0) { +			return TDB_ERR_IO; +		} +	} +	fl.l_type = F_RDLCK; +	if (fcntl(fd, F_SETLKW, &fl) != 0) { +		return TDB_ERR_IO; +	} +	return TDB_SUCCESS; +} + +static enum agent_return do_operation(enum operation op, const char *name) +{ +	TDB_DATA k; +	enum agent_return ret; +	TDB_DATA data; +	enum TDB_ERROR ecode; +	union tdb_attribute cif; + +	if (op != OPEN && op != OPEN_WITH_HOOK && !tdb) { +		diag("external: No tdb open!"); +		return OTHER_FAILURE; +	} + +	diag("external: %s", operation_name(op)); + +	k = tdb_mkdata(name, strlen(name)); + +	locking_would_block = 0; +	switch (op) { +	case OPEN: +		if (tdb) { +			diag("Already have tdb %s open", tdb->name); +			return OTHER_FAILURE; +		} +		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &tap_log_attr); +		if (!tdb) { +			if (!locking_would_block) +				diag("Opening tdb gave %s", strerror(errno)); +			forget_locking(); +			ret = OTHER_FAILURE; +		} else +			ret = SUCCESS; +		break; +	case OPEN_WITH_HOOK: +		if (tdb) { +			diag("Already have tdb %s open", tdb->name); +			return OTHER_FAILURE; +		} +		cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK; +		cif.openhook.base.next = &tap_log_attr; +		cif.openhook.fn = clear_if_first; +		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &cif); +		if (!tdb) { +			if (!locking_would_block) +				diag("Opening tdb gave %s", strerror(errno)); +			forget_locking(); +			ret = OTHER_FAILURE; +		} else +			ret = SUCCESS; +		break; +	case FETCH: +		ecode = tdb_fetch(tdb, k, &data); +		if (ecode == TDB_ERR_NOEXIST) { +			ret = FAILED; +		} else if (ecode < 0) { +			ret = OTHER_FAILURE; +		} else if (!tdb_deq(data, k)) { +			ret = OTHER_FAILURE; +			free(data.dptr); +		} else { +			ret = SUCCESS; +			free(data.dptr); +		} +		break; +	case STORE: +		
ret = tdb_store(tdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE; +		break; +	case TRANSACTION_START: +		ret = tdb_transaction_start(tdb) == 0 ? SUCCESS : OTHER_FAILURE; +		break; +	case TRANSACTION_COMMIT: +		ret = tdb_transaction_commit(tdb)==0 ? SUCCESS : OTHER_FAILURE; +		break; +	case NEEDS_RECOVERY: +		ret = tdb_needs_recovery(tdb) ? SUCCESS : FAILED; +		break; +	case CHECK: +		ret = tdb_check(tdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE; +		break; +	case CLOSE: +		ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE; +		tdb = NULL; +		break; +	case SEND_SIGNAL: +		/* We do this async */ +		ret = SUCCESS; +		break; +	default: +		ret = OTHER_FAILURE; +	} + +	if (locking_would_block) +		ret = WOULD_HAVE_BLOCKED; + +	return ret; +} + +struct agent { +	int cmdfd, responsefd; +}; + +/* Do this before doing any tdb stuff.  Return handle, or NULL. */ +struct agent *prepare_external_agent(void) +{ +	int pid, ret; +	int command[2], response[2]; +	char name[1+PATH_MAX]; + +	if (pipe(command) != 0 || pipe(response) != 0) +		return NULL; + +	pid = fork(); +	if (pid < 0) +		return NULL; + +	if (pid != 0) { +		struct agent *agent = malloc(sizeof(*agent)); + +		close(command[0]); +		close(response[1]); +		agent->cmdfd = command[1]; +		agent->responsefd = response[0]; +		return agent; +	} + +	close(command[1]); +	close(response[0]); + +	/* We want to fail, not block. */ +	nonblocking_locks = true; +	log_prefix = "external: "; +	while ((ret = read(command[0], name, sizeof(name))) > 0) { +		enum agent_return result; + +		result = do_operation(name[0], name+1); +		if (write(response[1], &result, sizeof(result)) +		    != sizeof(result)) +			err(1, "Writing response"); +		if (name[0] == SEND_SIGNAL) { +			struct timeval ten_ms; +			ten_ms.tv_sec = 0; +			ten_ms.tv_usec = 10000; +			select(0, NULL, NULL, NULL, &ten_ms); +			kill(getppid(), SIGUSR1); +		} +	} +	exit(0); +} + +/* Ask the external agent to try to do an operation. 
*/ +enum agent_return external_agent_operation(struct agent *agent, +					   enum operation op, +					   const char *name) +{ +	enum agent_return res; +	unsigned int len; +	char *string; + +	if (!name) +		name = ""; +	len = 1 + strlen(name) + 1; +	string = malloc(len); + +	string[0] = op; +	strcpy(string+1, name); + +	if (write(agent->cmdfd, string, len) != len +	    || read(agent->responsefd, &res, sizeof(res)) != sizeof(res)) +		res = AGENT_DIED; + +	free(string); +	return res; +} + +const char *agent_return_name(enum agent_return ret) +{ +	return ret == SUCCESS ? "SUCCESS" +		: ret == WOULD_HAVE_BLOCKED ? "WOULD_HAVE_BLOCKED" +		: ret == AGENT_DIED ? "AGENT_DIED" +		: ret == FAILED ? "FAILED" +		: ret == OTHER_FAILURE ? "OTHER_FAILURE" +		: "**INVALID**"; +} + +const char *operation_name(enum operation op) +{ +	switch (op) { +	case OPEN: return "OPEN"; +	case OPEN_WITH_HOOK: return "OPEN_WITH_HOOK"; +	case FETCH: return "FETCH"; +	case STORE: return "STORE"; +	case CHECK: return "CHECK"; +	case TRANSACTION_START: return "TRANSACTION_START"; +	case TRANSACTION_COMMIT: return "TRANSACTION_COMMIT"; +	case NEEDS_RECOVERY: return "NEEDS_RECOVERY"; +	case SEND_SIGNAL: return "SEND_SIGNAL"; +	case CLOSE: return "CLOSE"; +	} +	return "**INVALID**"; +} + +void free_external_agent(struct agent *agent) +{ +	close(agent->cmdfd); +	close(agent->responsefd); +	free(agent); +} diff --git a/lib/tdb2/test/external-agent.h b/lib/tdb2/test/external-agent.h new file mode 100644 index 0000000000..9eada10750 --- /dev/null +++ b/lib/tdb2/test/external-agent.h @@ -0,0 +1,43 @@ +#ifndef TDB2_TEST_EXTERNAL_AGENT_H +#define TDB2_TEST_EXTERNAL_AGENT_H + +/* For locking tests, we need a different process to try things at + * various times. */ +enum operation { +	OPEN, +	OPEN_WITH_HOOK, +	FETCH, +	STORE, +	TRANSACTION_START, +	TRANSACTION_COMMIT, +	NEEDS_RECOVERY, +	CHECK, +	SEND_SIGNAL, +	CLOSE, +}; + +/* Do this before doing any tdb stuff.  Return handle, or -1. 
*/ +struct agent *prepare_external_agent(void); + +enum agent_return { +	SUCCESS, +	WOULD_HAVE_BLOCKED, +	AGENT_DIED, +	FAILED, /* For fetch, or NEEDS_RECOVERY */ +	OTHER_FAILURE, +}; + +/* Ask the external agent to try to do an operation. + * name == tdb name for OPEN/OPEN_WITH_CLEAR_IF_FIRST, + * record name for FETCH/STORE (store stores name as data too) + */ +enum agent_return external_agent_operation(struct agent *handle, +					   enum operation op, +					   const char *name); + +/* Mapping enum -> string. */ +const char *agent_return_name(enum agent_return ret); +const char *operation_name(enum operation op); + +void free_external_agent(struct agent *agent); +#endif /* TDB2_TEST_EXTERNAL_AGENT_H */ diff --git a/lib/tdb2/test/failtest_helper.c b/lib/tdb2/test/failtest_helper.c new file mode 100644 index 0000000000..1358a6c6b2 --- /dev/null +++ b/lib/tdb2/test/failtest_helper.c @@ -0,0 +1,117 @@ +#include "failtest_helper.h" +#include "logging.h" +#include <string.h> +#include <ccan/tap/tap.h> + +/* FIXME: From ccan/str */ +static inline bool strends(const char *str, const char *postfix) +{ +	if (strlen(str) < strlen(postfix)) +		return false; + +	return !strcmp(str + strlen(str) - strlen(postfix), postfix); +} + +bool failmatch(const struct failtest_call *call, +	       const char *file, int line, enum failtest_call_type type) +{ +	return call->type == type +		&& call->line == line +		&& ((strcmp(call->file, file) == 0) +		    || (strends(call->file, file) +			&& (call->file[strlen(call->file) - strlen(file) - 1] +			    == '/'))); +} + +static const struct failtest_call * +find_repeat(const struct failtest_call *start, const struct failtest_call *end, +	    const struct failtest_call *call) +{ +	const struct failtest_call *i; + +	for (i = start; i < end; i++) { +		if (failmatch(i, call->file, call->line, call->type)) +			return i; +	} +	return NULL; +} + +static bool is_nonblocking_lock(const struct failtest_call *call) +{ +	return call->type == 
FAILTEST_FCNTL && call->u.fcntl.cmd == F_SETLK; +} + +static bool is_unlock(const struct failtest_call *call) +{ +	return call->type == FAILTEST_FCNTL +		&& call->u.fcntl.arg.fl.l_type == F_UNLCK; +} + +bool exit_check_log(struct failtest_call *history, unsigned num) +{ +	unsigned int i; + +	for (i = 0; i < num; i++) { +		if (!history[i].fail) +			continue; +		/* Failing the /dev/urandom open doesn't count: we fall back. */ +		if (failmatch(&history[i], URANDOM_OPEN)) +			continue; + +		/* Similarly with read fail. */ +		if (failmatch(&history[i], URANDOM_READ)) +			continue; + +		/* Initial allocation of tdb doesn't log. */ +		if (failmatch(&history[i], INITIAL_TDB_MALLOC)) +			continue; + +		/* We don't block "failures" on non-blocking locks. */ +		if (is_nonblocking_lock(&history[i])) +			continue; + +		if (!tap_log_messages) +			diag("We didn't log for %u (%s:%u)", +			     i, history[i].file, history[i].line); +		return tap_log_messages != 0; +	} +	return true; +} + +/* Some places we soldier on despite errors: only fail them once. */ +enum failtest_result +block_repeat_failures(struct failtest_call *history, unsigned num) +{ +	const struct failtest_call *i, *last = &history[num-1]; + +	if (failmatch(last, INITIAL_TDB_MALLOC) +	    || failmatch(last, URANDOM_OPEN) +	    || failmatch(last, URANDOM_READ)) { +		if (find_repeat(history, last, last)) +			return FAIL_DONT_FAIL; +		return FAIL_PROBE; +	} + +	/* Unlock or non-blocking lock is fail-once. */ +	if (is_unlock(last)) { +		/* Find a previous unlock at this point? */ +		for (i = find_repeat(history, last, last); +		     i; +		     i = find_repeat(history, i, last)) { +			if (is_unlock(i)) +				return FAIL_DONT_FAIL; +		} +		return FAIL_PROBE; +	} else if (is_nonblocking_lock(last)) { +		/* Find a previous non-blocking lock at this point? 
*/ +		for (i = find_repeat(history, last, last); +		     i; +		     i = find_repeat(history, i, last)) { +			if (is_nonblocking_lock(i)) +				return FAIL_DONT_FAIL; +		} +		return FAIL_PROBE; +	} + +	return FAIL_OK; +} diff --git a/lib/tdb2/test/failtest_helper.h b/lib/tdb2/test/failtest_helper.h new file mode 100644 index 0000000000..a62efbad58 --- /dev/null +++ b/lib/tdb2/test/failtest_helper.h @@ -0,0 +1,17 @@ +#ifndef TDB2_TEST_FAILTEST_HELPER_H +#define TDB2_TEST_FAILTEST_HELPER_H +#include <ccan/failtest/failtest.h> +#include <stdbool.h> + +/* FIXME: Check these! */ +#define INITIAL_TDB_MALLOC	"open.c", 338, FAILTEST_MALLOC +#define URANDOM_OPEN		"open.c", 45, FAILTEST_OPEN +#define URANDOM_READ		"open.c", 25, FAILTEST_READ + +bool exit_check_log(struct failtest_call *history, unsigned num); +bool failmatch(const struct failtest_call *call, +	       const char *file, int line, enum failtest_call_type type); +enum failtest_result +block_repeat_failures(struct failtest_call *history, unsigned num); + +#endif /* TDB2_TEST_LOGGING_H */ diff --git a/lib/tdb2/test/layout.c b/lib/tdb2/test/layout.c new file mode 100644 index 0000000000..31889ad080 --- /dev/null +++ b/lib/tdb2/test/layout.c @@ -0,0 +1,348 @@ +/* TDB tools to create various canned database layouts. 
*/ +#include "layout.h" +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <err.h> +#include "logging.h" + +struct tdb_layout *new_tdb_layout(const char *filename) +{ +	struct tdb_layout *layout = malloc(sizeof(*layout)); +	layout->filename = filename; +	layout->num_elems = 0; +	layout->elem = NULL; +	return layout; +} + +static void add(struct tdb_layout *layout, union tdb_layout_elem elem) +{ +	layout->elem = realloc(layout->elem, +			       sizeof(layout->elem[0]) +			       * (layout->num_elems+1)); +	layout->elem[layout->num_elems++] = elem; +} + +void tdb_layout_add_freetable(struct tdb_layout *layout) +{ +	union tdb_layout_elem elem; +	elem.base.type = FREETABLE; +	add(layout, elem); +} + +void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, +			 unsigned ftable) +{ +	union tdb_layout_elem elem; +	elem.base.type = FREE; +	elem.free.len = len; +	elem.free.ftable_num = ftable; +	add(layout, elem); +} + +static struct tdb_data dup_key(struct tdb_data key) +{ +	struct tdb_data ret; +	ret.dsize = key.dsize; +	ret.dptr = malloc(ret.dsize); +	memcpy(ret.dptr, key.dptr, ret.dsize); +	return ret; +} + +void tdb_layout_add_used(struct tdb_layout *layout, +			 TDB_DATA key, TDB_DATA data, +			 tdb_len_t extra) +{ +	union tdb_layout_elem elem; +	elem.base.type = DATA; +	elem.used.key = dup_key(key); +	elem.used.data = dup_key(data); +	elem.used.extra = extra; +	add(layout, elem); +} + +static tdb_len_t free_record_len(tdb_len_t len) +{ +	return sizeof(struct tdb_used_record) + len; +} + +static tdb_len_t data_record_len(struct tle_used *used) +{ +	tdb_len_t len; +	len = sizeof(struct tdb_used_record) +		+ used->key.dsize + used->data.dsize + used->extra; +	assert(len >= sizeof(struct tdb_free_record)); +	return len; +} + +static tdb_len_t hashtable_len(struct tle_hashtable *htable) +{ +	return sizeof(struct tdb_used_record) +		+ (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) +		+ htable->extra; +} + +static tdb_len_t 
freetable_len(struct tle_freetable *ftable) +{ +	return sizeof(struct tdb_freetable); +} + +static void set_free_record(void *mem, tdb_len_t len) +{ +	/* We do all the work in add_to_freetable */ +} + +static void add_zero_pad(struct tdb_used_record *u, size_t len, size_t extra) +{ +	if (extra) +		((char *)(u + 1))[len] = '\0'; +} + +static void set_data_record(void *mem, struct tdb_context *tdb, +			    struct tle_used *used) +{ +	struct tdb_used_record *u = mem; + +	set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize, +		   used->key.dsize + used->data.dsize + used->extra, +		   tdb_hash(tdb, used->key.dptr, used->key.dsize)); +	memcpy(u + 1, used->key.dptr, used->key.dsize); +	memcpy((char *)(u + 1) + used->key.dsize, +	       used->data.dptr, used->data.dsize); +	add_zero_pad(u, used->key.dsize + used->data.dsize, used->extra); +} + +static void set_hashtable(void *mem, struct tdb_context *tdb, +			  struct tle_hashtable *htable) +{ +	struct tdb_used_record *u = mem; +	tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS; + +	set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0); +	memset(u + 1, 0, len); +	add_zero_pad(u, len, htable->extra); +} + +static void set_freetable(void *mem, struct tdb_context *tdb, +			 struct tle_freetable *freetable, struct tdb_header *hdr, +			 tdb_off_t last_ftable) +{ +	struct tdb_freetable *ftable = mem; +	memset(ftable, 0, sizeof(*ftable)); +	set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0, +			sizeof(*ftable) - sizeof(ftable->hdr), +			sizeof(*ftable) - sizeof(ftable->hdr), 0); + +	if (last_ftable) { +		ftable = (struct tdb_freetable *)((char *)hdr + last_ftable); +		ftable->next = freetable->base.off; +	} else { +		hdr->free_table = freetable->base.off; +	} +} + +static void add_to_freetable(struct tdb_context *tdb, +			     tdb_off_t eoff, +			     tdb_off_t elen, +			     unsigned ftable, +			     struct tle_freetable *freetable) +{ +	tdb->ftable_off = freetable->base.off; +	
tdb->ftable = ftable; +	add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen, +			TDB_LOCK_WAIT, false); +} + +static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup) +{ +	return group_start +		+ (ingroup % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t); +} + +/* Get bits from a value. */ +static uint32_t bits(uint64_t val, unsigned start, unsigned num) +{ +	assert(num <= 32); +	return (val >> start) & ((1U << num) - 1); +} + +/* We take bits from the top: that way we can lock whole sections of the hash + * by using lock ranges. */ +static uint32_t use_bits(uint64_t h, unsigned num, unsigned *used) +{ +	*used += num; +	return bits(h, 64 - *used, num); +} + +static tdb_off_t encode_offset(tdb_off_t new_off, unsigned bucket, +			       uint64_t h) +{ +	return bucket +		| new_off +		| ((uint64_t)bits(h, 64 - TDB_OFF_UPPER_STEAL_EXTRA, +				  TDB_OFF_UPPER_STEAL_EXTRA) +		   << TDB_OFF_HASH_EXTRA_BIT); +} + +/* FIXME: Our hash table handling here is primitive: we don't expand! 
*/ +static void add_to_hashtable(struct tdb_context *tdb, +			     tdb_off_t eoff, +			     struct tdb_data key) +{ +	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); +	tdb_off_t b_off, group_start; +	unsigned i, group, in_group; +	unsigned used = 0; + +	group = use_bits(h, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, &used); +	in_group = use_bits(h, TDB_HASH_GROUP_BITS, &used); + +	group_start = offsetof(struct tdb_header, hashtable) +		+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); + +	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { +		unsigned bucket = (in_group + i) % (1 << TDB_HASH_GROUP_BITS); + +		b_off = hbucket_off(group_start, bucket); +		if (tdb_read_off(tdb, b_off) == 0) { +			tdb_write_off(tdb, b_off, +				      encode_offset(eoff, bucket, h)); +			return; +		} +	} +	abort(); +} + +static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num) +{ +	unsigned i; + +	for (i = 0; i < layout->num_elems; i++) { +		if (layout->elem[i].base.type != FREETABLE) +			continue; +		if (num == 0) +			return &layout->elem[i].ftable; +		num--; +	} +	abort(); +} + +/* FIXME: Support TDB_CONVERT */ +struct tdb_context *tdb_layout_get(struct tdb_layout *layout) +{ +	unsigned int i; +	tdb_off_t off, len, last_ftable; +	char *mem; +	struct tdb_context *tdb; + +	off = sizeof(struct tdb_header); + +	/* First pass of layout: calc lengths */ +	for (i = 0; i < layout->num_elems; i++) { +		union tdb_layout_elem *e = &layout->elem[i]; +		e->base.off = off; +		switch (e->base.type) { +		case FREETABLE: +			len = freetable_len(&e->ftable); +			break; +		case FREE: +			len = free_record_len(e->free.len); +			break; +		case DATA: +			len = data_record_len(&e->used); +			break; +		case HASHTABLE: +			len = hashtable_len(&e->hashtable); +			break; +		default: +			abort(); +		} +		off += len; +	} + +	mem = malloc(off); +	/* Fill with some weird pattern. */ +	memset(mem, 0x99, off); +	/* Now populate our header, cribbing from a real TDB header. 
*/ +	tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, &tap_log_attr); +	memcpy(mem, tdb->file->map_ptr, sizeof(struct tdb_header)); + +	/* Mug the tdb we have to make it use this. */ +	free(tdb->file->map_ptr); +	tdb->file->map_ptr = mem; +	tdb->file->map_size = off; + +	last_ftable = 0; +	for (i = 0; i < layout->num_elems; i++) { +		union tdb_layout_elem *e = &layout->elem[i]; +		switch (e->base.type) { +		case FREETABLE: +			set_freetable(mem + e->base.off, tdb, &e->ftable, +				     (struct tdb_header *)mem, last_ftable); +			last_ftable = e->base.off; +			break; +		case FREE: +			set_free_record(mem + e->base.off, e->free.len); +			break; +		case DATA: +			set_data_record(mem + e->base.off, tdb, &e->used); +			break; +		case HASHTABLE: +			set_hashtable(mem + e->base.off, tdb, &e->hashtable); +			break; +		} +	} +	/* Must have a free table! */ +	assert(last_ftable); + +	/* Now fill the free and hash tables. */ +	for (i = 0; i < layout->num_elems; i++) { +		union tdb_layout_elem *e = &layout->elem[i]; +		switch (e->base.type) { +		case FREE: +			add_to_freetable(tdb, e->base.off, e->free.len, +					 e->free.ftable_num, +					 find_ftable(layout, e->free.ftable_num)); +			break; +		case DATA: +			add_to_hashtable(tdb, e->base.off, e->used.key); +			break; +		default: +			break; +		} +	} + +	tdb->ftable_off = find_ftable(layout, 0)->base.off; + +	/* Get physical if they asked for it. */ +	if (layout->filename) { +		int fd = open(layout->filename, O_WRONLY|O_TRUNC|O_CREAT, +			      0600); +		if (fd < 0) +			err(1, "opening %s for writing", layout->filename); +		if (write(fd, tdb->file->map_ptr, tdb->file->map_size) +		    != tdb->file->map_size) +			err(1, "writing %s", layout->filename); +		close(fd); +		tdb_close(tdb); +		/* NOMMAP is for lockcheck. 
*/ +		tdb = tdb_open(layout->filename, TDB_NOMMAP, O_RDWR, 0, +			       &tap_log_attr); +	} + +	return tdb; +} + +void tdb_layout_free(struct tdb_layout *layout) +{ +	unsigned int i; + +	for (i = 0; i < layout->num_elems; i++) { +		if (layout->elem[i].base.type == DATA) { +			free(layout->elem[i].used.key.dptr); +			free(layout->elem[i].used.data.dptr); +		} +	} +	free(layout->elem); +	free(layout); +} diff --git a/lib/tdb2/test/layout.h b/lib/tdb2/test/layout.h new file mode 100644 index 0000000000..6e2e6657a7 --- /dev/null +++ b/lib/tdb2/test/layout.h @@ -0,0 +1,68 @@ +#ifndef TDB2_TEST_LAYOUT_H +#define TDB2_TEST_LAYOUT_H +#include <ccan/tdb2/private.h> + +struct tdb_layout *new_tdb_layout(const char *filename); +void tdb_layout_add_freetable(struct tdb_layout *layout); +void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, +			 unsigned ftable); +void tdb_layout_add_used(struct tdb_layout *layout, +			 TDB_DATA key, TDB_DATA data, +			 tdb_len_t extra); +#if 0 /* FIXME: Allow allocation of subtables */ +void tdb_layout_add_hashtable(struct tdb_layout *layout, +			      int htable_parent, /* -1 == toplevel */ +			      unsigned int bucket, +			      tdb_len_t extra); +#endif +struct tdb_context *tdb_layout_get(struct tdb_layout *layout); +void tdb_layout_free(struct tdb_layout *layout); + +enum layout_type { +	FREETABLE, FREE, DATA, HASHTABLE, +}; + +/* Shared by all union members. 
*/ +struct tle_base { +	enum layout_type type; +	tdb_off_t off; +}; + +struct tle_freetable { +	struct tle_base base; +}; + +struct tle_free { +	struct tle_base base; +	tdb_len_t len; +	unsigned ftable_num; +}; + +struct tle_used { +	struct tle_base base; +	TDB_DATA key; +	TDB_DATA data; +	tdb_len_t extra; +}; + +struct tle_hashtable { +	struct tle_base base; +	int parent; +	unsigned int bucket; +	tdb_len_t extra; +}; + +union tdb_layout_elem { +	struct tle_base base; +	struct tle_freetable ftable; +	struct tle_free free; +	struct tle_used used; +	struct tle_hashtable hashtable; +}; + +struct tdb_layout { +	const char *filename; +	unsigned int num_elems; +	union tdb_layout_elem *elem; +}; +#endif /* TDB2_TEST_LAYOUT_H */ diff --git a/lib/tdb2/test/lock-tracking.c b/lib/tdb2/test/lock-tracking.c new file mode 100644 index 0000000000..05dba32fd3 --- /dev/null +++ b/lib/tdb2/test/lock-tracking.c @@ -0,0 +1,147 @@ +/* We save the locks so we can reaquire them. */ +#include <unistd.h> +#include <fcntl.h> +#include <stdarg.h> +#include <stdlib.h> +#include <ccan/tap/tap.h> +#include <ccan/tdb2/private.h> +#include "lock-tracking.h" + +struct lock { +	struct lock *next; +	unsigned int off; +	unsigned int len; +	int type; +}; +static struct lock *locks; +int locking_errors = 0; +bool suppress_lockcheck = false; +bool nonblocking_locks; +int locking_would_block = 0; +void (*unlock_callback)(int fd); + +int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ ) +{ +	va_list ap; +	int ret, arg3; +	struct flock *fl; +	bool may_block = false; + +	if (cmd != F_SETLK && cmd != F_SETLKW) { +		/* This may be totally bogus, but we don't know in general. 
*/ +		va_start(ap, cmd); +		arg3 = va_arg(ap, int); +		va_end(ap); + +		return fcntl(fd, cmd, arg3); +	} + +	va_start(ap, cmd); +	fl = va_arg(ap, struct flock *); +	va_end(ap); + +	if (cmd == F_SETLKW && nonblocking_locks) { +		cmd = F_SETLK; +		may_block = true; +	} +	ret = fcntl(fd, cmd, fl); + +	/* Detect when we failed, but might have been OK if we waited. */ +	if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) { +		locking_would_block++; +	} + +	if (fl->l_type == F_UNLCK) { +		struct lock **l; +		struct lock *old = NULL; + +		for (l = &locks; *l; l = &(*l)->next) { +			if ((*l)->off == fl->l_start +			    && (*l)->len == fl->l_len) { +				if (ret == 0) { +					old = *l; +					*l = (*l)->next; +					free(old); +				} +				break; +			} +		} +		if (!old && !suppress_lockcheck) { +			diag("Unknown unlock %u@%u - %i", +			     (int)fl->l_len, (int)fl->l_start, ret); +			locking_errors++; +		} +	} else { +		struct lock *new, *i; +		unsigned int fl_end = fl->l_start + fl->l_len; +		if (fl->l_len == 0) +			fl_end = (unsigned int)-1; + +		/* Check for overlaps: we shouldn't do this. */ +		for (i = locks; i; i = i->next) { +			unsigned int i_end = i->off + i->len; +			if (i->len == 0) +				i_end = (unsigned int)-1; + +			if (fl->l_start >= i->off && fl->l_start < i_end) +				break; +			if (fl_end > i->off && fl_end < i_end) +				break; + +			/* tdb_allrecord_lock does this, handle adjacent: */ +			if (fl->l_start > TDB_HASH_LOCK_START +			    && fl->l_start == i_end && fl->l_type == i->type) { +				if (ret == 0) { +					i->len = fl->l_len +						? i->len + fl->l_len +						: 0; +				} +				goto done; +			} +		} +		if (i) { +			/* Special case: upgrade of allrecord lock. 
*/ +			if (i->type == F_RDLCK && fl->l_type == F_WRLCK +			    && i->off == TDB_HASH_LOCK_START +			    && fl->l_start == TDB_HASH_LOCK_START +			    && i->len == 0 +			    && fl->l_len == 0) { +				if (ret == 0) +					i->type = F_WRLCK; +				goto done; +			} +			if (!suppress_lockcheck) { +				diag("%s lock %u@%u overlaps %u@%u", +				     fl->l_type == F_WRLCK ? "write" : "read", +				     (int)fl->l_len, (int)fl->l_start, +				     i->len, (int)i->off); +				locking_errors++; +			} +		} + +		if (ret == 0) { +			new = malloc(sizeof *new); +			new->off = fl->l_start; +			new->len = fl->l_len; +			new->type = fl->l_type; +			new->next = locks; +			locks = new; +		} +	} +done: +	if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback) +		unlock_callback(fd); +	return ret; +} + +unsigned int forget_locking(void) +{ +	unsigned int num = 0; +	while (locks) { +		struct lock *next = locks->next; +		free(locks); +		locks = next; +		num++; +	} +	return num; +} diff --git a/lib/tdb2/test/lock-tracking.h b/lib/tdb2/test/lock-tracking.h new file mode 100644 index 0000000000..f2c9c44653 --- /dev/null +++ b/lib/tdb2/test/lock-tracking.h @@ -0,0 +1,25 @@ +#ifndef LOCK_TRACKING_H +#define LOCK_TRACKING_H +#include <stdbool.h> + +/* Set this if you want a callback after fcntl unlock. */ +extern void (*unlock_callback)(int fd); + +/* Replacement fcntl. */ +int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ ); + +/* Discard locking info: returns number of locks outstanding. */ +unsigned int forget_locking(void); + +/* Number of errors in locking. */ +extern int locking_errors; + +/* Suppress lock checking. */ +extern bool suppress_lockcheck; + +/* Make all locks non-blocking. */ +extern bool nonblocking_locks; + +/* Number of times we failed a lock because we made it non-blocking.
*/ +extern int locking_would_block; +#endif /* LOCK_TRACKING_H */ diff --git a/lib/tdb2/test/logging.c b/lib/tdb2/test/logging.c new file mode 100644 index 0000000000..d32cfa9b59 --- /dev/null +++ b/lib/tdb2/test/logging.c @@ -0,0 +1,24 @@ +#include <stdio.h> +#include <stdlib.h> +#include <ccan/tap/tap.h> +#include "logging.h" + +unsigned tap_log_messages; +const char *log_prefix = ""; +bool suppress_logging; + +union tdb_attribute tap_log_attr = { +	.log = { .base = { .attr = TDB_ATTRIBUTE_LOG }, +		 .fn = tap_log_fn } +}; + +void tap_log_fn(struct tdb_context *tdb, +		enum tdb_log_level level, +		const char *message, void *priv) +{ +	if (suppress_logging) +		return; + +	diag("tdb log level %u: %s%s", level, log_prefix, message); +	tap_log_messages++; +} diff --git a/lib/tdb2/test/logging.h b/lib/tdb2/test/logging.h new file mode 100644 index 0000000000..d172f867fd --- /dev/null +++ b/lib/tdb2/test/logging.h @@ -0,0 +1,15 @@ +#ifndef TDB2_TEST_LOGGING_H +#define TDB2_TEST_LOGGING_H +#include <ccan/tdb2/tdb2.h> +#include <stdbool.h> +#include <string.h> + +extern bool suppress_logging; +extern const char *log_prefix; +extern unsigned tap_log_messages; +extern union tdb_attribute tap_log_attr; + +void tap_log_fn(struct tdb_context *tdb, +		enum tdb_log_level level, +		const char *message, void *priv); +#endif /* TDB2_TEST_LOGGING_H */ diff --git a/lib/tdb2/test/run-001-encode.c b/lib/tdb2/test/run-001-encode.c new file mode 100644 index 0000000000..ffa4b93c02 --- /dev/null +++ b/lib/tdb2/test/run-001-encode.c @@ -0,0 +1,48 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_used_record rec; +	struct tdb_context tdb = { .log_fn = tap_log_fn }; + +	plan_tests(64 + 
32 + 48*6 + 1); + +	/* We should be able to encode any data value. */ +	for (i = 0; i < 64; i++) +		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i, +			       1ULL << i, 0) == 0); + +	/* And any key and data with < 64 bits between them. */ +	for (i = 0; i < 32; i++) { +		tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i; +		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen, +			       klen + dlen, 0)  == 0); +	} + +	/* We should neatly encode all values. */ +	for (i = 0; i < 48; i++) { +		uint64_t h = 1ULL << (i < 5 ? i : 4); +		uint64_t klen = 1ULL << (i < 16 ? i : 15); +		uint64_t dlen = 1ULL << i; +		uint64_t xlen = 1ULL << (i < 32 ? i : 31); +		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen, +			       klen+dlen+xlen, h) == 0); +		ok1(rec_key_length(&rec) == klen); +		ok1(rec_data_length(&rec) == dlen); +		ok1(rec_extra_padding(&rec) == xlen); +		ok1((uint64_t)rec_hash(&rec) == h); +		ok1(rec_magic(&rec) == TDB_USED_MAGIC); +	} +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-001-fls.c b/lib/tdb2/test/run-001-fls.c new file mode 100644 index 0000000000..d54cad1d1c --- /dev/null +++ b/lib/tdb2/test/run-001-fls.c @@ -0,0 +1,40 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> + +static unsigned int dumb_fls(uint64_t num) +{ +	int i; + +	for (i = 63; i >= 0; i--) { +		if (num & (1ULL << i)) +			break; +	} +	return i + 1; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; + +	plan_tests(64 * 64 + 2); + +	ok1(fls64(0) == 0); +	ok1(dumb_fls(0) == 0); + +	for (i = 0; i < 64; i++) { +		for (j = 0; j < 64; j++) { +			uint64_t val = (1ULL << i) | (1ULL << j); +			ok(fls64(val) == dumb_fls(val), +			   "%llu -> %u should be %u", (long long)val, +			   fls64(val), dumb_fls(val)); +		} +	} 
+	return exit_status(); +} diff --git a/lib/tdb2/test/run-01-new_database.c b/lib/tdb2/test/run-01-new_database.c new file mode 100644 index 0000000000..32ebaf09c1 --- /dev/null +++ b/lib/tdb2/test/run-01-new_database.c @@ -0,0 +1,42 @@ +#include <ccan/failtest/failtest_override.h> +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include <ccan/failtest/failtest.h> +#include "logging.h" +#include "failtest_helper.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	failtest_init(argc, argv); +	failtest_hook = block_repeat_failures; +	failtest_exit_check = exit_check_log; +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-new_database.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			failtest_exit(exit_status()); +		if (tdb) { +			bool ok = ok1(tdb_check(tdb, NULL, NULL) == 0); +			tdb_close(tdb); +			if (!ok) +				failtest_exit(exit_status()); +		} +		if (!ok1(tap_log_messages == 0)) +			break; +	} +	failtest_exit(exit_status()); +} diff --git a/lib/tdb2/test/run-02-expand.c b/lib/tdb2/test/run-02-expand.c new file mode 100644 index 0000000000..6666ae167e --- /dev/null +++ b/lib/tdb2/test/run-02-expand.c @@ -0,0 +1,80 @@ +#include <ccan/failtest/failtest_override.h> +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tap/tap.h> +#include <ccan/failtest/failtest.h> 
+#include "logging.h" +#include "failtest_helper.h" + +static bool failtest_suppress = false; + +/* Don't need to test everything here, just want expand testing. */ +static enum failtest_result +suppress_failure(struct failtest_call *history, unsigned num) +{ +	if (failtest_suppress) +		return FAIL_DONT_FAIL; +	return block_repeat_failures(history, num); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	uint64_t val; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1); + +	failtest_init(argc, argv); +	failtest_hook = suppress_failure; +	failtest_exit_check = exit_check_log; + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		failtest_suppress = true; +		tdb = tdb_open("run-expand.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			break; + +		val = tdb->file->map_size; +		/* Need some hash lock for expand. 
*/ +		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); +		failtest_suppress = false; +		if (!ok1(tdb_expand(tdb, 1) == 0)) { +			failtest_suppress = true; +			tdb_close(tdb); +			break; +		} +		failtest_suppress = true; + +		ok1(tdb->file->map_size >= val + 1 * TDB_EXTENSION_FACTOR); +		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		val = tdb->file->map_size; +		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); +		failtest_suppress = false; +		if (!ok1(tdb_expand(tdb, 1024) == 0)) { +			failtest_suppress = true; +			tdb_close(tdb); +			break; +		} +		failtest_suppress = true; +		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); +		ok1(tdb->file->map_size >= val + 1024 * TDB_EXTENSION_FACTOR); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	failtest_exit(exit_status()); +} diff --git a/lib/tdb2/test/run-03-coalesce.c b/lib/tdb2/test/run-03-coalesce.c new file mode 100644 index 0000000000..3fdd11c077 --- /dev/null +++ b/lib/tdb2/test/run-03-coalesce.c @@ -0,0 +1,170 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" +#include "layout.h" + +static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off) +{ +	struct tdb_free_record f; +	enum TDB_ERROR ecode; + +	ecode = tdb_read_convert(tdb, off, &f, sizeof(f)); +	if (ecode != TDB_SUCCESS) +		return ecode; +	if (frec_magic(&f) != TDB_FREE_MAGIC) +		return TDB_ERR_CORRUPT; +	return frec_len(&f); +} + +int main(int argc, char *argv[]) +{ +	tdb_off_t b_off, test; +	struct tdb_context *tdb; +	struct tdb_layout *layout; +	struct tdb_data data, key; +	tdb_len_t len; + +	/* FIXME: Test TDB_CONVERT */ +	/* FIXME: Test lock order fail. 
*/ + +	plan_tests(42); +	data = tdb_mkdata("world", 5); +	key = tdb_mkdata("hello", 5); + +	/* No coalescing can be done due to EOF */ +	layout = new_tdb_layout("run-03-coalesce.tdb"); +	tdb_layout_add_freetable(layout); +	len = 1024; +	tdb_layout_add_free(layout, len, 0); +	tdb = tdb_layout_get(layout); +	ok1(tdb_check(tdb, NULL, NULL) == 0); +	ok1(free_record_length(tdb, layout->elem[1].base.off) == len); + +	/* Figure out which bucket free entry is. */ +	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len)); +	/* Lock and fail to coalesce. */ +	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); +	test = layout->elem[1].base.off; +	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len, &test) +	    == 0); +	tdb_unlock_free_bucket(tdb, b_off); +	ok1(free_record_length(tdb, layout->elem[1].base.off) == len); +	ok1(test == layout->elem[1].base.off); +	ok1(tdb_check(tdb, NULL, NULL) == 0); +	tdb_close(tdb); +	tdb_layout_free(layout); + +	/* No coalescing can be done due to used record */ +	layout = new_tdb_layout("run-03-coalesce.tdb"); +	tdb_layout_add_freetable(layout); +	tdb_layout_add_free(layout, 1024, 0); +	tdb_layout_add_used(layout, key, data, 6); +	tdb = tdb_layout_get(layout); +	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Figure out which bucket free entry is. */ +	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); +	/* Lock and fail to coalesce. 
*/ +	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); +	test = layout->elem[1].base.off; +	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) +	    == 0); +	tdb_unlock_free_bucket(tdb, b_off); +	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); +	ok1(test == layout->elem[1].base.off); +	ok1(tdb_check(tdb, NULL, NULL) == 0); +	tdb_close(tdb); +	tdb_layout_free(layout); + +	/* Coalescing can be done due to two free records, then EOF */ +	layout = new_tdb_layout("run-03-coalesce.tdb"); +	tdb_layout_add_freetable(layout); +	tdb_layout_add_free(layout, 1024, 0); +	tdb_layout_add_free(layout, 2048, 0); +	tdb = tdb_layout_get(layout); +	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); +	ok1(free_record_length(tdb, layout->elem[2].base.off) == 2048); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Figure out which bucket (first) free entry is. */ +	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); +	/* Lock and coalesce. */ +	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); +	test = layout->elem[2].base.off; +	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) +	    == 1024 + sizeof(struct tdb_used_record) + 2048); +	/* Should tell us it's erased this one... 
*/ +	ok1(test == TDB_ERR_NOEXIST); +	ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); +	ok1(free_record_length(tdb, layout->elem[1].base.off) +	    == 1024 + sizeof(struct tdb_used_record) + 2048); +	ok1(tdb_check(tdb, NULL, NULL) == 0); +	tdb_close(tdb); +	tdb_layout_free(layout); + +	/* Coalescing can be done due to two free records, then data */ +	layout = new_tdb_layout("run-03-coalesce.tdb"); +	tdb_layout_add_freetable(layout); +	tdb_layout_add_free(layout, 1024, 0); +	tdb_layout_add_free(layout, 512, 0); +	tdb_layout_add_used(layout, key, data, 6); +	tdb = tdb_layout_get(layout); +	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); +	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Figure out which bucket free entry is. */ +	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); +	/* Lock and coalesce. */ +	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); +	test = layout->elem[2].base.off; +	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) +	    == 1024 + sizeof(struct tdb_used_record) + 512); +	ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); +	ok1(free_record_length(tdb, layout->elem[1].base.off) +	    == 1024 + sizeof(struct tdb_used_record) + 512); +	ok1(test == TDB_ERR_NOEXIST); +	ok1(tdb_check(tdb, NULL, NULL) == 0); +	tdb_close(tdb); +	tdb_layout_free(layout); + +	/* Coalescing can be done due to three free records, then EOF */ +	layout = new_tdb_layout("run-03-coalesce.tdb"); +	tdb_layout_add_freetable(layout); +	tdb_layout_add_free(layout, 1024, 0); +	tdb_layout_add_free(layout, 512, 0); +	tdb_layout_add_free(layout, 256, 0); +	tdb = tdb_layout_get(layout); +	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); +	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512); +	ok1(free_record_length(tdb, layout->elem[3].base.off) == 256); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* 
Figure out which bucket free entry is. */ +	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); +	/* Lock and coalesce. */ +	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); +	test = layout->elem[2].base.off; +	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) +	    == 1024 + sizeof(struct tdb_used_record) + 512 +	    + sizeof(struct tdb_used_record) + 256); +	ok1(tdb->file->allrecord_lock.count == 0 +	    && tdb->file->num_lockrecs == 0); +	ok1(free_record_length(tdb, layout->elem[1].base.off) +	    == 1024 + sizeof(struct tdb_used_record) + 512 +	    + sizeof(struct tdb_used_record) + 256); +	ok1(tdb_check(tdb, NULL, NULL) == 0); +	tdb_close(tdb); +	tdb_layout_free(layout); + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-04-basichash.c b/lib/tdb2/test/run-04-basichash.c new file mode 100644 index 0000000000..62031bdb40 --- /dev/null +++ b/lib/tdb2/test/run-04-basichash.c @@ -0,0 +1,267 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +/* We rig the hash so adjacent-numbered records always clash. 
*/ +static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv) +{ +	return ((uint64_t)*(const unsigned int *)key) +		<< (64 - TDB_TOPLEVEL_HASH_BITS - 1); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	unsigned int v; +	struct tdb_used_record rec; +	struct tdb_data key = { (unsigned char *)&v, sizeof(v) }; +	struct tdb_data dbuf = { (unsigned char *)&v, sizeof(v) }; +	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +						.fn = clash } }; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT, +	}; + +	hattr.base.next = &tap_log_attr; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) +		   * (91 + (2 * ((1 << TDB_HASH_GROUP_BITS) - 1))) + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		struct hash_info h; +		tdb_off_t new_off, off, subhash; + +		tdb = tdb_open("run-04-basichash.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); +		ok1(tdb); +		if (!tdb) +			continue; + +		v = 0; +		/* Should not find it. */ +		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in group 0, bucket 0. */ +		ok1(h.group_start == offsetof(struct tdb_header, hashtable)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 0); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); + +		/* Should have lock on bucket 0 */ +		ok1(h.hlock_start == 0); +		ok1(h.hlock_range == +		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); +		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); +		ok1((tdb->flags & TDB_NOLOCK) +		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); +		/* FIXME: Check lock length */ + +		/* Allocate a new record. 
*/ +		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, +				TDB_USED_MAGIC, false); +		ok1(!TDB_OFF_IS_ERR(new_off)); + +		/* We should be able to add it now. */ +		ok1(add_to_hash(tdb, &h, new_off) == 0); + +		/* Make sure we fill it in for later finding. */ +		off = new_off + sizeof(struct tdb_used_record); +		ok1(!tdb->methods->twrite(tdb, off, key.dptr, key.dsize)); +		off += key.dsize; +		ok1(!tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize)); + +		/* We should be able to unlock that OK. */ +		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +				      F_WRLCK) == 0); + +		/* Database should be consistent. */ +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Now, this should give a successful lookup. */ +		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) +		    == new_off); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in group 0, bucket 0. */ +		ok1(h.group_start == offsetof(struct tdb_header, hashtable)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 0); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); + +		/* Should have lock on bucket 0 */ +		ok1(h.hlock_start == 0); +		ok1(h.hlock_range == +		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); +		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); +		ok1((tdb->flags & TDB_NOLOCK) +		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); +		/* FIXME: Check lock length */ + +		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +				      F_WRLCK) == 0); + +		/* Database should be consistent. */ +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Test expansion. */ +		v = 1; +		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in group 0, bucket 1. 
*/ +		ok1(h.group_start == offsetof(struct tdb_header, hashtable)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 1); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); + +		/* Should have lock on bucket 0 */ +		ok1(h.hlock_start == 0); +		ok1(h.hlock_range == +		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); +		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); +		ok1((tdb->flags & TDB_NOLOCK) +		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); +		/* FIXME: Check lock length */ + +		/* Make it expand 0'th bucket. */ +		ok1(expand_group(tdb, &h) == 0); +		/* First one should be subhash, next should be empty. */ +		ok1(is_subhash(h.group[0])); +		subhash = (h.group[0] & TDB_OFF_MASK); +		for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++) +			ok1(h.group[j] == 0); + +		ok1(tdb_write_convert(tdb, h.group_start, +				      h.group, sizeof(h.group)) == 0); +		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +				      F_WRLCK) == 0); + +		/* Should be happy with expansion. */ +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Should be able to find it. */ +		v = 0; +		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) +		    == new_off); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in expanded group 0, bucket 0. */ +		ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 0); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS +		    + TDB_SUBLEVEL_HASH_BITS); + +		/* Should have lock on bucket 0 */ +		ok1(h.hlock_start == 0); +		ok1(h.hlock_range == +		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); +		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); +		ok1((tdb->flags & TDB_NOLOCK) +		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); +		/* FIXME: Check lock length */ + +		/* Simple delete should work. 
*/ +		ok1(delete_from_hash(tdb, &h) == 0); +		ok1(add_free_record(tdb, new_off, +				    sizeof(struct tdb_used_record) +				    + rec_key_length(&rec) +				    + rec_data_length(&rec) +				    + rec_extra_padding(&rec), +				    TDB_LOCK_NOWAIT, false) == 0); +		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +				      F_WRLCK) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Test second-level expansion: should expand 0th bucket. */ +		v = 0; +		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in group 0, bucket 0. */ +		ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 0); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS+TDB_SUBLEVEL_HASH_BITS); + +		/* Should have lock on bucket 0 */ +		ok1(h.hlock_start == 0); +		ok1(h.hlock_range == +		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); +		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); +		ok1((tdb->flags & TDB_NOLOCK) +		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); +		/* FIXME: Check lock length */ + +		ok1(expand_group(tdb, &h) == 0); +		/* First one should be subhash, next should be empty. */ +		ok1(is_subhash(h.group[0])); +		subhash = (h.group[0] & TDB_OFF_MASK); +		for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++) +			ok1(h.group[j] == 0); +		ok1(tdb_write_convert(tdb, h.group_start, +				      h.group, sizeof(h.group)) == 0); +		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +				      F_WRLCK) == 0); + +		/* Should be happy with expansion. */ +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in group 0, bucket 0. 
*/ +		ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 0); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS +		    + TDB_SUBLEVEL_HASH_BITS * 2); + +		/* We should be able to add it now. */ +		/* Allocate a new record. */ +		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, +				TDB_USED_MAGIC, false); +		ok1(!TDB_OFF_IS_ERR(new_off)); +		ok1(add_to_hash(tdb, &h, new_off) == 0); + +		/* Make sure we fill it in for later finding. */ +		off = new_off + sizeof(struct tdb_used_record); +		ok1(!tdb->methods->twrite(tdb, off, key.dptr, key.dsize)); +		off += key.dsize; +		ok1(!tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize)); + +		/* We should be able to unlock that OK. */ +		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +				      F_WRLCK) == 0); + +		/* Database should be consistent. */ +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Should be able to find it. */ +		v = 0; +		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) +		    == new_off); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in expanded group 0, bucket 0. 
*/ +		ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 0); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS +		    + TDB_SUBLEVEL_HASH_BITS * 2); + +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-05-readonly-open.c b/lib/tdb2/test/run-05-readonly-open.c new file mode 100644 index 0000000000..0f1a4343d8 --- /dev/null +++ b/lib/tdb2/test/run-05-readonly-open.c @@ -0,0 +1,88 @@ +#include <ccan/failtest/failtest_override.h> +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include <ccan/failtest/failtest.h> +#include "logging.h" +#include "failtest_helper.h" + +static bool failtest_suppress = false; + +/* Don't need to test everything here, just want read-only open testing.
*/ +static enum failtest_result +suppress_failure(struct failtest_call *history, unsigned num) +{ +	if (failtest_suppress) +		return FAIL_DONT_FAIL; +	return block_repeat_failures(history, num); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4), d; +	union tdb_attribute seed_attr; +	unsigned int msgs = 0; + +	failtest_init(argc, argv); +	failtest_hook = suppress_failure; +	failtest_exit_check = exit_check_log; + +	seed_attr.base.attr = TDB_ATTRIBUTE_SEED; +	seed_attr.base.next = &tap_log_attr; +	seed_attr.seed.seed = 0; + +	failtest_suppress = true; +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-05-readonly-open.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &seed_attr); +		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +		tdb_close(tdb); + +		failtest_suppress = false; +		tdb = tdb_open("run-05-readonly-open.tdb", flags[i], +			       O_RDONLY, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			break; +		ok1(tap_log_messages == msgs); +		/* Fetch should succeed, stores should fail. */ +		if (!ok1(tdb_fetch(tdb, key, &d) == 0)) +			goto fail; +		ok1(tdb_deq(d, data)); +		free(d.dptr); +		if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY) +			 == TDB_ERR_RDONLY)) +			goto fail; +		ok1(tap_log_messages == ++msgs); +		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) +			 == TDB_ERR_RDONLY)) +			goto fail; +		ok1(tap_log_messages == ++msgs); +		failtest_suppress = true; +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		tdb_close(tdb); +		ok1(tap_log_messages == msgs); +		/* SIGH: failtest bug, it doesn't save the tdb file because +		 * we have it read-only.  If we go around again, it gets +		 * changed underneath us and things get screwy.
*/ +		if (failtest_has_failed()) +			break; +	} +	failtest_exit(exit_status()); + +fail: +	failtest_suppress = true; +	tdb_close(tdb); +	failtest_exit(exit_status()); +} diff --git a/lib/tdb2/test/run-10-simple-store.c b/lib/tdb2/test/run-10-simple-store.c new file mode 100644 index 0000000000..35c387a3be --- /dev/null +++ b/lib/tdb2/test/run-10-simple-store.c @@ -0,0 +1,76 @@ +#include <ccan/failtest/failtest_override.h> +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include <ccan/failtest/failtest.h> +#include "logging.h" +#include "failtest_helper.h" + +static bool failtest_suppress = false; + +/* Don't need to test everything here, just want store testing. */ +static enum failtest_result +suppress_failure(struct failtest_call *history, unsigned num) +{ +	if (failtest_suppress) +		return FAIL_DONT_FAIL; +	return block_repeat_failures(history, num); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4); + +	failtest_init(argc, argv); +	failtest_hook = suppress_failure; +	failtest_exit_check = exit_check_log; + +	failtest_suppress = true; +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-10-simple-store.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			break; +		/* Modify should fail.
*/ +		failtest_suppress = false; +		if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY) +			 == TDB_ERR_NOEXIST)) +			goto fail; +		failtest_suppress = true; +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		/* Insert should succeed. */ +		failtest_suppress = false; +		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0)) +			goto fail; +		failtest_suppress = true; +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		/* Second insert should fail. */ +		failtest_suppress = false; +		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) +			 == TDB_ERR_EXISTS)) +			goto fail; +		failtest_suppress = true; +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		tdb_close(tdb); +	} +	ok1(tap_log_messages == 0); +	failtest_exit(exit_status()); + +fail: +	failtest_suppress = true; +	tdb_close(tdb); +	failtest_exit(exit_status()); +} diff --git a/lib/tdb2/test/run-11-simple-fetch.c b/lib/tdb2/test/run-11-simple-fetch.c new file mode 100644 index 0000000000..29b6bf0872 --- /dev/null +++ b/lib/tdb2/test/run-11-simple-fetch.c @@ -0,0 +1,76 @@ +#include <ccan/failtest/failtest_override.h> +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include <ccan/failtest/failtest.h> +#include "logging.h" +#include "failtest_helper.h" + +static bool failtest_suppress = false; + +/* Don't need to test everything here, just want fetch testing. 
*/ +static enum failtest_result +suppress_failure(struct failtest_call *history, unsigned num) +{ +	if (failtest_suppress) +		return FAIL_DONT_FAIL; +	return block_repeat_failures(history, num); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4); + +	failtest_init(argc, argv); +	failtest_hook = suppress_failure; +	failtest_exit_check = exit_check_log; + +	failtest_suppress = true; +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-11-simple-fetch.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (tdb) { +			struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ + +			/* fetch should fail. */ +			failtest_suppress = false; +			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST)) +				goto fail; +			failtest_suppress = true; +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			/* Insert should succeed. */ +			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			/* Fetch should now work. 
*/ +			failtest_suppress = false; +			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS)) +				goto fail; +			failtest_suppress = true; +			ok1(tdb_deq(d, data)); +			free(d.dptr); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			tdb_close(tdb); +		} +	} +	ok1(tap_log_messages == 0); +	return exit_status(); + +fail: +	failtest_suppress = true; +	tdb_close(tdb); +	failtest_exit(exit_status()); +} diff --git a/lib/tdb2/test/run-12-store.c b/lib/tdb2/test/run-12-store.c new file mode 100644 index 0000000000..ba2e4f8971 --- /dev/null +++ b/lib/tdb2/test/run-12-store.c @@ -0,0 +1,58 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +/* We use the same seed which we saw a failure on. */ +static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p) +{ +	return hash64_stable((const unsigned char *)key, len, +			     *(uint64_t *)p); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	uint64_t seed = 16014841315512641303ULL; +	union tdb_attribute fixed_hattr +		= { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +			      .fn = fixedhash, +			      .data = &seed } }; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; +	struct tdb_data data = { (unsigned char *)&j, sizeof(j) }; + +	fixed_hattr.base.next = &tap_log_attr; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-12-store.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* We seemed to lose some keys. 
+		 * Insert and check they're in there! */ +		for (j = 0; j < 500; j++) { +			struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ +			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); +			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +			ok1(tdb_deq(d, data)); +			free(d.dptr); +		} +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-13-delete.c b/lib/tdb2/test/run-13-delete.c new file mode 100644 index 0000000000..3b464d927e --- /dev/null +++ b/lib/tdb2/test/run-13-delete.c @@ -0,0 +1,207 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +/* We rig the hash so adjacent-numbered records always clash. */ +static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv) +{ +	return ((uint64_t)*(const unsigned int *)key) +		<< (64 - TDB_TOPLEVEL_HASH_BITS - 1); +} + +/* We use the same seed which we saw a failure on. */ +static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p) +{ +	return hash64_stable((const unsigned char *)key, len, +			     *(uint64_t *)p); +} + +/* Store 1000 int-keyed records (key == data == i), reading each one back + * straight after storing it. Returns false on the first store, fetch or + * comparison failure, true once all 1000 round-trip. */ +static bool store_records(struct tdb_context *tdb) +{ +	int i; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data d, data = { (unsigned char *)&i, sizeof(i) }; + +	for (i = 0; i < 1000; i++) { +		bool same; + +		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +			return false; +		/* d starts out uninitialised: only look at it once the +		 * fetch has reported success. */ +		if (tdb_fetch(tdb, key, &d) != TDB_SUCCESS) +			return false; +		same = tdb_deq(d, data); +		/* Release the fetched copy before any early return. */ +		free(d.dptr); +		if (!same) +			return false; +	} +	return true; +} + +static void test_val(struct tdb_context *tdb, uint64_t val) +{ +	uint64_t v; +	struct tdb_data key = { (unsigned char *)&v, sizeof(v) }; +	struct tdb_data d, data = { (unsigned char *)&v, sizeof(v) }; + +	/* Insert an entry, then delete it. 
*/ +	v = val; +	/* Delete should fail. */ +	ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Insert should succeed. */ +	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Delete should succeed. */ +	ok1(tdb_delete(tdb, key) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Re-add it, then add collision. */ +	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +	v = val + 1; +	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Can find both? */ +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); +	v = val; +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); + +	/* Delete second one. */ +	v = val + 1; +	ok1(tdb_delete(tdb, key) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Re-add */ +	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Now, try deleting first one. */ +	v = val; +	ok1(tdb_delete(tdb, key) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* Can still find second? */ +	v = val + 1; +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); + +	/* Now, this will be ideally placed. */ +	v = val + 2; +	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	/* This will collide with both. */ +	v = val; +	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); + +	/* We can still find them all, right? */ +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); +	v = val + 1; +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); +	v = val + 2; +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); + +	/* And if we delete val + 1, that val + 2 should not move! 
*/ +	v = val + 1; +	ok1(tdb_delete(tdb, key) == 0); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	v = val; +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); +	v = val + 2; +	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +	ok1(d.dsize == data.dsize); +	free(d.dptr); + +	/* Delete those two, so we are empty. */ +	ok1(tdb_delete(tdb, key) == 0); +	v = val; +	ok1(tdb_delete(tdb, key) == 0); + +	ok1(tdb_check(tdb, NULL, NULL) == 0); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	uint64_t seed = 16014841315512641303ULL; +	union tdb_attribute clash_hattr +		= { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +			      .fn = clash } }; +	union tdb_attribute fixed_hattr +		= { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +			      .fn = fixedhash, +			      .data = &seed } }; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	/* These two values gave trouble before. */ +	int vals[] = { 755, 837 }; + +	clash_hattr.base.next = &tap_log_attr; +	fixed_hattr.base.next = &tap_log_attr; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) +		   * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-13-delete.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* Check start of hash table. */ +		test_val(tdb, 0); + +		/* Check end of hash table. */ +		test_val(tdb, -1ULL); + +		/* Check mixed bitpattern. */ +		test_val(tdb, 0x123456789ABCDEF0ULL); + +		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 +				   && tdb->file->num_lockrecs == 0)); +		tdb_close(tdb); + +		/* Deleting these entries in the db gave problems. 
*/ +		tdb = tdb_open("run-13-delete.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr); +		ok1(tdb); +		if (!tdb) +			continue; + +		ok1(store_records(tdb)); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) { +			struct tdb_data key; + +			key.dptr = (unsigned char *)&vals[j]; +			key.dsize = sizeof(vals[j]); +			ok1(tdb_delete(tdb, key) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +		} +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-14-exists.c b/lib/tdb2/test/run-14-exists.c new file mode 100644 index 0000000000..f264a6f2c9 --- /dev/null +++ b/lib/tdb2/test/run-14-exists.c @@ -0,0 +1,57 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static bool test_records(struct tdb_context *tdb) +{ +	int i; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; + +	for (i = 0; i < 1000; i++) { +		if (tdb_exists(tdb, key)) +			return false; +		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +			return false; +		if (!tdb_exists(tdb, key)) +			return false; +	} + +	for (i = 0; i < 1000; i++) { +		if (!tdb_exists(tdb, key)) +			return false; +		if (tdb_delete(tdb, key) != 0) +			return false; +		if (tdb_exists(tdb, key)) +			return false; +	} +	return true; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-14-exists.tdb", flags[i], +			      
 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (ok1(tdb)) { +			ok1(test_records(tdb)); +			/* Only close a handle that was actually opened. */ +			tdb_close(tdb); +		} +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-15-append.c b/lib/tdb2/test/run-15-append.c new file mode 100644 index 0000000000..d2f9ec6598 --- /dev/null +++ b/lib/tdb2/test/run-15-append.c @@ -0,0 +1,135 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include <ccan/ilog/ilog.h> +#include "logging.h" + +#define MAX_SIZE 13100 +#define SIZE_STEP 131 + +static tdb_off_t tdb_offset(struct tdb_context *tdb, struct tdb_data key) +{ +	tdb_off_t off; +	struct tdb_used_record rec; +	struct hash_info h; + +	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); +	if (TDB_OFF_IS_ERR(off)) +		return 0; +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); +	return off; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j, moves; +	struct tdb_context *tdb; +	unsigned char *buffer; +	tdb_off_t oldoff = 0, newoff; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data; + +	buffer = malloc(MAX_SIZE); +	for (i = 0; i < MAX_SIZE; i++) +		buffer[i] = i; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) +		   * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7) +		   + 1); + +	/* Using tdb_store. 
*/ +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-append.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		moves = 0; +		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) { +			data.dptr = buffer; +			data.dsize = j; +			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); +			ok1(data.dsize == j); +			ok1(memcmp(data.dptr, buffer, data.dsize) == 0); +			free(data.dptr); +			newoff = tdb_offset(tdb, key); +			if (newoff != oldoff) +				moves++; +			oldoff = newoff; +		} +		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 +				   && tdb->file->num_lockrecs == 0)); +		/* We should increase by 50% each time... */ +		ok(moves <= ilog64(j / SIZE_STEP)*2, "Moved %u times", moves); +		tdb_close(tdb); +	} + +	/* Using tdb_append. */ +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		size_t prev_len = 0; +		tdb = tdb_open("run-append.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		moves = 0; +		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) { +			data.dptr = buffer + prev_len; +			data.dsize = j - prev_len; +			ok1(tdb_append(tdb, key, data) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); +			ok1(data.dsize == j); +			ok1(memcmp(data.dptr, buffer, data.dsize) == 0); +			free(data.dptr); +			prev_len = data.dsize; +			newoff = tdb_offset(tdb, key); +			if (newoff != oldoff) +				moves++; +			oldoff = newoff; +		} +		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 +				   && tdb->file->num_lockrecs == 0)); +		/* We should increase by 50% each time... 
*/ +		ok(moves <= ilog64(j / SIZE_STEP)*2, "Moved %u times", moves); +		tdb_close(tdb); +	} + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-append.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* Huge initial store. */ +		data.dptr = buffer; +		data.dsize = MAX_SIZE; +		ok1(tdb_append(tdb, key, data) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); +		ok1(data.dsize == MAX_SIZE); +		ok1(memcmp(data.dptr, buffer, data.dsize) == 0); +		free(data.dptr); +		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 +				   && tdb->file->num_lockrecs == 0)); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	free(buffer); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-16-wipe_all.c b/lib/tdb2/test/run-16-wipe_all.c new file mode 100644 index 0000000000..d9c5128e0b --- /dev/null +++ b/lib/tdb2/test/run-16-wipe_all.c @@ -0,0 +1,50 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static bool add_records(struct tdb_context *tdb) +{ +	int i; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; + +	for (i = 0; i < 1000; i++) { +		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +			return false; +	} +	return true; +} + + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = 
tdb_open("run-16-wipe_all.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (ok1(tdb)) { +			struct tdb_data key; +			ok1(add_records(tdb)); +			ok1(tdb_wipe_all(tdb) == TDB_SUCCESS); +			ok1(tdb_firstkey(tdb, &key) == TDB_ERR_NOEXIST); +			tdb_close(tdb); +		} +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-20-growhash.c b/lib/tdb2/test/run-20-growhash.c new file mode 100644 index 0000000000..22a88c4504 --- /dev/null +++ b/lib/tdb2/test/run-20-growhash.c @@ -0,0 +1,144 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static uint64_t myhash(const void *key, size_t len, uint64_t seed, void *priv) +{ +	return *(const uint64_t *)key; +} + +static void add_bits(uint64_t *val, unsigned new, unsigned new_bits, +		     unsigned *done) +{ +	*done += new_bits; +	*val |= ((uint64_t)new << (64 - *done)); +} + +static uint64_t make_key(unsigned topgroup, unsigned topbucket, +			 unsigned subgroup1, unsigned subbucket1, +			 unsigned subgroup2, unsigned subbucket2) +{ +	uint64_t key = 0; +	unsigned done = 0; + +	add_bits(&key, topgroup, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS, +		 &done); +	add_bits(&key, topbucket, TDB_HASH_GROUP_BITS, &done); +	add_bits(&key, subgroup1, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS, +		 &done); +	add_bits(&key, subbucket1, TDB_HASH_GROUP_BITS, &done); +	add_bits(&key, subgroup2, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS, +		 &done); +	add_bits(&key, subbucket2, TDB_HASH_GROUP_BITS, &done); +	return key; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	uint64_t kdata; +	struct tdb_used_record rec; +	struct tdb_data key = { (unsigned char *)&kdata, sizeof(kdata) }; +	struct tdb_data 
dbuf = { (unsigned char *)&kdata, sizeof(kdata) }; +	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +						.fn = myhash } }; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT, +	}; + +	hattr.base.next = &tap_log_attr; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) +		   * (9 + (20 + 2 * ((1 << TDB_HASH_GROUP_BITS) - 2)) +		      * (1 << TDB_HASH_GROUP_BITS)) + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		struct hash_info h; + +		tdb = tdb_open("run-04-basichash.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* Fill a group. */ +		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { +			kdata = make_key(0, j, 0, 0, 0, 0); +			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +		} +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Check first still exists. */ +		kdata = make_key(0, 0, 0, 0, 0, 0); +		ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL) != 0); +		/* Should have created correct hash. */ +		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +		/* Should have located space in group 0, bucket 0. */ +		ok1(h.group_start == offsetof(struct tdb_header, hashtable)); +		ok1(h.home_bucket == 0); +		ok1(h.found_bucket == 0); +		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); +		/* Entire group should be full! */ +		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) +			ok1(h.group[j] != 0); + +		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +				      F_RDLCK) == 0); + +		/* Now, add one more to each should expand (that) bucket. */ +		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { +			unsigned int k; +			kdata = make_key(0, j, 0, 1, 0, 0); +			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); + +			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL)); +			/* Should have created correct hash. 
*/ +			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +			/* Should have moved to subhash */ +			ok1(h.group_start >= sizeof(struct tdb_header)); +			ok1(h.home_bucket == 1); +			ok1(h.found_bucket == 1); +			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS +			    + TDB_SUBLEVEL_HASH_BITS); +			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +					      F_RDLCK) == 0); + +			/* Keep adding, make it expand again. */ +			for (k = 2; k < (1 << TDB_HASH_GROUP_BITS); k++) { +				kdata = make_key(0, j, 0, k, 0, 0); +				ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +				ok1(tdb_check(tdb, NULL, NULL) == 0); +			} + +			/* This should tip it over to sub-sub-hash. */ +			kdata = make_key(0, j, 0, 0, 0, 1); +			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); + +			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL)); +			/* Should have created correct hash. */ +			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); +			/* Should have moved to subhash */ +			ok1(h.group_start >= sizeof(struct tdb_header)); +			ok1(h.home_bucket == 1); +			ok1(h.found_bucket == 1); +			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS +			    + TDB_SUBLEVEL_HASH_BITS + TDB_SUBLEVEL_HASH_BITS); +			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, +					      F_RDLCK) == 0); +		} +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-21-parse_record.c b/lib/tdb2/test/run-21-parse_record.c new file mode 100644 index 0000000000..773cdff4e0 --- /dev/null +++ b/lib/tdb2/test/run-21-parse_record.c @@ -0,0 +1,70 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static enum TDB_ERROR parse(TDB_DATA key, TDB_DATA data, TDB_DATA *expected) +{ +	if (!tdb_deq(data, 
*expected)) +		return TDB_ERR_EINVAL; +	return TDB_SUCCESS; +} + +static enum TDB_ERROR parse_err(TDB_DATA key, TDB_DATA data, void *unused) +{ +	return 100; +} + +static bool test_records(struct tdb_context *tdb) +{ +	int i; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; + +	for (i = 0; i < 1000; i++) { +		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +			return false; +	} + +	for (i = 0; i < 1000; i++) { +		if (tdb_parse_record(tdb, key, parse, &data) != TDB_SUCCESS) +			return false; +	} + +	if (tdb_parse_record(tdb, key, parse, &data) != TDB_ERR_NOEXIST) +		return false; + +	/* Test error return from parse function. */ +	i = 0; +	if (tdb_parse_record(tdb, key, parse_err, NULL) != 100) +		return false; + +	return true; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-14-exists.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (ok1(tdb)) +			ok1(test_records(tdb)); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-25-hashoverload.c b/lib/tdb2/test/run-25-hashoverload.c new file mode 100644 index 0000000000..83f549d6b2 --- /dev/null +++ b/lib/tdb2/test/run-25-hashoverload.c @@ -0,0 +1,121 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv) +{ +	
return 0; +} + +static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p) +{ +	if (p) +		return tdb_delete(tdb, key); +	return 0; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; +	struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) }; +	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +						.fn = badhash } }; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT, +	}; + +	hattr.base.next = &tap_log_attr; + +	plan_tests(6883); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ + +		tdb = tdb_open("run-25-hashoverload.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* Fill a group. */ +		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { +			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +		} +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Now store one last value: should form chain. */ +		ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Check we can find them all. */ +		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) { +			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +			ok1(d.dsize == sizeof(j)); +			ok1(d.dptr != NULL); +			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0); +			free(d.dptr); +		} + +		/* Now add a *lot* more. */ +		for (j = (1 << TDB_HASH_GROUP_BITS) + 1; +		     j < (16 << TDB_HASH_GROUP_BITS); +		     j++) { +			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +			ok1(d.dsize == sizeof(j)); +			ok1(d.dptr != NULL); +			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0); +			free(d.dptr); +		} +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Traverse through them. 
*/ +		ok1(tdb_traverse(tdb, trav, NULL) == j); + +		/* Empty the first chain-worth. */ +		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) +			ok1(tdb_delete(tdb, key) == 0); + +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		for (j = (1 << TDB_HASH_GROUP_BITS); +		     j < (16 << TDB_HASH_GROUP_BITS); +		     j++) { +			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +			ok1(d.dsize == sizeof(j)); +			ok1(d.dptr != NULL); +			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0); +			free(d.dptr); +		} + +		/* Traverse through them. */ +		ok1(tdb_traverse(tdb, trav, NULL) +		    == (15 << TDB_HASH_GROUP_BITS)); + +		/* Re-add */ +		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { +			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); +		} +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Now try deleting as we go. */ +		ok1(tdb_traverse(tdb, trav, trav) +		    == (16 << TDB_HASH_GROUP_BITS)); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tdb_traverse(tdb, trav, NULL) == 0); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-30-exhaust-before-expand.c b/lib/tdb2/test/run-30-exhaust-before-expand.c new file mode 100644 index 0000000000..2386f85f26 --- /dev/null +++ b/lib/tdb2/test/run-30-exhaust-before-expand.c @@ -0,0 +1,79 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include <err.h> +#include "logging.h" + +static bool empty_freetable(struct tdb_context *tdb) +{ +	struct tdb_freetable ftab; +	unsigned int i; + +	/* Now, free table should be completely exhausted in zone 0 */ +	if (tdb_read_convert(tdb, tdb->ftable_off, &ftab, sizeof(ftab)) != 0) +		abort(); + +	for (i = 0; i < sizeof(ftab.buckets)/sizeof(ftab.buckets[0]); i++) { +		if (ftab.buckets[i]) +			return false; +	} +	return true; +} + + 
+int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1); + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		TDB_DATA k; +		uint64_t size; +		bool was_empty = false; + +		k.dptr = (void *)&j; +		k.dsize = sizeof(j); + +		tdb = tdb_open("run-30-exhaust-before-expand.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		ok1(empty_freetable(tdb)); +		/* Need some hash lock for expand. */ +		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); +		/* Create some free space. */ +		ok1(tdb_expand(tdb, 1) == 0); +		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(!empty_freetable(tdb)); + +		size = tdb->file->map_size; +		/* Insert minimal-length records until we expand. */ +		for (j = 0; tdb->file->map_size == size; j++) { +			was_empty = empty_freetable(tdb); +			if (tdb_store(tdb, k, k, TDB_INSERT) != 0) +				err(1, "Failed to store record %i", j); +		} + +		/* Would have been empty before expansion, but no longer. 
*/ +		ok1(was_empty); +		ok1(!empty_freetable(tdb)); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-50-multiple-freelists.c b/lib/tdb2/test/run-50-multiple-freelists.c new file mode 100644 index 0000000000..7a48c3e0ee --- /dev/null +++ b/lib/tdb2/test/run-50-multiple-freelists.c @@ -0,0 +1,71 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include <ccan/tdb2/transaction.c> +#include "logging.h" +#include "layout.h" + +int main(int argc, char *argv[]) +{ +	tdb_off_t off; +	struct tdb_context *tdb; +	struct tdb_layout *layout; +	TDB_DATA key, data; + +	plan_tests(11); +	key = tdb_mkdata("Hello", 5); +	data = tdb_mkdata("world", 5); + +	/* Create a TDB with three free tables. */ +	layout = new_tdb_layout(NULL); +	tdb_layout_add_freetable(layout); +	tdb_layout_add_freetable(layout); +	tdb_layout_add_freetable(layout); +	tdb_layout_add_free(layout, 80, 0); +	/* Used record prevent coalescing. 
*/ +	tdb_layout_add_used(layout, key, data, 6); +	tdb_layout_add_free(layout, 160, 1); +	key.dsize--; +	tdb_layout_add_used(layout, key, data, 7); +	tdb_layout_add_free(layout, 320, 2); +	key.dsize--; +	tdb_layout_add_used(layout, key, data, 8); +	tdb_layout_add_free(layout, 40, 0); +	tdb = tdb_layout_get(layout); +	ok1(tdb_check(tdb, NULL, NULL) == 0); + +	off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0, +		       TDB_USED_MAGIC, 0); +	ok1(off == layout->elem[3].base.off); +	ok1(tdb->ftable_off == layout->elem[0].base.off); + +	off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0, +		       TDB_USED_MAGIC, 0); +	ok1(off == layout->elem[5].base.off); +	ok1(tdb->ftable_off == layout->elem[1].base.off); + +	off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0, +		       TDB_USED_MAGIC, 0); +	ok1(off == layout->elem[7].base.off); +	ok1(tdb->ftable_off == layout->elem[2].base.off); + +	off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0, +		       TDB_USED_MAGIC, 0); +	ok1(off == layout->elem[9].base.off); +	ok1(tdb->ftable_off == layout->elem[0].base.off); + +	/* Now we fail. 
*/ +	off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0); +	ok1(off == 0); + +	tdb_close(tdb); +	tdb_layout_free(layout); + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-55-transaction.c b/lib/tdb2/test/run-55-transaction.c new file mode 100644 index 0000000000..1650e40e1f --- /dev/null +++ b/lib/tdb2/test/run-55-transaction.c @@ -0,0 +1,75 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	unsigned char *buffer; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data; + +	buffer = malloc(1000); +	for (i = 0; i < 1000; i++) +		buffer[i] = i; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1); + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-55-transaction.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		ok1(tdb_transaction_start(tdb) == 0); +		data.dptr = buffer; +		data.dsize = 1000; +		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); +		ok1(data.dsize == 1000); +		ok1(memcmp(data.dptr, buffer, data.dsize) == 0); +		free(data.dptr); + +		/* Cancelling a transaction means no store */ +		tdb_transaction_cancel(tdb); +		ok1(tdb->file->allrecord_lock.count == 0 +		    && tdb->file->num_lockrecs == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST); + +		/* Commit the transaction. 
*/ +		ok1(tdb_transaction_start(tdb) == 0); +		data.dptr = buffer; +		data.dsize = 1000; +		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); +		ok1(data.dsize == 1000); +		ok1(memcmp(data.dptr, buffer, data.dsize) == 0); +		free(data.dptr); +		ok1(tdb_transaction_commit(tdb) == 0); +		ok1(tdb->file->allrecord_lock.count == 0 +		    && tdb->file->num_lockrecs == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); +		ok1(data.dsize == 1000); +		ok1(memcmp(data.dptr, buffer, data.dsize) == 0); +		free(data.dptr); + +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	free(buffer); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-56-open-during-transaction.c b/lib/tdb2/test/run-56-open-during-transaction.c new file mode 100644 index 0000000000..96107d637e --- /dev/null +++ b/lib/tdb2/test/run-56-open-during-transaction.c @@ -0,0 +1,175 @@ +#include "config.h" +#include <unistd.h> +#include "lock-tracking.h" + +static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset); +static ssize_t write_check(int fd, const void *buf, size_t count); +static int ftruncate_check(int fd, off_t length); + +#define pwrite pwrite_check +#define write write_check +#define fcntl fcntl_with_lockcheck +#define ftruncate ftruncate_check + +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdarg.h> +#include <err.h> +#include "external-agent.h" +#include "logging.h" + +static struct agent *agent; +static bool opened; +static int errors = 0; +#define TEST_DBNAME "run-56-open-during-transaction.tdb" + +#undef write +#undef pwrite +#undef fcntl +#undef ftruncate + +static bool is_same(const char *snapshot, 
const char *latest, off_t len) +{ +	unsigned i; + +	for (i = 0; i < len; i++) { +		if (snapshot[i] != latest[i]) +			return false; +	} +	return true; +} + +static bool compare_file(int fd, const char *snapshot, off_t snapshot_len) +{ +	char *contents; +	bool same; + +	/* over-length read serves as length check. */ +	contents = malloc(snapshot_len+1); +	same = pread(fd, contents, snapshot_len+1, 0) == snapshot_len +		&& is_same(snapshot, contents, snapshot_len); +	free(contents); +	return same; +} + +static void check_file_intact(int fd) +{ +	enum agent_return ret; +	struct stat st; +	char *contents; + +	fstat(fd, &st); +	contents = malloc(st.st_size); +	if (pread(fd, contents, st.st_size, 0) != st.st_size) { +		diag("Read fail"); +		errors++; +		return; +	} + +	/* Ask agent to open file. */ +	ret = external_agent_operation(agent, OPEN, TEST_DBNAME); + +	/* It's OK to open it, but it must not have changed! */ +	if (!compare_file(fd, contents, st.st_size)) { +		diag("Agent changed file after opening %s", +		     agent_return_name(ret)); +		errors++; +	} + +	if (ret == SUCCESS) { +		ret = external_agent_operation(agent, CLOSE, NULL); +		if (ret != SUCCESS) { +			diag("Agent failed to close tdb: %s", +			     agent_return_name(ret)); +			errors++; +		} +	} else if (ret != WOULD_HAVE_BLOCKED) { +		diag("Agent opening file gave %s", +		     agent_return_name(ret)); +		errors++; +	} + +	free(contents); +} + +static void after_unlock(int fd) +{ +	if (opened) +		check_file_intact(fd); +} + +static ssize_t pwrite_check(int fd, +			    const void *buf, size_t count, off_t offset) +{ +	if (opened) +		check_file_intact(fd); + +	return pwrite(fd, buf, count, offset); +} + +static ssize_t write_check(int fd, const void *buf, size_t count) +{ +	if (opened) +		check_file_intact(fd); + +	return write(fd, buf, count); +} + +static int ftruncate_check(int fd, off_t length) +{ +	if (opened) +		check_file_intact(fd); + +	return ftruncate(fd, length); + +} + +int main(int argc, char 
*argv[]) +{ +	const int flags[] = { TDB_DEFAULT, +			      TDB_NOMMAP, +			      TDB_CONVERT, +			      TDB_CONVERT | TDB_NOMMAP }; +	int i; +	struct tdb_context *tdb; +	TDB_DATA key, data; + +	plan_tests(20); +	agent = prepare_external_agent(); +	if (!agent) +		err(1, "preparing agent"); + +	unlock_callback = after_unlock; +	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) { +		diag("Test with %s and %s\n", +		     (flags[i] & TDB_CONVERT) ? "CONVERT" : "DEFAULT", +		     (flags[i] & TDB_NOMMAP) ? "no mmap" : "mmap"); +		unlink(TEST_DBNAME); +		tdb = tdb_open(TEST_DBNAME, flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); + +		opened = true; +		ok1(tdb_transaction_start(tdb) == 0); +		key = tdb_mkdata("hi", strlen("hi")); +		data = tdb_mkdata("world", strlen("world")); + +		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +		ok1(tdb_transaction_commit(tdb) == 0); +		ok(!errors, "We had %u open errors", errors); + +		opened = false; +		tdb_close(tdb); +	} + +	return exit_status(); +} diff --git a/lib/tdb2/test/run-57-die-during-transaction.c b/lib/tdb2/test/run-57-die-during-transaction.c new file mode 100644 index 0000000000..84f01eb21a --- /dev/null +++ b/lib/tdb2/test/run-57-die-during-transaction.c @@ -0,0 +1,275 @@ +#include "config.h" +#include <unistd.h> +#include "lock-tracking.h" +#include <ccan/tap/tap.h> +#include <stdlib.h> +#include <assert.h> +static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset); +static ssize_t write_check(int fd, const void *buf, size_t count); +static int ftruncate_check(int fd, off_t length); + +#define pwrite pwrite_check +#define write write_check +#define fcntl fcntl_with_lockcheck +#define ftruncate ftruncate_check + +/* There's a malloc inside transaction_setup_recovery, and valgrind complains + * when we longjmp and leak it. 
#define MAX_ALLOCATIONS 200
static void *allocated[MAX_ALLOCATIONS];

/* Index of the first tracking slot that holds p, or MAX_ALLOCATIONS if none. */
static unsigned int slot_holding(const void *p)
{
	unsigned int idx = 0;

	while (idx < MAX_ALLOCATIONS && allocated[idx] != p)
		idx++;
	return idx;
}

/*
 * malloc() replacement that records the returned pointer in the first empty
 * slot of `allocated`, so free_all() can release it later.  Aborts when all
 * MAX_ALLOCATIONS slots are in use.
 */
static void *malloc_noleak(size_t len)
{
	unsigned int idx = slot_holding(NULL);

	if (idx == MAX_ALLOCATIONS) {
		diag("Too many allocations!");
		abort();
	}

	allocated[idx] = malloc(len);
	return allocated[idx];
}

/* free() replacement: forget p if it is being tracked, then release it. */
static void free_noleak(void *p)
{
	/* We don't catch realloc, so don't care if we miss one. */
	unsigned int idx = slot_holding(p);

	if (idx < MAX_ALLOCATIONS)
		allocated[idx] = NULL;
	free(p);
}

/* Release every pointer still being tracked and empty the table. */
static void free_all(void)
{
	void **slot;

	for (slot = allocated; slot < allocated + MAX_ALLOCATIONS; slot++) {
		free(*slot);
		*slot = NULL;
	}
}
/*
 * Simulate this process dying at every possible point of a transaction.
 *
 * Each pass recreates the database, stores KEY_STRING, has the agent open
 * the file, and then runs a one-store transaction.  maybe_die() longjmps
 * back here on the hook call whose index equals `target`; the setjmp branch
 * then plays the part of the survivor: it asks the agent whether recovery
 * is needed, has it perform `op`, checks recovery is no longer needed, runs
 * a CHECK and a CLOSE, bumps `target` and starts over.  The loop ends when
 * a pass commits without being interrupted.
 *
 * op:    operation the external agent performs after each simulated death.
 * agent: external agent used to inspect the database from another process.
 *
 * Returns true when the final pass completed and the agent closed cleanly,
 * false on any failure.  Emits three ok1() results on the success path.
 */
static bool test_death(enum operation op, struct agent *agent)
{
	struct tdb_context *tdb = NULL;
	TDB_DATA key;
	enum agent_return ret;
	int needed_recovery = 0;

	current = target = 0;
reset:
	/* Every pass starts from a freshly created database. */
	unlink(TEST_DBNAME);
	tdb = tdb_open(TEST_DBNAME, TDB_NOMMAP,
		       O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr);
	if (!tdb) {
		diag("Failed opening TDB: %s", strerror(errno));
		return false;
	}

	if (setjmp(jmpbuf) != 0) {
		/* We're partway through.  Simulate our death. */
		close(tdb->file->fd);
		forget_locking();
		in_transaction = false;

		/* SUCCESS and FAILED are both acceptable answers here; only
		 * SUCCESS is counted towards needed_recovery. */
		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
		if (ret == SUCCESS)
			needed_recovery++;
		else if (ret != FAILED) {
			diag("Step %u agent NEEDS_RECOVERY = %s", current,
			     agent_return_name(ret));
			return false;
		}

		/* The operation under test must succeed despite our death. */
		ret = external_agent_operation(agent, op, KEY_STRING);
		if (ret != SUCCESS) {
			diag("Step %u op %s failed = %s", current,
			     operation_name(op),
			     agent_return_name(ret));
			return false;
		}

		/* After that operation the agent must report FAILED, i.e.
		 * recovery is no longer needed. */
		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
		if (ret != FAILED) {
			diag("Still needs recovery after step %u = %s",
			     current, agent_return_name(ret));
			return false;
		}

		ret = external_agent_operation(agent, CHECK, "");
		if (ret != SUCCESS) {
			diag("Step %u check failed = %s", current,
			     agent_return_name(ret));
			return false;
		}

		ret = external_agent_operation(agent, CLOSE, "");
		if (ret != SUCCESS) {
			diag("Step %u close failed = %s", current,
			     agent_return_name(ret));
			return false;
		}

		/* Suppress logging as this tries to use closed fd. */
		suppress_logging = true;
		suppress_lockcheck = true;
		tdb_close(tdb);
		suppress_logging = false;
		suppress_lockcheck = false;
		/* Next pass dies one hook call later; release anything the
		 * interrupted transaction allocated via malloc_noleak. */
		target++;
		current = 0;
		free_all();
		goto reset;
	}

	/* Put key for agent to fetch. */
	key = tdb_mkdata(KEY_STRING, strlen(KEY_STRING));
	if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
		return false;

	/* This is the key we insert in transaction. */
	/* (One byte shorter than KEY_STRING, so distinct from the key above.) */
	key.dsize--;

	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
	if (ret != SUCCESS)
		errx(1, "Agent failed to open: %s", agent_return_name(ret));

	ret = external_agent_operation(agent, FETCH, KEY_STRING);
	if (ret != SUCCESS)
		errx(1, "Agent failed find key: %s", agent_return_name(ret));

	/* From here maybe_die() is armed until the commit returns. */
	in_transaction = true;
	if (tdb_transaction_start(tdb) != 0)
		return false;

	if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
		return false;

	if (tdb_transaction_commit(tdb) != 0)
		return false;

	in_transaction = false;

	/* We made it! */
	diag("Completed %u runs", current);
	tdb_close(tdb);
	ret = external_agent_operation(agent, CLOSE, "");
	if (ret != SUCCESS) {
		diag("Step %u close failed = %s", current,
		     agent_return_name(ret));
		return false;
	}

	/* At least one simulated death must have left recovery pending. */
	ok1(needed_recovery);
	ok1(locking_errors == 0);
	ok1(forget_locking() == 0);
	locking_errors = 0;
	return true;
}

/*
 * Run test_death() once per agent operation.  Each run contributes four
 * results (the ok1 here plus three inside test_death), hence plan_tests(12).
 */
int main(int argc, char *argv[])
{
	enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
	struct agent *agent;
	int i;

	plan_tests(12);
	/* Unlock points are possible death points too. */
	unlock_callback = maybe_die;

	agent = prepare_external_agent();
	if (!agent)
		err(1, "preparing agent");

	for (i = 0; i < sizeof(ops)/sizeof(ops[0]); i++) {
		diag("Testing %s after death", operation_name(ops[i]));
		ok1(test_death(ops[i], agent));
	}

	free_external_agent(agent);
	return exit_status();
}
-0,0 +1,80 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	if (sizeof(off_t) <= 4) { +		plan_tests(1); +		pass("No 64 bit off_t"); +		return exit_status(); +	} + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		off_t old_size; +		TDB_DATA k, d; +		struct hash_info h; +		struct tdb_used_record rec; +		tdb_off_t off; + +		tdb = tdb_open("run-64-bit-tdb.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		old_size = tdb->file->map_size; + +		/* This makes a sparse file */ +		ok1(ftruncate(tdb->file->fd, 0xFFFFFFF0) == 0); +		ok1(add_free_record(tdb, old_size, 0xFFFFFFF0 - old_size, +				    TDB_LOCK_WAIT, false) == TDB_SUCCESS); + +		/* Now add a little record past the 4G barrier. */ +		ok1(tdb_expand_file(tdb, 100) == TDB_SUCCESS); +		ok1(add_free_record(tdb, 0xFFFFFFF0, 100, TDB_LOCK_WAIT, false) +		    == TDB_SUCCESS); + +		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); + +		/* Test allocation path. */ +		k = tdb_mkdata("key", 4); +		d = tdb_mkdata("data", 5); +		ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); + +		/* Make sure it put it at end as we expected. 
*/ +		off = find_and_lock(tdb, k, F_RDLCK, &h, &rec, NULL); +		ok1(off >= 0xFFFFFFF0); +		tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); + +		ok1(tdb_fetch(tdb, k, &d) == 0); +		ok1(d.dsize == 5); +		ok1(strcmp((char *)d.dptr, "data") == 0); +		free(d.dptr); + +		ok1(tdb_delete(tdb, k) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); + +		tdb_close(tdb); +	} + +	/* We might get messages about mmap failing, so don't test +	 * tap_log_messages */ +	return exit_status(); +} diff --git a/lib/tdb2/test/run-80-tdb_fd.c b/lib/tdb2/test/run-80-tdb_fd.c new file mode 100644 index 0000000000..e8b2fae2dd --- /dev/null +++ b/lib/tdb2/test/run-80-tdb_fd.c @@ -0,0 +1,35 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-new_database.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			continue; + +		if (flags[i] & TDB_INTERNAL) +			ok1(tdb_fd(tdb) == -1); +		else +			ok1(tdb_fd(tdb) > 2); +		tdb_close(tdb); +		ok1(tap_log_messages == 0); +	} +	return exit_status(); +} diff --git a/lib/tdb2/test/run-81-seqnum.c b/lib/tdb2/test/run-81-seqnum.c new file mode 100644 index 0000000000..6e8b2698b6 --- /dev/null +++ b/lib/tdb2/test/run-81-seqnum.c @@ -0,0 +1,71 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include 
<ccan/tdb2/transaction.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4); +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 4 * 13); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-new_database.tdb", flags[i]|TDB_SEQNUM, +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			continue; + +		ok1(tdb_get_seqnum(tdb) == 0); +		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +		ok1(tdb_get_seqnum(tdb) == 1); +		/* Fetch doesn't change seqnum */ +		if (ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS)) +			free(d.dptr); +		ok1(tdb_get_seqnum(tdb) == 1); +		ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); +		ok1(tdb_get_seqnum(tdb) == 2); + +		ok1(tdb_delete(tdb, key) == TDB_SUCCESS); +		ok1(tdb_get_seqnum(tdb) == 3); +		/* Empty append works */ +		ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); +		ok1(tdb_get_seqnum(tdb) == 4); + +		ok1(tdb_wipe_all(tdb) == TDB_SUCCESS); +		ok1(tdb_get_seqnum(tdb) == 5); + +		if (!(flags[i] & TDB_INTERNAL)) { +			ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); +			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +			ok1(tdb_get_seqnum(tdb) == 6); +			ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); +			ok1(tdb_get_seqnum(tdb) == 7); +			ok1(tdb_delete(tdb, key) == TDB_SUCCESS); +			ok1(tdb_get_seqnum(tdb) == 8); +			ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); +			ok1(tdb_get_seqnum(tdb) == 8); + +			ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); +			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +			ok1(tdb_get_seqnum(tdb) == 9); +			
tdb_transaction_cancel(tdb); +			ok1(tdb_get_seqnum(tdb) == 8); +		} +		tdb_close(tdb); +		ok1(tap_log_messages == 0); +	} +	return exit_status(); +} diff --git a/lib/tdb2/test/run-82-lockattr.c b/lib/tdb2/test/run-82-lockattr.c new file mode 100644 index 0000000000..bfc2653222 --- /dev/null +++ b/lib/tdb2/test/run-82-lockattr.c @@ -0,0 +1,263 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag, +		  void *_err) +{ +	int *lock_err = _err; +	struct flock fl; +	int ret; + +	if (*lock_err) { +		errno = *lock_err; +		return -1; +	} + +	do { +		fl.l_type = rw; +		fl.l_whence = SEEK_SET; +		fl.l_start = off; +		fl.l_len = len; + +		if (waitflag) +			ret = fcntl(fd, F_SETLKW, &fl); +		else +			ret = fcntl(fd, F_SETLK, &fl); +	} while (ret != 0 && errno == EINTR); + +	return ret; +} + +static int myunlock(int fd, int rw, off_t off, off_t len, void *_err) +{ +	int *lock_err = _err; +	struct flock fl; +	int ret; + +	if (*lock_err) { +		errno = *lock_err; +		return -1; +	} + +	do { +		fl.l_type = F_UNLCK; +		fl.l_whence = SEEK_SET; +		fl.l_start = off; +		fl.l_len = len; + +		ret = fcntl(fd, F_SETLKW, &fl); +	} while (ret != 0 && errno == EINTR); + +	return ret; +} + +static int trav_err; +static int trav(struct tdb_context *tdb, TDB_DATA k, TDB_DATA d, int *err) +{ +	*err = trav_err; +	return 0; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; +	union tdb_attribute lock_attr; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4); +	int lock_err; + +	lock_attr.base.attr = 
TDB_ATTRIBUTE_FLOCK; +	lock_attr.base.next = &tap_log_attr; +	lock_attr.flock.lock = mylock; +	lock_attr.flock.unlock = myunlock; +	lock_attr.flock.data = &lock_err; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 80); + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		struct tdb_data d; + +		/* Nonblocking open; expect no error message. */ +		lock_err = EAGAIN; +		tdb = tdb_open("run-82-lockattr.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); +		ok(errno == lock_err, "Errno is %u", errno); +		ok1(!tdb); +		ok1(tap_log_messages == 0); + +		lock_err = EINTR; +		tdb = tdb_open("run-82-lockattr.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); +		ok(errno == lock_err, "Errno is %u", errno); +		ok1(!tdb); +		ok1(tap_log_messages == 0); + +		/* Forced fail open. */ +		lock_err = ENOMEM; +		tdb = tdb_open("run-82-lockattr.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); +		ok1(errno == lock_err); +		ok1(!tdb); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		lock_err = 0; +		tdb = tdb_open("run-82-lockattr.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); +		if (!ok1(tdb)) +			continue; +		ok1(tap_log_messages == 0); + +		/* Nonblocking store. */ +		lock_err = EAGAIN; +		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		/* Nonblocking fetch. 
*/ +		lock_err = EAGAIN; +		ok1(!tdb_exists(tdb, key)); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(!tdb_exists(tdb, key)); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(!tdb_exists(tdb, key)); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		lock_err = EAGAIN; +		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		/* Nonblocking delete. */ +		lock_err = EAGAIN; +		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		/* Nonblocking locks. */ +		lock_err = EAGAIN; +		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		lock_err = EAGAIN; +		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		lock_err = EAGAIN; +		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK); +		/* This actually does divide and conquer. 
*/ +		ok1(tap_log_messages > 0); +		tap_log_messages = 0; + +		lock_err = EAGAIN; +		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages > 0); +		tap_log_messages = 0; + +		/* Nonblocking traverse; go nonblock partway through. */ +		lock_err = 0; +		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); +		trav_err = EAGAIN; +		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		trav_err = EINTR; +		lock_err = 0; +		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		trav_err = ENOMEM; +		lock_err = 0; +		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		/* Nonblocking transactions. */ +		lock_err = EAGAIN; +		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = EINTR; +		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); +		lock_err = ENOMEM; +		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		tap_log_messages = 0; + +		/* Nonblocking transaction prepare. */ +		lock_err = 0; +		ok1(tdb_transaction_start(tdb) == 0); +		ok1(tdb_delete(tdb, key) == 0); + +		lock_err = EAGAIN; +		ok1(tdb_transaction_prepare_commit(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); + +		lock_err = 0; +		ok1(tdb_transaction_prepare_commit(tdb) == 0); +		ok1(tdb_transaction_commit(tdb) == 0); + +		/* And the transaction was committed, right? 
*/ +		ok1(!tdb_exists(tdb, key)); +		tdb_close(tdb); +		ok1(tap_log_messages == 0); +	} +	return exit_status(); +} diff --git a/lib/tdb2/test/run-83-openhook.c b/lib/tdb2/test/run-83-openhook.c new file mode 100644 index 0000000000..320be7d4da --- /dev/null +++ b/lib/tdb2/test/run-83-openhook.c @@ -0,0 +1,98 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdarg.h> +#include <err.h> +#include "external-agent.h" +#include "logging.h" + +static enum TDB_ERROR clear_if_first(int fd, void *arg) +{ +/* We hold a lock offset 63 always, so we can tell if anyone is holding it. */ +	struct flock fl; + +	if (arg != clear_if_first) +		return TDB_ERR_CORRUPT; + +	fl.l_type = F_WRLCK; +	fl.l_whence = SEEK_SET; +	fl.l_start = 63; +	fl.l_len = 1; + +	if (fcntl(fd, F_SETLK, &fl) == 0) { +		/* We must be first ones to open it! 
*/ +		diag("truncating file!"); +		if (ftruncate(fd, 0) != 0) { +			return TDB_ERR_IO; +		} +	} +	fl.l_type = F_RDLCK; +	if (fcntl(fd, F_SETLKW, &fl) != 0) { +		return TDB_ERR_IO; +	} +	return TDB_SUCCESS; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	struct agent *agent; +	union tdb_attribute cif; +	struct tdb_data key = tdb_mkdata("key", 3); +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; + +	cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK; +	cif.openhook.base.next = &tap_log_attr; +	cif.openhook.fn = clear_if_first; +	cif.openhook.data = clear_if_first; + +	agent = prepare_external_agent(); +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 13); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		/* Create it */ +		tdb = tdb_open("run-83-openhook.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL); +		ok1(tdb); +		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0); +		tdb_close(tdb); + +		/* Now, open with CIF, should clear it. */ +		tdb = tdb_open("run-83-openhook.tdb", flags[i], +			       O_RDWR, 0, &cif); +		ok1(tdb); +		ok1(!tdb_exists(tdb, key)); +		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0); + +		/* Agent should not clear it, since it's still open. */ +		ok1(external_agent_operation(agent, OPEN_WITH_HOOK, +					     "run-83-openhook.tdb") == SUCCESS); +		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS); +		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS); + +		/* Still exists for us too. */ +		ok1(tdb_exists(tdb, key)); + +		/* Close it, now agent should clear it. 
*/ +		tdb_close(tdb); + +		ok1(external_agent_operation(agent, OPEN_WITH_HOOK, +					     "run-83-openhook.tdb") == SUCCESS); +		ok1(external_agent_operation(agent, FETCH, "key") == FAILED); +		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS); + +		ok1(tap_log_messages == 0); +	} + +	free_external_agent(agent); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-90-get-set-attributes.c b/lib/tdb2/test/run-90-get-set-attributes.c new file mode 100644 index 0000000000..159d8a01ea --- /dev/null +++ b/lib/tdb2/test/run-90-get-set-attributes.c @@ -0,0 +1,165 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag, +		  void *unused) +{ +	return 0; +} + +static int myunlock(int fd, int rw, off_t off, off_t len, void *unused) +{ +	return 0; +} + +static uint64_t hash_fn(const void *key, size_t len, uint64_t seed, +			void *priv) +{ +	return 0; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; +	union tdb_attribute seed_attr; +	union tdb_attribute hash_attr; +	union tdb_attribute lock_attr; + +	hash_attr.base.attr = TDB_ATTRIBUTE_HASH; +	hash_attr.base.next = &seed_attr; +	hash_attr.hash.fn = hash_fn; +	hash_attr.hash.data = &hash_attr; + +	seed_attr.base.attr = TDB_ATTRIBUTE_SEED; +	seed_attr.base.next = &lock_attr; +	seed_attr.seed.seed = 100; + +	lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK; +	lock_attr.base.next = &tap_log_attr; +	lock_attr.flock.lock = mylock; +	lock_attr.flock.unlock = myunlock; +	lock_attr.flock.data = &lock_attr; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 50); + +	for (i = 0; 
i < sizeof(flags) / sizeof(flags[0]); i++) { +		union tdb_attribute attr; + +		/* First open with no attributes. */ +		tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL); +		ok1(tdb); + +		/* Get log on no attributes will fail */ +		attr.base.attr = TDB_ATTRIBUTE_LOG; +		ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_NOEXIST); +		/* These always work. */ +		attr.base.attr = TDB_ATTRIBUTE_HASH; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_HASH); +		ok1(attr.hash.fn == jenkins_hash); +		attr.base.attr = TDB_ATTRIBUTE_FLOCK; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK); +		ok1(attr.flock.lock == tdb_fcntl_lock); +		ok1(attr.flock.unlock == tdb_fcntl_unlock); +		attr.base.attr = TDB_ATTRIBUTE_SEED; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_SEED); +		/* This is possible, just astronomically unlikely. */ +		ok1(attr.seed.seed != 0); + +		/* Unset attributes. */ +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG); +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); + +		/* Set them. */ +		ok1(tdb_set_attribute(tdb, &tap_log_attr) == 0); +		ok1(tdb_set_attribute(tdb, &lock_attr) == 0); +		/* These should fail. */ +		ok1(tdb_set_attribute(tdb, &seed_attr) == TDB_ERR_EINVAL); +		ok1(tap_log_messages == 1); +		ok1(tdb_set_attribute(tdb, &hash_attr) == TDB_ERR_EINVAL); +		ok1(tap_log_messages == 2); +		tap_log_messages = 0; + +		/* Getting them should work as expected. 
*/ +		attr.base.attr = TDB_ATTRIBUTE_LOG; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_LOG); +		ok1(attr.log.fn == tap_log_attr.log.fn); +		ok1(attr.log.data == tap_log_attr.log.data); + +		attr.base.attr = TDB_ATTRIBUTE_FLOCK; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK); +		ok1(attr.flock.lock == mylock); +		ok1(attr.flock.unlock == myunlock); +		ok1(attr.flock.data == &lock_attr); + +		/* Unset them again. */ +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); +		ok1(tap_log_messages == 0); +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG); +		ok1(tap_log_messages == 0); + +		tdb_close(tdb); +		ok1(tap_log_messages == 0); + +		/* Now open with all attributes. */ +		tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hash_attr); +		ok1(tdb); + +		/* Get will succeed */ +		attr.base.attr = TDB_ATTRIBUTE_LOG; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_LOG); +		ok1(attr.log.fn == tap_log_attr.log.fn); +		ok1(attr.log.data == tap_log_attr.log.data); + +		attr.base.attr = TDB_ATTRIBUTE_HASH; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_HASH); +		ok1(attr.hash.fn == hash_fn); +		ok1(attr.hash.data == &hash_attr); + +		attr.base.attr = TDB_ATTRIBUTE_FLOCK; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK); +		ok1(attr.flock.lock == mylock); +		ok1(attr.flock.unlock == myunlock); +		ok1(attr.flock.data == &lock_attr); + +		attr.base.attr = TDB_ATTRIBUTE_SEED; +		ok1(tdb_get_attribute(tdb, &attr) == 0); +		ok1(attr.base.attr == TDB_ATTRIBUTE_SEED); +		ok1(attr.seed.seed == seed_attr.seed.seed); + +		/* Unset attributes. 
*/ +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_HASH); +		ok1(tap_log_messages == 1); +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_SEED); +		ok1(tap_log_messages == 2); +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG); +		ok1(tap_log_messages == 2); +		tap_log_messages = 0; + +		tdb_close(tdb); + +	} +	return exit_status(); +} diff --git a/lib/tdb2/test/run-91-get-stats.c b/lib/tdb2/test/run-91-get-stats.c new file mode 100644 index 0000000000..795dfd6602 --- /dev/null +++ b/lib/tdb2/test/run-91-get-stats.c @@ -0,0 +1,59 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11); + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		union tdb_attribute *attr; +		struct tdb_data key = tdb_mkdata("key", 3); + +		tdb = tdb_open("run-91-get-stats.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0); + +		/* Use malloc so valgrind will catch overruns. */ +		attr = malloc(sizeof *attr); +		attr->stats.base.attr = TDB_ATTRIBUTE_STATS; +		attr->stats.size = sizeof(*attr); + +		ok1(tdb_get_attribute(tdb, attr) == 0); +		ok1(attr->stats.size == sizeof(*attr)); +		ok1(attr->stats.allocs > 0); +		ok1(attr->stats.expands > 0); +		ok1(attr->stats.locks > 0); +		free(attr); + +		/* Try short one. 
*/ +		attr = malloc(offsetof(struct tdb_attribute_stats, allocs) +			      + sizeof(attr->stats.allocs)); +		attr->stats.base.attr = TDB_ATTRIBUTE_STATS; +		attr->stats.size = offsetof(struct tdb_attribute_stats, allocs) +			+ sizeof(attr->stats.allocs); +		ok1(tdb_get_attribute(tdb, attr) == 0); +		ok1(attr->stats.size == sizeof(*attr)); +		ok1(attr->stats.allocs > 0); +		free(attr); +		ok1(tap_log_messages == 0); + +		tdb_close(tdb); + +	} +	return exit_status(); +} diff --git a/lib/tdb2/test/run-add-remove-flags.c b/lib/tdb2/test/run-add-remove-flags.c new file mode 100644 index 0000000000..1dc8463662 --- /dev/null +++ b/lib/tdb2/test/run-add-remove-flags.c @@ -0,0 +1,93 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(87); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-add-remove-flags.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		ok1(tdb_get_flags(tdb) == tdb->flags); +		tap_log_messages = 0; +		tdb_add_flag(tdb, TDB_NOLOCK); +		if (flags[i] & TDB_INTERNAL) +			ok1(tap_log_messages == 1); +		else { +			ok1(tap_log_messages == 0); +			ok1(tdb_get_flags(tdb) & TDB_NOLOCK); +		} + +		tap_log_messages = 0; +		tdb_add_flag(tdb, TDB_NOMMAP); +		if (flags[i] & TDB_INTERNAL) +			ok1(tap_log_messages == 1); +		else { +			ok1(tap_log_messages == 0); +			ok1(tdb_get_flags(tdb) & TDB_NOMMAP); +			ok1(tdb->file->map_ptr == NULL); +		} + +		tap_log_messages = 0; +		
tdb_add_flag(tdb, TDB_NOSYNC); +		if (flags[i] & TDB_INTERNAL) +			ok1(tap_log_messages == 1); +		else { +			ok1(tap_log_messages == 0); +			ok1(tdb_get_flags(tdb) & TDB_NOSYNC); +		} + +		ok1(tdb_get_flags(tdb) == tdb->flags); + +		tap_log_messages = 0; +		tdb_remove_flag(tdb, TDB_NOLOCK); +		if (flags[i] & TDB_INTERNAL) +			ok1(tap_log_messages == 1); +		else { +			ok1(tap_log_messages == 0); +			ok1(!(tdb_get_flags(tdb) & TDB_NOLOCK)); +		} + +		tap_log_messages = 0; +		tdb_remove_flag(tdb, TDB_NOMMAP); +		if (flags[i] & TDB_INTERNAL) +			ok1(tap_log_messages == 1); +		else { +			ok1(tap_log_messages == 0); +			ok1(!(tdb_get_flags(tdb) & TDB_NOMMAP)); +			ok1(tdb->file->map_ptr != NULL); +		} + +		tap_log_messages = 0; +		tdb_remove_flag(tdb, TDB_NOSYNC); +		if (flags[i] & TDB_INTERNAL) +			ok1(tap_log_messages == 1); +		else { +			ok1(tap_log_messages == 0); +			ok1(!(tdb_get_flags(tdb) & TDB_NOSYNC)); +		} + +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-check-callback.c b/lib/tdb2/test/run-check-callback.c new file mode 100644 index 0000000000..1e87436717 --- /dev/null +++ b/lib/tdb2/test/run-check-callback.c @@ -0,0 +1,90 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/open.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +#define NUM_RECORDS 1000 + +static bool store_records(struct tdb_context *tdb) +{ +	int i; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; + +	for (i = 0; i < NUM_RECORDS; i++) +		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +			return false; +	return true; +} + +static enum TDB_ERROR check(struct tdb_data key, +			    struct tdb_data data, +			    bool *array) +{ +	int val; + +	if 
(key.dsize != sizeof(val)) { +		diag("Wrong key size: %u\n", key.dsize); +		return TDB_ERR_CORRUPT; +	} + +	if (key.dsize != data.dsize +	    || memcmp(key.dptr, data.dptr, sizeof(val)) != 0) { +		diag("Key and data differ\n"); +		return TDB_ERR_CORRUPT; +	} + +	memcpy(&val, key.dptr, sizeof(val)); +	if (val >= NUM_RECORDS || val < 0) { +		diag("check value %i\n", val); +		return TDB_ERR_CORRUPT; +	} + +	if (array[val]) { +		diag("Value %i already seen\n", val); +		return TDB_ERR_CORRUPT; +	} + +	array[val] = true; +	return TDB_SUCCESS; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		bool array[NUM_RECORDS]; + +		tdb = tdb_open("run-check-callback.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		ok1(store_records(tdb)); +		for (j = 0; j < NUM_RECORDS; j++) +			array[j] = false; +		ok1(tdb_check(tdb, check, array) == TDB_SUCCESS); +		for (j = 0; j < NUM_RECORDS; j++) +			if (!array[j]) +				break; +		ok1(j == NUM_RECORDS); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-expand-in-transaction.c b/lib/tdb2/test/run-expand-in-transaction.c new file mode 100644 index 0000000000..49ba03c924 --- /dev/null +++ b/lib/tdb2/test/run-expand-in-transaction.c @@ -0,0 +1,45 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { 
TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, +			TDB_CONVERT|TDB_NOSYNC, +			TDB_NOMMAP|TDB_CONVERT|TDB_NOSYNC }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4); + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		size_t size; +		tdb = tdb_open("run-expand-in-transaction.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		size = tdb->file->map_size; +		ok1(tdb_transaction_start(tdb) == 0); +		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +		ok1(tdb->file->map_size > size); +		ok1(tdb_transaction_commit(tdb) == 0); +		ok1(tdb->file->map_size > size); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-features.c b/lib/tdb2/test/run-features.c new file mode 100644 index 0000000000..6d82dc308c --- /dev/null +++ b/lib/tdb2/test/run-features.c @@ -0,0 +1,70 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/summary.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; +	struct tdb_data data = { (unsigned char *)&j, sizeof(j) }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		uint64_t features; +		tdb = tdb_open("run-features.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* Put some stuff in 
there. */ +		for (j = 0; j < 100; j++) { +			if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +				fail("Storing in tdb"); +		} + +		/* Mess with features fields in hdr. */ +		features = (~TDB_FEATURE_MASK ^ 1); +		ok1(tdb_write_convert(tdb, offsetof(struct tdb_header, +						    features_used), +				      &features, sizeof(features)) == 0); +		ok1(tdb_write_convert(tdb, offsetof(struct tdb_header, +						    features_offered), +				      &features, sizeof(features)) == 0); +		tdb_close(tdb); + +		tdb = tdb_open("run-features.tdb", flags[i], O_RDWR, 0, +			       &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* Should not have changed features offered. */ +		ok1(tdb_read_convert(tdb, offsetof(struct tdb_header, +						   features_offered), +				     &features, sizeof(features)) == 0); +		ok1(features == (~TDB_FEATURE_MASK ^ 1)); + +		/* Should have cleared unknown bits in features_used. */ +		ok1(tdb_read_convert(tdb, offsetof(struct tdb_header, +						   features_used), +				     &features, sizeof(features)) == 0); +		ok1(features == (1 & TDB_FEATURE_MASK)); + +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-firstkey-nextkey.c b/lib/tdb2/test/run-firstkey-nextkey.c new file mode 100644 index 0000000000..65a6090a96 --- /dev/null +++ b/lib/tdb2/test/run-firstkey-nextkey.c @@ -0,0 +1,162 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +#define NUM_RECORDS 1000 + +static bool store_records(struct tdb_context *tdb) +{ +	int i; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; + +	for (i = 0; i < NUM_RECORDS; i++) +		if (tdb_store(tdb, key, data, 
TDB_REPLACE) != 0) +			return false; +	return true; +} + +struct trav_data { +	unsigned int records[NUM_RECORDS]; +	unsigned int calls; +}; + +static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p) +{ +	struct trav_data *td = p; +	int val; + +	memcpy(&val, dbuf.dptr, dbuf.dsize); +	td->records[td->calls++] = val; +	return 0; +} + +/* Since tdb_nextkey frees dptr, we need to clone it. */ +static TDB_DATA dup_key(TDB_DATA key) +{ +	void *p = malloc(key.dsize); +	memcpy(p, key.dptr, key.dsize); +	key.dptr = p; +	return key; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	int num; +	struct trav_data td; +	TDB_DATA k; +	struct tdb_context *tdb; +	union tdb_attribute seed_attr; +	enum TDB_ERROR ecode; + +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	seed_attr.base.attr = TDB_ATTRIBUTE_SEED; +	seed_attr.base.next = &tap_log_attr; +	seed_attr.seed.seed = 6334326220117065685ULL; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) +		   * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-traverse.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &seed_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		ok1(tdb_firstkey(tdb, &k) == TDB_ERR_NOEXIST); + +		/* One entry... */ +		k.dptr = (unsigned char *)&num; +		k.dsize = sizeof(num); +		num = 0; +		ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0); +		ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS); +		ok1(k.dsize == sizeof(num)); +		ok1(memcmp(k.dptr, &num, sizeof(num)) == 0); +		ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST); + +		/* Two entries. 
*/ +		k.dptr = (unsigned char *)&num; +		k.dsize = sizeof(num); +		num = 1; +		ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0); +		ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS); +		ok1(k.dsize == sizeof(num)); +		memcpy(&num, k.dptr, sizeof(num)); +		ok1(num == 0 || num == 1); +		ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS); +		ok1(k.dsize == sizeof(j)); +		memcpy(&j, k.dptr, sizeof(j)); +		ok1(j == 0 || j == 1); +		ok1(j != num); +		ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST); + +		/* Clean up. */ +		k.dptr = (unsigned char *)&num; +		k.dsize = sizeof(num); +		num = 0; +		ok1(tdb_delete(tdb, k) == 0); +		num = 1; +		ok1(tdb_delete(tdb, k) == 0); + +		/* Now lots of records. */ +		ok1(store_records(tdb)); +		td.calls = 0; + +		num = tdb_traverse(tdb, trav, &td); +		ok1(num == NUM_RECORDS); +		ok1(td.calls == NUM_RECORDS); + +		/* Simple loop should match tdb_traverse */ +		for (j = 0, ecode = tdb_firstkey(tdb, &k); j < td.calls; j++) { +			int val; + +			ok1(ecode == TDB_SUCCESS); +			ok1(k.dsize == sizeof(val)); +			memcpy(&val, k.dptr, k.dsize); +			ok1(td.records[j] == val); +			ecode = tdb_nextkey(tdb, &k); +		} + +		/* But arbitrary orderings should work too. */ +		for (j = td.calls-1; j > 0; j--) { +			k.dptr = (unsigned char *)&td.records[j-1]; +			k.dsize = sizeof(td.records[j-1]); +			k = dup_key(k); +			ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS); +			ok1(k.dsize == sizeof(td.records[j])); +			ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0); +			free(k.dptr); +		} + +		/* Even delete should work. 
*/ +		for (j = 0, ecode = tdb_firstkey(tdb, &k); +		     ecode != TDB_ERR_NOEXIST; +		     j++) { +			ok1(ecode == TDB_SUCCESS); +			ok1(k.dsize == 4); +			ok1(tdb_delete(tdb, k) == 0); +			ecode = tdb_nextkey(tdb, &k); +		} + +		diag("delete using first/nextkey gave %u of %u records", +		     j, NUM_RECORDS); +		ok1(j == NUM_RECORDS); +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-fork-test.c b/lib/tdb2/test/run-fork-test.c new file mode 100644 index 0000000000..e9813e0a0f --- /dev/null +++ b/lib/tdb2/test/run-fork-test.c @@ -0,0 +1,180 @@ +/* Test forking while holding lock. + * + * There are only five ways to do this currently: + * (1) grab a tdb_chainlock, then fork. + * (2) grab a tdb_lockall, then fork. + * (3) grab a tdb_lockall_read, then fork. + * (4) start a transaction, then fork. + * (5) fork from inside a tdb_parse() callback. + * + * Note that we don't hold a lock across tdb_traverse callbacks, so + * that doesn't matter. + */ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include <sys/types.h> +#include <sys/wait.h> +#include "logging.h" + +static enum TDB_ERROR fork_in_parse(TDB_DATA key, TDB_DATA data, +				    struct tdb_context *tdb) +{ +	int status; + +	if (fork() == 0) { +		/* We expect this to fail. 
*/ +		if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) +			exit(1); + +		if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) +			exit(1); + +		if (tap_log_messages != 2) +			exit(2); + +		tdb_close(tdb); +		if (tap_log_messages != 2) +			exit(3); +		exit(0); +	} +	wait(&status); +	ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); +	return TDB_SUCCESS; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4); + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		int status; + +		tap_log_messages = 0; + +		tdb = tdb_open("run-fork-test.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			continue; + +		/* Put a record in here. */ +		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_SUCCESS); + +		ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS); +		if (fork() == 0) { +			/* We expect this to fail. */ +			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) +				return 1; + +			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) +				return 1; + +			if (tap_log_messages != 2) +				return 2; + +			tdb_chainunlock(tdb, key); +			if (tap_log_messages != 3) +				return 3; +			tdb_close(tdb); +			if (tap_log_messages != 3) +				return 4; +			return 0; +		} +		wait(&status); +		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); +		tdb_chainunlock(tdb, key); + +		ok1(tdb_lockall(tdb) == TDB_SUCCESS); +		if (fork() == 0) { +			/* We expect this to fail. 
*/ +			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) +				return 1; + +			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) +				return 1; + +			if (tap_log_messages != 2) +				return 2; + +			tdb_unlockall(tdb); +			if (tap_log_messages != 2) +				return 3; +			tdb_close(tdb); +			if (tap_log_messages != 2) +				return 4; +			return 0; +		} +		wait(&status); +		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); +		tdb_unlockall(tdb); + +		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS); +		if (fork() == 0) { +			/* We expect this to fail. */ +			/* This would always fail anyway... */ +			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) +				return 1; + +			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) +				return 1; + +			if (tap_log_messages != 2) +				return 2; + +			tdb_unlockall_read(tdb); +			if (tap_log_messages != 2) +				return 3; +			tdb_close(tdb); +			if (tap_log_messages != 2) +				return 4; +			return 0; +		} +		wait(&status); +		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); +		tdb_unlockall_read(tdb); + +		ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); +		/* If transactions is empty, noop "commit" succeeds. */ +		ok1(tdb_delete(tdb, key) == TDB_SUCCESS); +		if (fork() == 0) { +			/* We expect this to fail. 
*/ +			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) +				return 1; + +			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) +				return 1; + +			if (tap_log_messages != 2) +				return 2; + +			if (tdb_transaction_commit(tdb) != TDB_ERR_LOCK) +				return 3; + +			tdb_close(tdb); +			if (tap_log_messages < 3) +				return 4; +			return 0; +		} +		wait(&status); +		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); +		tdb_transaction_cancel(tdb); + +		ok1(tdb_parse_record(tdb, key, fork_in_parse, tdb) +		    == TDB_SUCCESS); +		tdb_close(tdb); +		ok1(tap_log_messages == 0); +	} +	return exit_status(); +} diff --git a/lib/tdb2/test/run-lockall.c b/lib/tdb2/test/run-lockall.c new file mode 100644 index 0000000000..4aedf59743 --- /dev/null +++ b/lib/tdb2/test/run-lockall.c @@ -0,0 +1,80 @@ +#include "config.h" +#include <unistd.h> +#include "lock-tracking.h" + +#define fcntl fcntl_with_lockcheck + +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdarg.h> +#include <err.h> +#include "external-agent.h" +#include "logging.h" + +#define TEST_DBNAME "run-lockall.tdb" + +#undef fcntl + +int main(int argc, char *argv[]) +{ +	struct agent *agent; +	const int flags[] = { TDB_DEFAULT, +			      TDB_NOMMAP, +			      TDB_CONVERT, +			      TDB_CONVERT | TDB_NOMMAP }; +	int i; + +	plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1); +	agent = prepare_external_agent(); +	if (!agent) +		err(1, "preparing agent"); + +	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) { +		enum agent_return ret; +		struct tdb_context *tdb; + +		tdb = tdb_open(TEST_DBNAME, flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); + +		ret = external_agent_operation(agent, OPEN, TEST_DBNAME); +		ok1(ret 
== SUCCESS); + +		ok1(tdb_lockall(tdb) == TDB_SUCCESS); +		ok1(external_agent_operation(agent, STORE, "key") +		    == WOULD_HAVE_BLOCKED); +		ok1(external_agent_operation(agent, FETCH, "key") +		    == WOULD_HAVE_BLOCKED); +		/* Test nesting. */ +		ok1(tdb_lockall(tdb) == TDB_SUCCESS); +		tdb_unlockall(tdb); +		tdb_unlockall(tdb); + +		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS); + +		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS); +		ok1(external_agent_operation(agent, STORE, "key") +		    == WOULD_HAVE_BLOCKED); +		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS); +		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS); +		tdb_unlockall_read(tdb); +		tdb_unlockall_read(tdb); + +		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS); +		ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS); +		tdb_close(tdb); +	} + +	free_external_agent(agent); +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-locktimeout.c b/lib/tdb2/test/run-locktimeout.c new file mode 100644 index 0000000000..bb5b5db29b --- /dev/null +++ b/lib/tdb2/test/run-locktimeout.c @@ -0,0 +1,192 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tdb2/check.c> +#include <ccan/tap/tap.h> +#include "logging.h" +#include "external-agent.h" + +#undef alarm +#define alarm fast_alarm + +/* Speed things up by doing things in milliseconds. 
*/ +static unsigned int fast_alarm(unsigned int milli_seconds) +{ +	struct itimerval it; + +	it.it_interval.tv_sec = it.it_interval.tv_usec = 0; +	it.it_value.tv_sec = milli_seconds / 1000; +	it.it_value.tv_usec = milli_seconds * 1000; +	setitimer(ITIMER_REAL, &it, NULL); +	return 0; +} + +#define CatchSignal(sig, handler) signal((sig), (handler)) + +static void do_nothing(int signum) +{ +} + +/* This example code is taken from SAMBA, so try not to change it. */ +static struct flock flock_struct; + +/* Return a value which is none of v1, v2 or v3. */ +static inline short int invalid_value(short int v1, short int v2, short int v3) +{ +	short int try = (v1+v2+v3)^((v1+v2+v3) << 16); +	while (try == v1 || try == v2 || try == v3) +		try++; +	return try; +} + +/* We invalidate in as many ways as we can, so the OS rejects it */ +static void invalidate_flock_struct(int signum) +{ +	flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK); +	flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END); +	flock_struct.l_start = -1; +	/* A large negative. */ +	flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1); +} + +static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag, +			void *_timeout) +{ +	int ret, saved_errno = errno; +	unsigned int timeout = *(unsigned int *)_timeout; + +	flock_struct.l_type = rw; +	flock_struct.l_whence = SEEK_SET; +	flock_struct.l_start = off; +	flock_struct.l_len = len; + +	CatchSignal(SIGALRM, invalidate_flock_struct); +	alarm(timeout); + +	for (;;) { +		if (waitflag) +			ret = fcntl(fd, F_SETLKW, &flock_struct); +		else +			ret = fcntl(fd, F_SETLK, &flock_struct); + +		if (ret == 0) +			break; + +		/* Not signalled?  Something else went wrong. 
*/ +		if (flock_struct.l_len == len) { +			if (errno == EAGAIN || errno == EINTR) +				continue; +			saved_errno = errno; +			break; +		} else { +			saved_errno = EINTR; +			break; +		} +	} + +	alarm(0); +	errno = saved_errno; +	return ret; +} + +static int tdb_chainlock_with_timeout_internal(struct tdb_context *tdb, +					       TDB_DATA key, +					       unsigned int timeout, +					       int rw_type) +{ +	union tdb_attribute locking; +	enum TDB_ERROR ecode; + +	if (timeout) { +		locking.base.attr = TDB_ATTRIBUTE_FLOCK; +		ecode = tdb_get_attribute(tdb, &locking); +		if (ecode != TDB_SUCCESS) +			return ecode; + +		/* Replace locking function with our own. */ +		locking.flock.data = &timeout; +		locking.flock.lock = timeout_lock; + +		ecode = tdb_set_attribute(tdb, &locking); +		if (ecode != TDB_SUCCESS) +			return ecode; +	} +	if (rw_type == F_RDLCK) +		ecode = tdb_chainlock_read(tdb, key); +	else +		ecode = tdb_chainlock(tdb, key); + +	if (timeout) { +		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); +	} +	return ecode; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	TDB_DATA key = tdb_mkdata("hello", 5); +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; +	struct agent *agent; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15); + +	agent = prepare_external_agent(); + +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		enum TDB_ERROR ecode; +		tdb = tdb_open("run-locktimeout.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		if (!ok1(tdb)) +			break; + +		/* Simple cases: should succeed. 
*/ +		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, +							    F_RDLCK); +		ok1(ecode == TDB_SUCCESS); +		ok1(tap_log_messages == 0); + +		tdb_chainunlock_read(tdb, key); +		ok1(tap_log_messages == 0); + +		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, +							    F_WRLCK); +		ok1(ecode == TDB_SUCCESS); +		ok1(tap_log_messages == 0); + +		tdb_chainunlock(tdb, key); +		ok1(tap_log_messages == 0); + +		/* OK, get agent to start transaction, then we should time out. */ +		ok1(external_agent_operation(agent, OPEN, "run-locktimeout.tdb") +		    == SUCCESS); +		ok1(external_agent_operation(agent, TRANSACTION_START, "") +		    == SUCCESS); +		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, +							    F_WRLCK); +		ok1(ecode == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); + +		/* Even if we get a different signal, should be fine. */ +		CatchSignal(SIGUSR1, do_nothing); +		external_agent_operation(agent, SEND_SIGNAL, ""); +		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, +							    F_WRLCK); +		ok1(ecode == TDB_ERR_LOCK); +		ok1(tap_log_messages == 0); + +		ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "") +		    == SUCCESS); +		ok1(external_agent_operation(agent, CLOSE, "") +		    == SUCCESS); +		tdb_close(tdb); +	} +	free_external_agent(agent); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-missing-entries.c b/lib/tdb2/test/run-missing-entries.c new file mode 100644 index 0000000000..e99572f64c --- /dev/null +++ b/lib/tdb2/test/run-missing-entries.c @@ -0,0 +1,48 @@ +/* Another test revealed that we lost an entry.  This reproduces it. 
*/ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +#define NUM_RECORDS 1189 + +/* We use the same seed which we saw this failure on. */ +static uint64_t failhash(const void *key, size_t len, uint64_t seed, void *p) +{ +	seed = 699537674708983027ULL; +	return hash64_stable((const unsigned char *)key, len, seed); +} + +int main(int argc, char *argv[]) +{ +	int i; +	struct tdb_context *tdb; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; +	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +						.fn = failhash } }; + +	hattr.base.next = &tap_log_attr; +	plan_tests(1 + 2 * NUM_RECORDS + 1); + +	tdb = tdb_open("run-missing-entries.tdb", TDB_INTERNAL, +		       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); +	ok1(tdb); +	if (tdb) { +		for (i = 0; i < NUM_RECORDS; i++) { +			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +		} +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-open-multiple-times.c b/lib/tdb2/test/run-open-multiple-times.c new file mode 100644 index 0000000000..240828df16 --- /dev/null +++ b/lib/tdb2/test/run-open-multiple-times.c @@ -0,0 +1,84 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb, *tdb2; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data 
data = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ +	int flags[] = { TDB_DEFAULT, TDB_NOMMAP, +			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 28); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-open-multiple-times.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; +		tdb2 = tdb_open("run-open-multiple-times.tdb", flags[i], +				O_RDWR|O_CREAT, 0600, &tap_log_attr); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tdb_check(tdb2, NULL, NULL) == 0); + +		/* Store in one, fetch in the other. */ +		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); +		ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS); +		ok1(tdb_deq(d, data)); +		free(d.dptr); + +		/* Vice versa, with delete. */ +		ok1(tdb_delete(tdb2, key) == 0); +		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST); + +		/* OK, now close first one, check second still good. */ +		ok1(tdb_close(tdb) == 0); + +		ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == 0); +		ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS); +		ok1(tdb_deq(d, data)); +		free(d.dptr); + +		/* Reopen */ +		tdb = tdb_open("run-open-multiple-times.tdb", flags[i], +			       O_RDWR|O_CREAT, 0600, &tap_log_attr); +		ok1(tdb); + +		ok1(tdb_transaction_start(tdb2) == 0); + +		/* Anything in the other one should fail. */ +		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 1); +		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 2); +		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 3); +		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); +		ok1(tap_log_messages == 4); + +		/* Transaction should work as normal. */ +		ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == TDB_SUCCESS); + +		/* Now... try closing with locks held. 
*/ +		ok1(tdb_close(tdb2) == 0); + +		ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); +		ok1(tdb_deq(d, data)); +		free(d.dptr); +		ok1(tdb_close(tdb) == 0); +		ok1(tap_log_messages == 4); +		tap_log_messages = 0; +	} + +	return exit_status(); +} diff --git a/lib/tdb2/test/run-record-expand.c b/lib/tdb2/test/run-record-expand.c new file mode 100644 index 0000000000..109a099278 --- /dev/null +++ b/lib/tdb2/test/run-record-expand.c @@ -0,0 +1,53 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +#define MAX_SIZE 10000 +#define SIZE_STEP 131 + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data; + +	data.dptr = malloc(MAX_SIZE); +	memset(data.dptr, 0x24, MAX_SIZE); + +	plan_tests(sizeof(flags) / sizeof(flags[0]) +		   * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-record-expand.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		data.dsize = 0; +		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		for (data.dsize = 0; +		     data.dsize < MAX_SIZE; +		     data.dsize += SIZE_STEP) { +			memset(data.dptr, data.dsize, data.dsize); +			ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +		} +		tdb_close(tdb); +	} +	ok1(tap_log_messages == 0); +	free(data.dptr); + +	return exit_status(); +} diff --git a/lib/tdb2/test/run-remap-in-read_traverse.c b/lib/tdb2/test/run-remap-in-read_traverse.c 
new file mode 100644 index 0000000000..d784ca3407 --- /dev/null +++ b/lib/tdb2/test/run-remap-in-read_traverse.c @@ -0,0 +1,65 @@ +/* We had a bug where we marked the tdb read-only for a tdb_traverse_read. + * If we then expanded the tdb, we would remap read-only, and later SEGV. */ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "external-agent.h" +#include "logging.h" + +static bool file_larger(int fd, tdb_len_t size) +{ +	struct stat st; + +	fstat(fd, &st); +	return st.st_size != size; +} + +static unsigned add_records_to_grow(struct agent *agent, int fd, tdb_len_t size) +{ +	unsigned int i; + +	for (i = 0; !file_larger(fd, size); i++) { +		char data[20]; +		sprintf(data, "%i", i); +		if (external_agent_operation(agent, STORE, data) != SUCCESS) +			return 0; +	} +	diag("Added %u records to grow file", i); +	return i; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct agent *agent; +	struct tdb_context *tdb; +	struct tdb_data d = tdb_mkdata("hello", 5); +	const char filename[] = "run-remap-in-read_traverse.tdb"; + +	plan_tests(4); + +	agent = prepare_external_agent(); + +	tdb = tdb_open(filename, TDB_DEFAULT, +		       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); + +	ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS); +	i = add_records_to_grow(agent, tdb->file->fd, tdb->file->map_size); + +	/* Do a traverse. */ +	ok1(tdb_traverse(tdb, NULL, NULL) == i); + +	/* Now store something! 
*/ +	ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0); +	ok1(tap_log_messages == 0); +	tdb_close(tdb); +	free_external_agent(agent); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-seed.c b/lib/tdb2/test/run-seed.c new file mode 100644 index 0000000000..a9b370b6e5 --- /dev/null +++ b/lib/tdb2/test/run-seed.c @@ -0,0 +1,67 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +static int log_count = 0; + +/* Normally we get a log when setting random seed. */ +static void my_log_fn(struct tdb_context *tdb, +		      enum tdb_log_level level, +		      const char *message, void *priv) +{ +	log_count++; +} + +static union tdb_attribute log_attr = { +	.log = { .base = { .attr = TDB_ATTRIBUTE_LOG }, +		 .fn = my_log_fn } +}; + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	union tdb_attribute attr; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; + +	attr.seed.base.attr = TDB_ATTRIBUTE_SEED; +	attr.seed.base.next = &log_attr; +	attr.seed.seed = 42; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		struct tdb_header hdr; +		int fd; +		tdb = tdb_open("run-seed.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &attr); +		ok1(tdb); +		if (!tdb) +			continue; +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tdb->hash_seed == 42); +		ok1(log_count == 0); +		tdb_close(tdb); + +		if (flags[i] & TDB_INTERNAL) +			continue; + +		fd = open("run-seed.tdb", O_RDONLY); +		ok1(fd >= 0); +		ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr)); +		if (flags[i] & TDB_CONVERT) +			ok1(bswap_64(hdr.hash_seed) == 42); +		else +			ok1(hdr.hash_seed == 42); +		
close(fd); +	} +	return exit_status(); +} diff --git a/lib/tdb2/test/run-simple-delete.c b/lib/tdb2/test/run-simple-delete.c new file mode 100644 index 0000000000..d06bf2d2bd --- /dev/null +++ b/lib/tdb2/test/run-simple-delete.c @@ -0,0 +1,42 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = tdb_mkdata("key", 3); +	struct tdb_data data = tdb_mkdata("data", 4); + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-simple-delete.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (tdb) { +			/* Delete should fail. */ +			ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			/* Insert should succeed. */ +			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			/* Delete should now work. 
*/ +			ok1(tdb_delete(tdb, key) == 0); +			ok1(tdb_check(tdb, NULL, NULL) == 0); +			tdb_close(tdb); +		} +	} +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-summary.c b/lib/tdb2/test/run-summary.c new file mode 100644 index 0000000000..c92e759373 --- /dev/null +++ b/lib/tdb2/test/run-summary.c @@ -0,0 +1,60 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/summary.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +int main(int argc, char *argv[]) +{ +	unsigned int i, j; +	struct tdb_context *tdb; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; +	struct tdb_data data = { (unsigned char *)&j, sizeof(j) }; +	char *summary; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-summary.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); +		ok1(tdb); +		if (!tdb) +			continue; + +		/* Put some stuff in there. */ +		for (j = 0; j < 500; j++) { +			/* Make sure padding varies to we get some graphs! 
*/ +			data.dsize = j % (sizeof(j) + 1); +			if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +				fail("Storing in tdb"); +		} + +		for (j = 0; +		     j <= TDB_SUMMARY_HISTOGRAMS; +		     j += TDB_SUMMARY_HISTOGRAMS) { +			ok1(tdb_summary(tdb, j, &summary) == TDB_SUCCESS); +			ok1(strstr(summary, "Number of records: 500\n")); +			ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n")); +			ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n")); +			if (j == TDB_SUMMARY_HISTOGRAMS) +				ok1(strstr(summary, "|") +				    && strstr(summary, "*")); +			else +				ok1(!strstr(summary, "|") +				    && !strstr(summary, "*")); +			free(summary); +		} +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/test/run-tdb_errorstr.c b/lib/tdb2/test/run-tdb_errorstr.c new file mode 100644 index 0000000000..27bdfcd67c --- /dev/null +++ b/lib/tdb2/test/run-tdb_errorstr.c @@ -0,0 +1,59 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> + +int main(int argc, char *argv[]) +{ +	enum TDB_ERROR err; +	plan_tests(TDB_ERR_RDONLY*-1 + 2); + +	for (err = TDB_SUCCESS; err >= TDB_ERR_RDONLY; err--) { +		switch (err) { +		case TDB_SUCCESS: +			ok1(!strcmp(tdb_errorstr(err), +				    "Success")); +			break; +		case TDB_ERR_IO: +			ok1(!strcmp(tdb_errorstr(err), +				    "IO Error")); +			break; +		case TDB_ERR_LOCK: +			ok1(!strcmp(tdb_errorstr(err), +				    "Locking error")); +			break; +		case TDB_ERR_OOM: +			ok1(!strcmp(tdb_errorstr(err), +				    "Out of memory")); +			break; +		case TDB_ERR_EXISTS: +			ok1(!strcmp(tdb_errorstr(err), +				    "Record exists")); +			break; +		case TDB_ERR_EINVAL: +			ok1(!strcmp(tdb_errorstr(err), +				    "Invalid parameter")); +			break; +		case TDB_ERR_NOEXIST: +			
ok1(!strcmp(tdb_errorstr(err), +				    "Record does not exist")); +			break; +		case TDB_ERR_RDONLY: +			ok1(!strcmp(tdb_errorstr(err), +				    "write not permitted")); +			break; +		case TDB_ERR_CORRUPT: +			ok1(!strcmp(tdb_errorstr(err), +				    "Corrupt database")); +			break; +		} +	} +	ok1(!strcmp(tdb_errorstr(err), "Invalid error code")); + +	return exit_status(); +} diff --git a/lib/tdb2/test/run-traverse.c b/lib/tdb2/test/run-traverse.c new file mode 100644 index 0000000000..f973d95d0f --- /dev/null +++ b/lib/tdb2/test/run-traverse.c @@ -0,0 +1,211 @@ +#include <ccan/tdb2/tdb.c> +#include <ccan/tdb2/open.c> +#include <ccan/tdb2/free.c> +#include <ccan/tdb2/lock.c> +#include <ccan/tdb2/io.c> +#include <ccan/tdb2/hash.c> +#include <ccan/tdb2/check.c> +#include <ccan/tdb2/traverse.c> +#include <ccan/tdb2/transaction.c> +#include <ccan/tap/tap.h> +#include "logging.h" + +#define NUM_RECORDS 1000 + +/* We use the same seed which we saw a failure on. */ +static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p) +{ +	return hash64_stable((const unsigned char *)key, len, +			     *(uint64_t *)p); +} + +static bool store_records(struct tdb_context *tdb) +{ +	int i; +	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; +	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; + +	for (i = 0; i < NUM_RECORDS; i++) +		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) +			return false; +	return true; +} + +struct trav_data { +	unsigned int calls, call_limit; +	int low, high; +	bool mismatch; +	bool delete; +	enum TDB_ERROR delete_error; +}; + +static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, +		struct trav_data *td) +{ +	int val; + +	td->calls++; +	if (key.dsize != sizeof(val) || dbuf.dsize != sizeof(val) +	    || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) { +		td->mismatch = true; +		return -1; +	} +	memcpy(&val, dbuf.dptr, dbuf.dsize); +	if (val < td->low) +		td->low = val; +	if (val > td->high) +		td->high = val; 
+ +	if (td->delete) { +		td->delete_error = tdb_delete(tdb, key); +		if (td->delete_error != TDB_SUCCESS) { +			return -1; +		} +	} + +	if (td->calls == td->call_limit) +		return 1; +	return 0; +} + +struct trav_grow_data { +	unsigned int calls; +	unsigned int num_large; +	bool mismatch; +	enum TDB_ERROR error; +}; + +static int trav_grow(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, +		     struct trav_grow_data *tgd) +{ +	int val; +	unsigned char buffer[128] = { 0 }; + +	tgd->calls++; +	if (key.dsize != sizeof(val) || dbuf.dsize < sizeof(val) +	    || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) { +		tgd->mismatch = true; +		return -1; +	} + +	if (dbuf.dsize > sizeof(val)) +		/* We must have seen this before! */ +		tgd->num_large++; + +	/* Make a big difference to the database. */ +	dbuf.dptr = buffer; +	dbuf.dsize = sizeof(buffer); +	tgd->error = tdb_append(tdb, key, dbuf); +	if (tgd->error != TDB_SUCCESS) { +		return -1; +	} +	return 0; +} + +int main(int argc, char *argv[]) +{ +	unsigned int i; +	int num; +	struct trav_data td; +	struct trav_grow_data tgd; +	struct tdb_context *tdb; +	uint64_t seed = 16014841315512641303ULL; +	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, +			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, +			TDB_NOMMAP|TDB_CONVERT }; +	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, +						.fn = fixedhash, +						.data = &seed } }; + +	hattr.base.next = &tap_log_attr; + +	plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1); +	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { +		tdb = tdb_open("run-traverse.tdb", flags[i], +			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); +		ok1(tdb); +		if (!tdb) +			continue; + +		ok1(tdb_traverse(tdb, NULL, NULL) == 0); + +		ok1(store_records(tdb)); +		num = tdb_traverse(tdb, NULL, NULL); +		ok1(num == NUM_RECORDS); + +		/* Full traverse. 
*/ +		td.calls = 0; +		td.call_limit = UINT_MAX; +		td.low = INT_MAX; +		td.high = INT_MIN; +		td.mismatch = false; +		td.delete = false; + +		num = tdb_traverse(tdb, trav, &td); +		ok1(num == NUM_RECORDS); +		ok1(!td.mismatch); +		ok1(td.calls == NUM_RECORDS); +		ok1(td.low == 0); +		ok1(td.high == NUM_RECORDS-1); + +		/* Short traverse. */ +		td.calls = 0; +		td.call_limit = NUM_RECORDS / 2; +		td.low = INT_MAX; +		td.high = INT_MIN; +		td.mismatch = false; +		td.delete = false; + +		num = tdb_traverse(tdb, trav, &td); +		ok1(num == NUM_RECORDS / 2); +		ok1(!td.mismatch); +		ok1(td.calls == NUM_RECORDS / 2); +		ok1(td.low <= NUM_RECORDS / 2); +		ok1(td.high > NUM_RECORDS / 2); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tap_log_messages == 0); + +		/* Deleting traverse (delete everything). */ +		td.calls = 0; +		td.call_limit = UINT_MAX; +		td.low = INT_MAX; +		td.high = INT_MIN; +		td.mismatch = false; +		td.delete = true; +		td.delete_error = TDB_SUCCESS; +		num = tdb_traverse(tdb, trav, &td); +		ok1(num == NUM_RECORDS); +		ok1(td.delete_error == TDB_SUCCESS); +		ok1(!td.mismatch); +		ok1(td.calls == NUM_RECORDS); +		ok1(td.low == 0); +		ok1(td.high == NUM_RECORDS - 1); +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Now it's empty! */ +		ok1(tdb_traverse(tdb, NULL, NULL) == 0); + +		/* Re-add. */ +		ok1(store_records(tdb)); +		ok1(tdb_traverse(tdb, NULL, NULL) == NUM_RECORDS); +		ok1(tdb_check(tdb, NULL, NULL) == 0); + +		/* Grow.  This will cause us to be reshuffled. 
*/ +		tgd.calls = 0; +		tgd.num_large = 0; +		tgd.mismatch = false; +		tgd.error = TDB_SUCCESS; +		ok1(tdb_traverse(tdb, trav_grow, &tgd) > 1); +		ok1(tgd.error == 0); +		ok1(!tgd.mismatch); +		ok1(tdb_check(tdb, NULL, NULL) == 0); +		ok1(tgd.num_large < tgd.calls); +		diag("growing db: %u calls, %u repeats", +		     tgd.calls, tgd.num_large); + +		tdb_close(tdb); +	} + +	ok1(tap_log_messages == 0); +	return exit_status(); +} diff --git a/lib/tdb2/tools/Makefile b/lib/tdb2/tools/Makefile new file mode 100644 index 0000000000..11188c3baf --- /dev/null +++ b/lib/tdb2/tools/Makefile @@ -0,0 +1,16 @@ +OBJS:=../../tdb2.o ../../hash.o ../../tally.o +CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg +LDFLAGS:=-L../../.. + +default: tdb2torture tdb2tool tdb2dump tdb2restore mktdb2 speed growtdb-bench + +tdb2dump: tdb2dump.c $(OBJS) +tdb2restore: tdb2restore.c $(OBJS) +tdb2torture: tdb2torture.c $(OBJS) +tdb2tool: tdb2tool.c $(OBJS) +mktdb2: mktdb2.c $(OBJS) +speed: speed.c $(OBJS) +growtdb-bench: growtdb-bench.c $(OBJS) + +clean: +	rm -f tdb2torture tdb2dump tdb2restore tdb2tool mktdb2 speed growtdb-bench diff --git a/lib/tdb2/tools/growtdb-bench.c b/lib/tdb2/tools/growtdb-bench.c new file mode 100644 index 0000000000..f7f6845a8a --- /dev/null +++ b/lib/tdb2/tools/growtdb-bench.c @@ -0,0 +1,112 @@ +#include "tdb2.h" +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <unistd.h> +#include <err.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +static void logfn(struct tdb_context *tdb, +		  enum tdb_log_level level, +		  const char *message, +		  void *data) +{ +	fprintf(stderr, "tdb:%s:%s\n", tdb_name(tdb), message); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j, users, groups; +	TDB_DATA idxkey, idxdata; +	TDB_DATA k, d, gk; +	char cmd[100]; +	struct tdb_context *tdb; +	enum TDB_ERROR ecode; +	union tdb_attribute log; + +	if (argc != 3) { +		printf("Usage: growtdb-bench <users> <groups>\n"); +		exit(1); +	} +	users = 
atoi(argv[1]); +	groups = atoi(argv[2]); + +	sprintf(cmd, "cat /proc/%i/statm", getpid()); + +	log.base.attr = TDB_ATTRIBUTE_LOG; +	log.base.next = NULL; +	log.log.fn = logfn; + +	tdb = tdb_open("/tmp/growtdb.tdb", TDB_DEFAULT, +		       O_RDWR|O_CREAT|O_TRUNC, 0600, &log); + +	idxkey.dptr = (unsigned char *)"User index"; +	idxkey.dsize = strlen("User index"); +	idxdata.dsize = 51; +	idxdata.dptr = calloc(idxdata.dsize, 1); + +	/* Create users. */ +	k.dsize = 48; +	k.dptr = calloc(k.dsize, 1); +	d.dsize = 64; +	d.dptr = calloc(d.dsize, 1); + +	tdb_transaction_start(tdb); +	for (i = 0; i < users; i++) { +		memcpy(k.dptr, &i, sizeof(i)); +		ecode = tdb_store(tdb, k, d, TDB_INSERT); +		if (ecode != TDB_SUCCESS) +			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode)); + +		/* This simulates a growing index record. */ +		ecode = tdb_append(tdb, idxkey, idxdata); +		if (ecode != TDB_SUCCESS) +			errx(1, "tdb append failed: %s", tdb_errorstr(ecode)); +	} +	if ((ecode = tdb_transaction_commit(tdb)) != 0) +		errx(1, "tdb commit1 failed: %s", tdb_errorstr(ecode)); + +	if ((ecode = tdb_check(tdb, NULL, NULL)) != 0) +		errx(1, "tdb_check failed after initial insert!"); + +	system(cmd); + +	/* Now put them all in groups: add 32 bytes to each record for +	 * a group. */ +	gk.dsize = 48; +	gk.dptr = calloc(k.dsize, 1); +	gk.dptr[gk.dsize-1] = 1; + +	d.dsize = 32; +	for (i = 0; i < groups; i++) { +		tdb_transaction_start(tdb); +		/* Create the "group". */ +		memcpy(gk.dptr, &i, sizeof(i)); +		ecode = tdb_store(tdb, gk, d, TDB_INSERT); +		if (ecode != TDB_SUCCESS) +			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode)); + +		/* Now populate it. */ +		for (j = 0; j < users; j++) { +			/* Append to the user. */ +			memcpy(k.dptr, &j, sizeof(j)); +			if ((ecode = tdb_append(tdb, k, d)) != 0) +				errx(1, "tdb append failed: %s", +				     tdb_errorstr(ecode)); + +			/* Append to the group. 
*/ +			if ((ecode = tdb_append(tdb, gk, d)) != 0) +				errx(1, "tdb append failed: %s", +				     tdb_errorstr(ecode)); +		} +		if ((ecode = tdb_transaction_commit(tdb)) != 0) +			errx(1, "tdb commit2 failed: %s", tdb_errorstr(ecode)); +		if ((ecode = tdb_check(tdb, NULL, NULL)) != 0) +			errx(1, "tdb_check failed after iteration %i!", i); +		system(cmd); +	} + +	return 0; +} diff --git a/lib/tdb2/tools/mktdb2.c b/lib/tdb2/tools/mktdb2.c new file mode 100644 index 0000000000..c8c280349e --- /dev/null +++ b/lib/tdb2/tools/mktdb2.c @@ -0,0 +1,29 @@ +#include "tdb2.h" +#include <stdlib.h> +#include <stdio.h> +#include <fcntl.h> +#include <err.h> + +int main(int argc, char *argv[]) +{ +	unsigned int i, num_recs; +	struct tdb_context *tdb; + +	if (argc != 3 || (num_recs = atoi(argv[2])) == 0) +		errx(1, "Usage: mktdb <tdbfile> <numrecords>"); + +	tdb = tdb_open(argv[1], TDB_DEFAULT, O_CREAT|O_TRUNC|O_RDWR, 0600,NULL); +	if (!tdb) +		err(1, "Opening %s", argv[1]); + +	for (i = 0; i < num_recs; i++) { +		TDB_DATA d; + +		d.dptr = (void *)&i; +		d.dsize = sizeof(i); +		if (tdb_store(tdb, d, d, TDB_INSERT) != 0) +			err(1, "Failed to store record %i", i); +	} +	printf("Done\n"); +	return 0; +} diff --git a/lib/tdb2/tools/speed.c b/lib/tdb2/tools/speed.c new file mode 100644 index 0000000000..3222465a71 --- /dev/null +++ b/lib/tdb2/tools/speed.c @@ -0,0 +1,440 @@ +/* Simple speed test for TDB */ +#include <err.h> +#include <time.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sys/time.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include "tdb2.h" + +/* Nanoseconds per operation */ +static size_t normalize(const struct timeval *start, +			const struct timeval *stop, +			unsigned int num) +{ +	struct timeval diff; + +	timersub(stop, start, &diff); + +	/* Floating point is more accurate here. 
*/ +	return (double)(diff.tv_sec * 1000000 + diff.tv_usec) +		/ num * 1000; +} + +static size_t file_size(void) +{ +	struct stat st; + +	if (stat("/tmp/speed.tdb", &st) != 0) +		return -1; +	return st.st_size; +} + +static int count_record(struct tdb_context *tdb, +			TDB_DATA key, TDB_DATA data, void *p) +{ +	int *total = p; +	*total += *(int *)data.dptr; +	return 0; +} + +static void dump_and_clear_stats(struct tdb_context **tdb, +				 int flags, +				 union tdb_attribute *attr) +{ +	union tdb_attribute stats; +	enum TDB_ERROR ecode; + +	stats.base.attr = TDB_ATTRIBUTE_STATS; +	stats.stats.size = sizeof(stats.stats); +	ecode = tdb_get_attribute(*tdb, &stats); +	if (ecode != TDB_SUCCESS) +		errx(1, "Getting stats: %s", tdb_errorstr(ecode)); + +	printf("allocs = %llu\n", +	       (unsigned long long)stats.stats.allocs); +	printf("  alloc_subhash = %llu\n", +	       (unsigned long long)stats.stats.alloc_subhash); +	printf("  alloc_chain = %llu\n", +	       (unsigned long long)stats.stats.alloc_chain); +	printf("  alloc_bucket_exact = %llu\n", +	       (unsigned long long)stats.stats.alloc_bucket_exact); +	printf("  alloc_bucket_max = %llu\n", +	       (unsigned long long)stats.stats.alloc_bucket_max); +	printf("  alloc_leftover = %llu\n", +	       (unsigned long long)stats.stats.alloc_leftover); +	printf("  alloc_coalesce_tried = %llu\n", +	       (unsigned long long)stats.stats.alloc_coalesce_tried); +	printf("    alloc_coalesce_iterate_clash = %llu\n", +	       (unsigned long long)stats.stats.alloc_coalesce_iterate_clash); +	printf("    alloc_coalesce_lockfail = %llu\n", +	       (unsigned long long)stats.stats.alloc_coalesce_lockfail); +	printf("    alloc_coalesce_race = %llu\n", +	       (unsigned long long)stats.stats.alloc_coalesce_race); +	printf("    alloc_coalesce_succeeded = %llu\n", +	       (unsigned long long)stats.stats.alloc_coalesce_succeeded); +	printf("      alloc_coalesce_num_merged = %llu\n", +	       (unsigned long 
long)stats.stats.alloc_coalesce_num_merged); +	printf("compares = %llu\n", +	       (unsigned long long)stats.stats.compares); +	printf("  compare_wrong_bucket = %llu\n", +	       (unsigned long long)stats.stats.compare_wrong_bucket); +	printf("  compare_wrong_offsetbits = %llu\n", +	       (unsigned long long)stats.stats.compare_wrong_offsetbits); +	printf("  compare_wrong_keylen = %llu\n", +	       (unsigned long long)stats.stats.compare_wrong_keylen); +	printf("  compare_wrong_rechash = %llu\n", +	       (unsigned long long)stats.stats.compare_wrong_rechash); +	printf("  compare_wrong_keycmp = %llu\n", +	       (unsigned long long)stats.stats.compare_wrong_keycmp); +	printf("transactions = %llu\n", +	       (unsigned long long)stats.stats.transactions); +	printf("  transaction_cancel = %llu\n", +	       (unsigned long long)stats.stats.transaction_cancel); +	printf("  transaction_nest = %llu\n", +	       (unsigned long long)stats.stats.transaction_nest); +	printf("  transaction_expand_file = %llu\n", +	       (unsigned long long)stats.stats.transaction_expand_file); +	printf("  transaction_read_direct = %llu\n", +	       (unsigned long long)stats.stats.transaction_read_direct); +	printf("    transaction_read_direct_fail = %llu\n", +	       (unsigned long long)stats.stats.transaction_read_direct_fail); +	printf("  transaction_write_direct = %llu\n", +	       (unsigned long long)stats.stats.transaction_write_direct); +	printf("    transaction_write_direct_fail = %llu\n", +	       (unsigned long long)stats.stats.transaction_write_direct_fail); +	printf("expands = %llu\n", +	       (unsigned long long)stats.stats.expands); +	printf("frees = %llu\n", +	       (unsigned long long)stats.stats.frees); +	printf("locks = %llu\n", +	       (unsigned long long)stats.stats.locks); +	printf("  lock_lowlevel = %llu\n", +	       (unsigned long long)stats.stats.lock_lowlevel); +	printf("  lock_nonblock = %llu\n", +	       (unsigned long long)stats.stats.lock_nonblock); +	printf(" 
   lock_nonblock_fail = %llu\n", +	       (unsigned long long)stats.stats.lock_nonblock_fail); + +	/* Now clear. */ +	tdb_close(*tdb); +	*tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR, 0, attr); +} + +static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level, +		    const char *message, void *data) +{ +	fputs(message, stderr); +	putc('\n', stderr); +} + +int main(int argc, char *argv[]) +{ +	unsigned int i, j, num = 1000, stage = 0, stopat = -1; +	int flags = TDB_DEFAULT; +	bool transaction = false, summary = false; +	TDB_DATA key, data; +	struct tdb_context *tdb; +	struct timeval start, stop; +	union tdb_attribute seed, log; +	bool do_stats = false; +	enum TDB_ERROR ecode; + +	/* Try to keep benchmarks even. */ +	seed.base.attr = TDB_ATTRIBUTE_SEED; +	seed.base.next = NULL; +	seed.seed.seed = 0; + +	log.base.attr = TDB_ATTRIBUTE_LOG; +	log.base.next = &seed; +	log.log.fn = tdb_log; + +	if (argv[1] && strcmp(argv[1], "--internal") == 0) { +		flags = TDB_INTERNAL; +		argc--; +		argv++; +	} +	if (argv[1] && strcmp(argv[1], "--transaction") == 0) { +		transaction = true; +		argc--; +		argv++; +	} +	if (argv[1] && strcmp(argv[1], "--no-sync") == 0) { +		flags |= TDB_NOSYNC; +		argc--; +		argv++; +	} +	if (argv[1] && strcmp(argv[1], "--summary") == 0) { +		summary = true; +		argc--; +		argv++; +	} +	if (argv[1] && strcmp(argv[1], "--stats") == 0) { +		do_stats = true; +		argc--; +		argv++; +	} + +	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC, +		       0600, &log); +	if (!tdb) +		err(1, "Opening /tmp/speed.tdb"); + +	key.dptr = (void *)&i; +	key.dsize = sizeof(i); +	data = key; + +	if (argv[1]) { +		num = atoi(argv[1]); +		argv++; +		argc--; +	} + +	if (argv[1]) { +		stopat = atoi(argv[1]); +		argv++; +		argc--; +	} + +	/* Add 1000 records. 
*/ +	printf("Adding %u records: ", num); fflush(stdout); +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	gettimeofday(&start, NULL); +	for (i = 0; i < num; i++) +		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0) +			errx(1, "Inserting key %u in tdb: %s", +			     i, tdb_errorstr(ecode)); +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); + +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (do_stats) +		dump_and_clear_stats(&tdb, flags, &log); + +	if (++stage == stopat) +		exit(0); + +	/* Finding 1000 records. */ +	printf("Finding %u records: ", num); fflush(stdout); +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	gettimeofday(&start, NULL); +	for (i = 0; i < num; i++) { +		struct tdb_data dbuf; +		if ((ecode = tdb_fetch(tdb, key, &dbuf)) != TDB_SUCCESS +		    || *(int *)dbuf.dptr != i) { +			errx(1, "Fetching key %u in tdb gave %u", +			     i, ecode ? ecode : *(int *)dbuf.dptr); +		} +	} +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (do_stats) +		dump_and_clear_stats(&tdb, flags, &log); +	if (++stage == stopat) +		exit(0); + +	/* Missing 1000 records. 
*/ +	printf("Missing %u records: ", num); fflush(stdout); +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	gettimeofday(&start, NULL); +	for (i = num; i < num*2; i++) { +		struct tdb_data dbuf; +		ecode = tdb_fetch(tdb, key, &dbuf); +		if (ecode != TDB_ERR_NOEXIST) +			errx(1, "Fetching key %u in tdb gave %s", +			     i, tdb_errorstr(ecode)); +	} +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (do_stats) +		dump_and_clear_stats(&tdb, flags, &log); +	if (++stage == stopat) +		exit(0); + +	/* Traverse 1000 records. */ +	printf("Traversing %u records: ", num); fflush(stdout); +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	i = 0; +	gettimeofday(&start, NULL); +	if (tdb_traverse(tdb, count_record, &i) != num) +		errx(1, "Traverse returned wrong number of records"); +	if (i != (num - 1) * (num / 2)) +		errx(1, "Traverse tallied to %u", i); +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (do_stats) +		dump_and_clear_stats(&tdb, flags, &log); +	if (++stage == stopat) +		exit(0); + +	/* Delete 1000 records (not in order). 
*/ +	printf("Deleting %u records: ", num); fflush(stdout); +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	gettimeofday(&start, NULL); +	for (j = 0; j < num; j++) { +		i = (j + 100003) % num; +		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS) +			errx(1, "Deleting key %u in tdb: %s", +			     i, tdb_errorstr(ecode)); +	} +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (do_stats) +		dump_and_clear_stats(&tdb, flags, &log); +	if (++stage == stopat) +		exit(0); + +	/* Re-add 1000 records (not in order). */ +	printf("Re-adding %u records: ", num); fflush(stdout); +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	gettimeofday(&start, NULL); +	for (j = 0; j < num; j++) { +		i = (j + 100003) % num; +		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0) +			errx(1, "Inserting key %u in tdb: %s", +			     i, tdb_errorstr(ecode)); +	} +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (do_stats) +		dump_and_clear_stats(&tdb, flags, &log); +	if (++stage == stopat) +		exit(0); + +	/* Append 1000 records. 
*/ +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	printf("Appending %u records: ", num); fflush(stdout); +	gettimeofday(&start, NULL); +	for (i = 0; i < num; i++) +		if ((ecode = tdb_append(tdb, key, data)) != TDB_SUCCESS) +			errx(1, "Appending key %u in tdb: %s", +			     i, tdb_errorstr(ecode)); +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (++stage == stopat) +		exit(0); + +	/* Churn 1000 records: not in order! */ +	if (transaction && (ecode = tdb_transaction_start(tdb))) +		errx(1, "starting transaction: %s", tdb_errorstr(ecode)); +	printf("Churning %u records: ", num); fflush(stdout); +	gettimeofday(&start, NULL); +	for (j = 0; j < num; j++) { +		i = (j + 1000019) % num; +		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS) +			errx(1, "Deleting key %u in tdb: %s", +			     i, tdb_errorstr(ecode)); +		i += num; +		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0) +			errx(1, "Inserting key %u in tdb: %s", +			     i, tdb_errorstr(ecode)); +	} +	gettimeofday(&stop, NULL); +	if (transaction && (ecode = tdb_transaction_commit(tdb))) +		errx(1, "committing transaction: %s", tdb_errorstr(ecode)); +	printf(" %zu ns (%zu bytes)\n", +	       normalize(&start, &stop, num), file_size()); + +	if (tdb_check(tdb, NULL, NULL)) +		errx(1, "tdb_check failed!"); +	if (summary) { +		char *sumstr = NULL; +		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); +		printf("%s\n", sumstr); +		free(sumstr); +	} +	if (do_stats) +		dump_and_clear_stats(&tdb, flags, &log); +	if (++stage == stopat) +		
exit(0); + +	return 0; +} diff --git a/lib/tdb2/tools/tdb2dump.c b/lib/tdb2/tools/tdb2dump.c new file mode 100644 index 0000000000..abe1d9b871 --- /dev/null +++ b/lib/tdb2/tools/tdb2dump.c @@ -0,0 +1,115 @@ +/* +   simple tdb2 dump util +   Copyright (C) Andrew Tridgell              2001 +   Copyright (C) Rusty Russell                2011 + +   This program is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 3 of the License, or +   (at your option) any later version. + +   This program is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +   GNU General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ +#include "tdb2.h" +#include <ctype.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> + +static void print_data(TDB_DATA d) +{ +	unsigned char *p = (unsigned char *)d.dptr; +	int len = d.dsize; +	while (len--) { +		if (isprint(*p) && !strchr("\"\\", *p)) { +			fputc(*p, stdout); +		} else { +			printf("\\%02X", *p); +		} +		p++; +	} +} + +static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state) +{ +	printf("{\n"); +	printf("key(%d) = \"", (int)key.dsize); +	print_data(key); +	printf("\"\n"); +	printf("data(%d) = \"", (int)dbuf.dsize); +	print_data(dbuf); +	printf("\"\n"); +	printf("}\n"); +	return 0; +} + +static int dump_tdb(const char *fname, const char *keyname) +{ +	struct tdb_context *tdb; +	TDB_DATA key, value; + +	tdb = tdb_open(fname, 0, O_RDONLY, 0, NULL); +	if (!tdb) { +		printf("Failed to open %s\n", fname); +		return 1; +	} + +	if (!keyname) { +		
tdb_traverse(tdb, traverse_fn, NULL); +	} else { +		key = tdb_mkdata(keyname, strlen(keyname)); +		if (tdb_fetch(tdb, key, &value) != 0) { +			return 1; +		} else { +			print_data(value); +			free(value.dptr); +		} +	} + +	return 0; +} + +static void usage( void) +{ +	printf( "Usage: tdb2dump [options] <filename>\n\n"); +	printf( "   -h          this help message\n"); +	printf( "   -k keyname  dumps value of keyname\n"); +} + + int main(int argc, char *argv[]) +{ +	char *fname, *keyname=NULL; +	int c; + +	if (argc < 2) { +		printf("Usage: tdb2dump <fname>\n"); +		exit(1); +	} + +	while ((c = getopt( argc, argv, "hk:")) != -1) { +		switch (c) { +		case 'h': +			usage(); +			exit( 0); +		case 'k': +			keyname = optarg; +			break; +		default: +			usage(); +			exit( 1); +		} +	} + +	fname = argv[optind]; + +	return dump_tdb(fname, keyname); +} diff --git a/lib/tdb2/tools/tdb2restore.c b/lib/tdb2/tools/tdb2restore.c new file mode 100644 index 0000000000..658215a16c --- /dev/null +++ b/lib/tdb2/tools/tdb2restore.c @@ -0,0 +1,227 @@ +/* +   tdb2restore -- construct a tdb from tdbdump output. +   Copyright (C) Volker Lendecke		2010 +   Copyright (C) Simon McVittie			2005 + +   This program is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 3 of the License, or +   (at your option) any later version. + +   This program is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +   GNU General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "tdb2.h" +#include <assert.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> + +#define debug_fprintf(file, fmt, ...) do {/*nothing*/} while (0) + +static int read_linehead(FILE *f) +{ +	int i, c; +	int num_bytes; +	char prefix[128]; + +	while (1) { +		c = getc(f); +		if (c == EOF) { +			return -1; +		} +		if (c == '(') { +			break; +		} +	} +	for (i=0; i<sizeof(prefix); i++) { +		c = getc(f); +		if (c == EOF) { +			return -1; +		} +		prefix[i] = c; +		if (c == '"') { +			break; +		} +	} +	if (i == sizeof(prefix)) { +		return -1; +	} +	prefix[i] = '\0'; + +	if (sscanf(prefix, "%d) = ", &num_bytes) != 1) { +		return -1; +	} +	return num_bytes; +} + +static int read_hex(void) { +	int c; +	c = getchar(); +	if (c == EOF) { +		fprintf(stderr, "Unexpected EOF in data\n"); +		return -1; +	} else if (c == '"') { +		fprintf(stderr, "Unexpected \\\" sequence\n"); +		return -1; +	} else if ('0' <= c && c <= '9')  { +		return c - '0'; +	} else if ('A' <= c && c <= 'F')  { +		return c - 'A' + 10; +	} else if ('a' <= c && c <= 'f')  { +		return c - 'a' + 10; +	} else { +		fprintf(stderr, "Invalid hex: %c\n", c); +		return -1; +	} +} + +static int read_data(FILE *f, struct tdb_data *d, size_t size) { +	int c, low, high; +	int i; + +	d->dptr = (unsigned char *)malloc(size); +	if (d->dptr == NULL) { +		return -1; +	} +	d->dsize = size; + +	for (i=0; i<size; i++) { +		c = getc(f); +		if (c == EOF) { +			fprintf(stderr, "Unexpected EOF in data\n"); +			return 1; +		} else if (c == '"') { +			return 0; +		} else if (c == '\\') { +			high = read_hex(); +			if (high < 0) { +				return -1; +			} +			high = high << 4; +			assert(high == (high & 0xf0)); +			low = read_hex(); +			if (low < 0) { +				return -1; +			} +			assert(low == (low & 0x0f)); +			d->dptr[i] = (low|high); +		} else { +			d->dptr[i] = c; +		} +	} +	return 0; +} + +static int swallow(FILE *f, const char *s, int *eof) +{ +	char line[128]; + +	if 
(fgets(line, sizeof(line), f) == NULL) { +		if (eof != NULL) { +			*eof = 1; +		} +		return -1; +	} +	if (strcmp(line, s) != 0) { +		return -1; +	} +	return 0; +} + +static bool read_rec(FILE *f, struct tdb_context *tdb, int *eof) +{ +	int length; +	struct tdb_data key, data; +	bool ret = false; +	enum TDB_ERROR e; + +	key.dptr = NULL; +	data.dptr = NULL; + +	if (swallow(f, "{\n", eof) == -1) { +		goto fail; +	} +	length = read_linehead(f); +	if (length == -1) { +		goto fail; +	} +	if (read_data(f, &key, length) == -1) { +		goto fail; +	} +	if (swallow(f, "\"\n", NULL) == -1) { +		goto fail; +	} +	length = read_linehead(f); +	if (length == -1) { +		goto fail; +	} +	if (read_data(f, &data, length) == -1) { +		goto fail; +	} +	if ((swallow(f, "\"\n", NULL) == -1) +	    || (swallow(f, "}\n", NULL) == -1)) { +		goto fail; +	} +	e = tdb_store(tdb, key, data, TDB_INSERT); +	if (e != TDB_SUCCESS) { +		fprintf(stderr, "TDB error: %s\n", tdb_errorstr(e)); +		goto fail; +	} + +	ret = true; +fail: +	free(key.dptr); +	free(data.dptr); +	return ret; +} + +static int restore_tdb(const char *fname) +{ +	struct tdb_context *tdb; + +	tdb = tdb_open(fname, 0, O_RDWR|O_CREAT|O_EXCL, 0666, NULL); +	if (!tdb) { +		perror("tdb_open"); +		fprintf(stderr, "Failed to open %s\n", fname); +		return 1; +	} + +	while (1) { +		int eof = 0; +		if (!read_rec(stdin, tdb, &eof)) { +			if (eof) { +				break; +			} +			return 1; +		} +	} +	if (tdb_close(tdb)) { +		fprintf(stderr, "Error closing tdb\n"); +		return 1; +	} +	fprintf(stderr, "EOF\n"); +	return 0; +} + +int main(int argc, char *argv[]) +{ +	char *fname; + +	if (argc < 2) { +		printf("Usage: %s dbname < tdbdump_output\n", argv[0]); +		exit(1); +	} + +	fname = argv[1]; + +	return restore_tdb(fname); +} diff --git a/lib/tdb2/tools/tdb2tool.c b/lib/tdb2/tools/tdb2tool.c new file mode 100644 index 0000000000..cd301c80b7 --- /dev/null +++ b/lib/tdb2/tools/tdb2tool.c @@ -0,0 +1,798 @@ +/* +   Unix SMB/CIFS implementation. 
+   Samba database functions +   Copyright (C) Andrew Tridgell              1999-2000 +   Copyright (C) Paul `Rusty' Russell		   2000 +   Copyright (C) Jeremy Allison			   2000 +   Copyright (C) Andrew Esh                        2001 + +   This program is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 3 of the License, or +   (at your option) any later version. + +   This program is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +   GNU General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "tdb2.h" +#include <stdlib.h> +#include <stdio.h> +#include <ctype.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <string.h> +#include <stdarg.h> + +static int do_command(void); +const char *cmdname; +char *arg1, *arg2; +size_t arg1len, arg2len; +int bIterate = 0; +char *line; +TDB_DATA iterate_kbuf; +char cmdline[1024]; +static int disable_mmap; + +enum commands { +	CMD_CREATE_TDB, +	CMD_OPEN_TDB, +	CMD_TRANSACTION_START, +	CMD_TRANSACTION_COMMIT, +	CMD_TRANSACTION_CANCEL, +	CMD_ERASE, +	CMD_DUMP, +	CMD_INSERT, +	CMD_MOVE, +	CMD_STORE, +	CMD_SHOW, +	CMD_KEYS, +	CMD_HEXKEYS, +	CMD_DELETE, +#if 0 +	CMD_LIST_HASH_FREE, +	CMD_LIST_FREE, +#endif +	CMD_INFO, +	CMD_MMAP, +	CMD_SPEED, +	CMD_FIRST, +	CMD_NEXT, +	CMD_SYSTEM, +	CMD_CHECK, +	CMD_QUIT, +	CMD_HELP +}; + +typedef struct { +	const char *name; +	enum commands cmd; +} COMMAND_TABLE; + +COMMAND_TABLE cmd_table[] = { +	{"create",	CMD_CREATE_TDB}, +	{"open",	CMD_OPEN_TDB}, +#if 0 +	{"transaction_start",	CMD_TRANSACTION_START}, +	{"transaction_commit",	
CMD_TRANSACTION_COMMIT}, +	{"transaction_cancel",	CMD_TRANSACTION_CANCEL}, +#endif +	{"erase",	CMD_ERASE}, +	{"dump",	CMD_DUMP}, +	{"insert",	CMD_INSERT}, +	{"move",	CMD_MOVE}, +	{"store",	CMD_STORE}, +	{"show",	CMD_SHOW}, +	{"keys",	CMD_KEYS}, +	{"hexkeys",	CMD_HEXKEYS}, +	{"delete",	CMD_DELETE}, +#if 0 +	{"list",	CMD_LIST_HASH_FREE}, +	{"free",	CMD_LIST_FREE}, +#endif +	{"info",	CMD_INFO}, +	{"speed",	CMD_SPEED}, +	{"mmap",	CMD_MMAP}, +	{"first",	CMD_FIRST}, +	{"1",		CMD_FIRST}, +	{"next",	CMD_NEXT}, +	{"n",		CMD_NEXT}, +	{"check",	CMD_CHECK}, +	{"quit",	CMD_QUIT}, +	{"q",		CMD_QUIT}, +	{"!",		CMD_SYSTEM}, +	{NULL,		CMD_HELP} +}; + +struct timeval tp1,tp2; + +static void _start_timer(void) +{ +	gettimeofday(&tp1,NULL); +} + +static double _end_timer(void) +{ +	gettimeofday(&tp2,NULL); +	return((tp2.tv_sec - tp1.tv_sec) + +	       (tp2.tv_usec - tp1.tv_usec)*1.0e-6); +} + +static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level, +		    const char *message, void *priv) +{ +	fputs(message, stderr); +} + +/* a tdb tool for manipulating a tdb database */ + +static struct tdb_context *tdb; + +static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state); +static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state); +static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state); + +static void print_asc(const char *buf,int len) +{ +	int i; + +	/* We're probably printing ASCII strings so don't try to display +	   the trailing NULL character. 
*/ + +	if (buf[len - 1] == 0) +	        len--; + +	for (i=0;i<len;i++) +		printf("%c",isprint(buf[i])?buf[i]:'.'); +} + +static void print_data(const char *buf,int len) +{ +	int i=0; +	if (len<=0) return; +	printf("[%03X] ",i); +	for (i=0;i<len;) { +		printf("%02X ",(int)((unsigned char)buf[i])); +		i++; +		if (i%8 == 0) printf(" "); +		if (i%16 == 0) { +			print_asc(&buf[i-16],8); printf(" "); +			print_asc(&buf[i-8],8); printf("\n"); +			if (i<len) printf("[%03X] ",i); +		} +	} +	if (i%16) { +		int n; + +		n = 16 - (i%16); +		printf(" "); +		if (n>8) printf(" "); +		while (n--) printf("   "); + +		n = i%16; +		if (n > 8) n = 8; +		print_asc(&buf[i-(i%16)],n); printf(" "); +		n = (i%16) - n; +		if (n>0) print_asc(&buf[i-n],n); +		printf("\n"); +	} +} + +static void help(void) +{ +	printf("\n" +"tdbtool: \n" +"  create    dbname     : create a database\n" +"  open      dbname     : open an existing database\n" +"  openjh    dbname     : open an existing database (jenkins hash)\n" +"  transaction_start    : start a transaction\n" +"  transaction_commit   : commit a transaction\n" +"  transaction_cancel   : cancel a transaction\n" +"  erase                : erase the database\n" +"  dump                 : dump the database as strings\n" +"  keys                 : dump the database keys as strings\n" +"  hexkeys              : dump the database keys as hex values\n" +"  info                 : print summary info about the database\n" +"  insert    key  data  : insert a record\n" +"  move      key  file  : move a record to a destination tdb\n" +"  store     key  data  : store a record (replace)\n" +"  show      key        : show a record by key\n" +"  delete    key        : delete a record by key\n" +#if 0 +"  list                 : print the database hash table and freelist\n" +"  free                 : print the database freelist\n" +#endif +"  check                : check the integrity of an opened database\n" +"  speed                : perform speed tests on the 
database\n" +"  ! command            : execute system command\n" +"  1 | first            : print the first record\n" +"  n | next             : print the next record\n" +"  q | quit             : terminate\n" +"  \\n                   : repeat 'next' command\n" +"\n"); +} + +static void terror(enum TDB_ERROR err, const char *why) +{ +	if (err != TDB_SUCCESS) +		printf("%s:%s\n", tdb_errorstr(err), why); +	else +		printf("%s\n", why); +} + +static void create_tdb(const char *tdbname) +{ +	union tdb_attribute log_attr; +	log_attr.base.attr = TDB_ATTRIBUTE_LOG; +	log_attr.base.next = NULL; +	log_attr.log.fn = tdb_log; + +	if (tdb) tdb_close(tdb); +	tdb = tdb_open(tdbname, (disable_mmap?TDB_NOMMAP:0), +		       O_RDWR | O_CREAT | O_TRUNC, 0600, &log_attr); +	if (!tdb) { +		printf("Could not create %s: %s\n", tdbname, strerror(errno)); +	} +} + +static void open_tdb(const char *tdbname) +{ +	union tdb_attribute log_attr; +	log_attr.base.attr = TDB_ATTRIBUTE_LOG; +	log_attr.base.next = NULL; +	log_attr.log.fn = tdb_log; + +	if (tdb) tdb_close(tdb); +	tdb = tdb_open(tdbname, disable_mmap?TDB_NOMMAP:0, O_RDWR, 0600, +		       &log_attr); +	if (!tdb) { +		printf("Could not open %s: %s\n", tdbname, strerror(errno)); +	} +} + +static void insert_tdb(char *keyname, size_t keylen, char* data, size_t datalen) +{ +	TDB_DATA key, dbuf; +	enum TDB_ERROR ecode; + +	if ((keyname == NULL) || (keylen == 0)) { +		terror(TDB_SUCCESS, "need key"); +		return; +	} + +	key.dptr = (unsigned char *)keyname; +	key.dsize = keylen; +	dbuf.dptr = (unsigned char *)data; +	dbuf.dsize = datalen; + +	ecode = tdb_store(tdb, key, dbuf, TDB_INSERT); +	if (ecode) { +		terror(ecode, "insert failed"); +	} +} + +static void store_tdb(char *keyname, size_t keylen, char* data, size_t datalen) +{ +	TDB_DATA key, dbuf; +	enum TDB_ERROR ecode; + +	if ((keyname == NULL) || (keylen == 0)) { +		terror(TDB_SUCCESS, "need key"); +		return; +	} + +	if ((data == NULL) || (datalen == 0)) { +		terror(TDB_SUCCESS, "need 
data"); +		return; +	} + +	key.dptr = (unsigned char *)keyname; +	key.dsize = keylen; +	dbuf.dptr = (unsigned char *)data; +	dbuf.dsize = datalen; + +	printf("Storing key:\n"); +	print_rec(tdb, key, dbuf, NULL); + +	ecode = tdb_store(tdb, key, dbuf, TDB_REPLACE); +	if (ecode) { +		terror(ecode, "store failed"); +	} +} + +static void show_tdb(char *keyname, size_t keylen) +{ +	TDB_DATA key, dbuf; +	enum TDB_ERROR ecode; + +	if ((keyname == NULL) || (keylen == 0)) { +		terror(TDB_SUCCESS, "need key"); +		return; +	} + +	key.dptr = (unsigned char *)keyname; +	key.dsize = keylen; + +	ecode = tdb_fetch(tdb, key, &dbuf); +	if (ecode) { +		terror(ecode, "fetch failed"); +		return; +	} + +	print_rec(tdb, key, dbuf, NULL); + +	free( dbuf.dptr ); +} + +static void delete_tdb(char *keyname, size_t keylen) +{ +	TDB_DATA key; +	enum TDB_ERROR ecode; + +	if ((keyname == NULL) || (keylen == 0)) { +		terror(TDB_SUCCESS, "need key"); +		return; +	} + +	key.dptr = (unsigned char *)keyname; +	key.dsize = keylen; + +	ecode = tdb_delete(tdb, key); +	if (ecode) { +		terror(ecode, "delete failed"); +	} +} + +static void move_rec(char *keyname, size_t keylen, char* tdbname) +{ +	TDB_DATA key, dbuf; +	struct tdb_context *dst_tdb; +	enum TDB_ERROR ecode; + +	if ((keyname == NULL) || (keylen == 0)) { +		terror(TDB_SUCCESS, "need key"); +		return; +	} + +	if ( !tdbname ) { +		terror(TDB_SUCCESS, "need destination tdb name"); +		return; +	} + +	key.dptr = (unsigned char *)keyname; +	key.dsize = keylen; + +	ecode = tdb_fetch(tdb, key, &dbuf); +	if (ecode) { +		terror(ecode, "fetch failed"); +		return; +	} + +	print_rec(tdb, key, dbuf, NULL); + +	dst_tdb = tdb_open(tdbname, 0, O_RDWR, 0600, NULL); +	if ( !dst_tdb ) { +		terror(TDB_SUCCESS, "unable to open destination tdb"); +		return; +	} + +	ecode = tdb_store( dst_tdb, key, dbuf, TDB_REPLACE); +	if (ecode) +		terror(ecode, "failed to move record"); +	else +		printf("record moved\n"); + +	tdb_close( dst_tdb ); +} + +static int print_rec(struct 
tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) +{ +	printf("\nkey %d bytes\n", (int)key.dsize); +	print_asc((const char *)key.dptr, key.dsize); +	printf("\ndata %d bytes\n", (int)dbuf.dsize); +	print_data((const char *)dbuf.dptr, dbuf.dsize); +	return 0; +} + +static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) +{ +	printf("key %d bytes: ", (int)key.dsize); +	print_asc((const char *)key.dptr, key.dsize); +	printf("\n"); +	return 0; +} + +static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) +{ +	printf("key %d bytes\n", (int)key.dsize); +	print_data((const char *)key.dptr, key.dsize); +	printf("\n"); +	return 0; +} + +static int total_bytes; + +static int traverse_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) +{ +	total_bytes += dbuf.dsize; +	return 0; +} + +static void info_tdb(void) +{ +	enum TDB_ERROR ecode; +	char *summary; + +	ecode = tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &summary); + +	if (ecode) { +		terror(ecode, "Getting summary"); +	} else { +		printf("%s", summary); +		free(summary); +	} +} + +static void speed_tdb(const char *tlimit) +{ +	unsigned timelimit = tlimit?atoi(tlimit):0; +	double t; +	int ops; +	if (timelimit == 0) timelimit = 5; + +	ops = 0; +	printf("Testing store speed for %u seconds\n", timelimit); +	_start_timer(); +	do { +		long int r = random(); +		TDB_DATA key, dbuf; +		key = tdb_mkdata("store test", strlen("store test")); +		dbuf.dptr = (unsigned char *)&r; +		dbuf.dsize = sizeof(r); +		tdb_store(tdb, key, dbuf, TDB_REPLACE); +		t = _end_timer(); +		ops++; +	} while (t < timelimit); +	printf("%10.3f ops/sec\n", ops/t); + +	ops = 0; +	printf("Testing fetch speed for %u seconds\n", timelimit); +	_start_timer(); +	do { +		long int r = random(); +		TDB_DATA key, dbuf; +		key = tdb_mkdata("store test", strlen("store test")); +		dbuf.dptr = (unsigned char *)&r; +		dbuf.dsize = sizeof(r); +		tdb_fetch(tdb, key, 
&dbuf); +		t = _end_timer(); +		ops++; +	} while (t < timelimit); +	printf("%10.3f ops/sec\n", ops/t); + +	ops = 0; +	printf("Testing transaction speed for %u seconds\n", timelimit); +	_start_timer(); +	do { +		long int r = random(); +		TDB_DATA key, dbuf; +		key = tdb_mkdata("transaction test", strlen("transaction test")); +		dbuf.dptr = (unsigned char *)&r; +		dbuf.dsize = sizeof(r); +		tdb_transaction_start(tdb); +		tdb_store(tdb, key, dbuf, TDB_REPLACE); +		tdb_transaction_commit(tdb); +		t = _end_timer(); +		ops++; +	} while (t < timelimit); +	printf("%10.3f ops/sec\n", ops/t); + +	ops = 0; +	printf("Testing traverse speed for %u seconds\n", timelimit); +	_start_timer(); +	do { +		tdb_traverse(tdb, traverse_fn, NULL); +		t = _end_timer(); +		ops++; +	} while (t < timelimit); +	printf("%10.3f ops/sec\n", ops/t); +} + +static void toggle_mmap(void) +{ +	disable_mmap = !disable_mmap; +	if (disable_mmap) { +		printf("mmap is disabled\n"); +	} else { +		printf("mmap is enabled\n"); +	} +} + +static char *tdb_getline(const char *prompt) +{ +	static char thisline[1024]; +	char *p; +	fputs(prompt, stdout); +	thisline[0] = 0; +	p = fgets(thisline, sizeof(thisline)-1, stdin); +	if (p) p = strchr(p, '\n'); +	if (p) *p = 0; +	return p?thisline:NULL; +} + +static int do_delete_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, +                     void *state) +{ +    return tdb_delete(the_tdb, key); +} + +static void first_record(struct tdb_context *the_tdb, TDB_DATA *pkey) +{ +	TDB_DATA dbuf; +	enum TDB_ERROR ecode; +	ecode = tdb_firstkey(the_tdb, pkey); +	if (!ecode) +		ecode = tdb_fetch(the_tdb, *pkey, &dbuf); +	if (ecode) terror(ecode, "fetch failed"); +	else { +		print_rec(the_tdb, *pkey, dbuf, NULL); +	} +} + +static void next_record(struct tdb_context *the_tdb, TDB_DATA *pkey) +{ +	TDB_DATA dbuf; +	enum TDB_ERROR ecode; +	ecode = tdb_nextkey(the_tdb, pkey); + +	if (!ecode) +		ecode = tdb_fetch(the_tdb, *pkey, &dbuf); +	if (ecode) +		terror(ecode, "fetch 
failed"); +	else +		print_rec(the_tdb, *pkey, dbuf, NULL); +} + +static void check_db(struct tdb_context *the_tdb) +{ +	if (!the_tdb) { +		printf("Error: No database opened!\n"); +	} else { +		if (tdb_check(the_tdb, NULL, NULL) != 0) +			printf("Integrity check for the opened database failed.\n"); +		else +			printf("Database integrity is OK.\n"); +	} +} + +static int do_command(void) +{ +	COMMAND_TABLE *ctp = cmd_table; +	enum commands mycmd = CMD_HELP; +	int cmd_len; + +	if (cmdname && strlen(cmdname) == 0) { +		mycmd = CMD_NEXT; +	} else { +		while (ctp->name) { +			cmd_len = strlen(ctp->name); +			if (strncmp(ctp->name,cmdname,cmd_len) == 0) { +				mycmd = ctp->cmd; +				break; +			} +			ctp++; +		} +	} + +	switch (mycmd) { +	case CMD_CREATE_TDB: +		bIterate = 0; +		create_tdb(arg1); +		return 0; +	case CMD_OPEN_TDB: +		bIterate = 0; +		open_tdb(arg1); +		return 0; +	case CMD_SYSTEM: +		/* Shell command */ +		if (system(arg1) == -1) { +			terror(TDB_SUCCESS, "system() call failed\n"); +		} +		return 0; +	case CMD_QUIT: +		return 1; +	default: +		/* all the rest require a open database */ +		if (!tdb) { +			bIterate = 0; +			terror(TDB_SUCCESS, "database not open"); +			help(); +			return 0; +		} +		switch (mycmd) { +		case CMD_TRANSACTION_START: +			bIterate = 0; +			tdb_transaction_start(tdb); +			return 0; +		case CMD_TRANSACTION_COMMIT: +			bIterate = 0; +			tdb_transaction_commit(tdb); +			return 0; +		case CMD_TRANSACTION_CANCEL: +			bIterate = 0; +			tdb_transaction_cancel(tdb); +			return 0; +		case CMD_ERASE: +			bIterate = 0; +			tdb_traverse(tdb, do_delete_fn, NULL); +			return 0; +		case CMD_DUMP: +			bIterate = 0; +			tdb_traverse(tdb, print_rec, NULL); +			return 0; +		case CMD_INSERT: +			bIterate = 0; +			insert_tdb(arg1, arg1len,arg2,arg2len); +			return 0; +		case CMD_MOVE: +			bIterate = 0; +			move_rec(arg1,arg1len,arg2); +			return 0; +		case CMD_STORE: +			bIterate = 0; +			store_tdb(arg1,arg1len,arg2,arg2len); +			return 0; +		case 
CMD_SHOW: +			bIterate = 0; +			show_tdb(arg1, arg1len); +			return 0; +		case CMD_KEYS: +			tdb_traverse(tdb, print_key, NULL); +			return 0; +		case CMD_HEXKEYS: +			tdb_traverse(tdb, print_hexkey, NULL); +			return 0; +		case CMD_DELETE: +			bIterate = 0; +			delete_tdb(arg1,arg1len); +			return 0; +#if 0 +		case CMD_LIST_HASH_FREE: +			tdb_dump_all(tdb); +			return 0; +		case CMD_LIST_FREE: +			tdb_printfreelist(tdb); +			return 0; +#endif +		case CMD_INFO: +			info_tdb(); +			return 0; +		case CMD_SPEED: +			speed_tdb(arg1); +			return 0; +		case CMD_MMAP: +			toggle_mmap(); +			return 0; +		case CMD_FIRST: +			bIterate = 1; +			first_record(tdb, &iterate_kbuf); +			return 0; +		case CMD_NEXT: +			if (bIterate) +				next_record(tdb, &iterate_kbuf); +			return 0; +		case CMD_CHECK: +			check_db(tdb); +			return 0; +		case CMD_HELP: +			help(); +			return 0; +		case CMD_CREATE_TDB: +		case CMD_OPEN_TDB: +		case CMD_SYSTEM: +		case CMD_QUIT: +			/* +			 * unhandled commands.  cases included here to avoid compiler +			 * warnings. 
+			 */ +			return 0; +		} +	} + +	return 0; +} + +static char *convert_string(char *instring, size_t *sizep) +{ +	size_t length = 0; +	char *outp, *inp; +	char temp[3]; + +	outp = inp = instring; + +	while (*inp) { +		if (*inp == '\\') { +			inp++; +			if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) { +				temp[0] = *inp++; +				temp[1] = '\0'; +				if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) { +					temp[1] = *inp++; +					temp[2] = '\0'; +				} +				*outp++ = (char)strtol((const char *)temp,NULL,16); +			} else { +				*outp++ = *inp++; +			} +		} else { +			*outp++ = *inp++; +		} +		length++; +	} +	*sizep = length; +	return instring; +} + +int main(int argc, char *argv[]) +{ +	cmdname = ""; +	arg1 = NULL; +	arg1len = 0; +	arg2 = NULL; +	arg2len = 0; + +	if (argv[1]) { +		cmdname = "open"; +		arg1 = argv[1]; +		do_command(); +		cmdname =  ""; +		arg1 = NULL; +	} + +	switch (argc) { +	case 1: +	case 2: +		/* Interactive mode */ +		while ((cmdname = tdb_getline("tdb> "))) { +			arg2 = arg1 = NULL; +			if ((arg1 = strchr((const char *)cmdname,' ')) != NULL) { +				arg1++; +				arg2 = arg1; +				while (*arg2) { +					if (*arg2 == ' ') { +						*arg2++ = '\0'; +						break; +					} +					if ((*arg2++ == '\\') && (*arg2 == ' ')) { +						arg2++; +					} +				} +			} +			if (arg1) arg1 = convert_string(arg1,&arg1len); +			if (arg2) arg2 = convert_string(arg2,&arg2len); +			if (do_command()) break; +		} +		break; +	case 5: +		arg2 = convert_string(argv[4],&arg2len); +	case 4: +		arg1 = convert_string(argv[3],&arg1len); +	case 3: +		cmdname = argv[2]; +	default: +		do_command(); +		break; +	} + +	if (tdb) tdb_close(tdb); + +	return 0; +} diff --git a/lib/tdb2/tools/tdb2torture.c b/lib/tdb2/tools/tdb2torture.c new file mode 100644 index 0000000000..f6a7a5064a --- /dev/null +++ b/lib/tdb2/tools/tdb2torture.c @@ -0,0 +1,494 @@ +/* this tests tdb by doing lots of ops from several simultaneous +   writers - that stresses the locking code. 
+*/ + +#include "tdb2.h" +#include <stdlib.h> +#include <err.h> +#include <getopt.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/types.h> +#include <fcntl.h> +#include <time.h> +#include <sys/wait.h> + +//#define REOPEN_PROB 30 +#define DELETE_PROB 8 +#define STORE_PROB 4 +#define APPEND_PROB 6 +#define TRANSACTION_PROB 10 +#define TRANSACTION_PREPARE_PROB 2 +#define LOCKSTORE_PROB 5 +#define TRAVERSE_PROB 20 +#define TRAVERSE_MOD_PROB 100 +#define TRAVERSE_ABORT_PROB 500 +#define CULL_PROB 100 +#define KEYLEN 3 +#define DATALEN 100 + +static struct tdb_context *db; +static int in_transaction; +static int in_traverse; +static int error_count; +#if TRANSACTION_PROB +static int always_transaction = 0; +#endif +static int loopnum; +static int count_pipe; +static union tdb_attribute log_attr; +static union tdb_attribute seed_attr; + +static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level, +		    const char *message, void *data) +{ +	fputs(message, stdout); +	fflush(stdout); +#if 0 +	{ +		char str[200]; +		signal(SIGUSR1, SIG_IGN); +		sprintf(str,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid()); +		system(str); +	} +#endif +} + +#include "../private.h" + +static void segv_handler(int sig, siginfo_t *info, void *p) +{ +	char string[100]; + +	sprintf(string, "%u: death at %p (map_ptr %p, map_size %zu)\n", +		getpid(), info->si_addr, db->file->map_ptr, +		(size_t)db->file->map_size); +	if (write(2, string, strlen(string)) > 0) +		sleep(60); +	_exit(11); +} + +static void fatal(struct tdb_context *tdb, const char *why) +{ +	fprintf(stderr, "%u:%s:%s\n", getpid(), why, +		tdb ? 
tdb_errorstr(tdb_error(tdb)) : "(no tdb)"); +	error_count++; +} + +static char *randbuf(int len) +{ +	char *buf; +	int i; +	buf = (char *)malloc(len+1); + +	for (i=0;i<len;i++) { +		buf[i] = 'a' + (rand() % 26); +	} +	buf[i] = 0; +	return buf; +} + +static void addrec_db(void); +static int modify_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, +			   void *state) +{ +#if CULL_PROB +	if (random() % CULL_PROB == 0) { +		tdb_delete(tdb, key); +	} +#endif + +#if TRAVERSE_MOD_PROB +	if (random() % TRAVERSE_MOD_PROB == 0) { +		addrec_db(); +	} +#endif + +#if TRAVERSE_ABORT_PROB +	if (random() % TRAVERSE_ABORT_PROB == 0) +		return 1; +#endif + +	return 0; +} + +static void addrec_db(void) +{ +	int klen, dlen; +	char *k, *d; +	TDB_DATA key, data; + +	klen = 1 + (rand() % KEYLEN); +	dlen = 1 + (rand() % DATALEN); + +	k = randbuf(klen); +	d = randbuf(dlen); + +	key.dptr = (unsigned char *)k; +	key.dsize = klen+1; + +	data.dptr = (unsigned char *)d; +	data.dsize = dlen+1; + +#if REOPEN_PROB +	if (in_traverse == 0 && in_transaction == 0 && random() % REOPEN_PROB == 0) { +		tdb_reopen_all(0); +		goto next; +	} +#endif + +#if TRANSACTION_PROB +	if (in_traverse == 0 && in_transaction == 0 && (always_transaction || random() % TRANSACTION_PROB == 0)) { +		if (tdb_transaction_start(db) != 0) { +			fatal(db, "tdb_transaction_start failed"); +		} +		in_transaction++; +		goto next; +	} +	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) { +		if (random() % TRANSACTION_PREPARE_PROB == 0) { +			if (tdb_transaction_prepare_commit(db) != 0) { +				fatal(db, "tdb_transaction_prepare_commit failed"); +			} +		} +		if (tdb_transaction_commit(db) != 0) { +			fatal(db, "tdb_transaction_commit failed"); +		} +		in_transaction--; +		goto next; +	} + +	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) { +		tdb_transaction_cancel(db); +		in_transaction--; +		goto next; +	} +#endif + +#if DELETE_PROB +	if (random() % DELETE_PROB == 
0) { +		tdb_delete(db, key); +		goto next; +	} +#endif + +#if STORE_PROB +	if (random() % STORE_PROB == 0) { +		if (tdb_store(db, key, data, TDB_REPLACE) != 0) { +			fatal(db, "tdb_store failed"); +		} +		goto next; +	} +#endif + +#if APPEND_PROB +	if (random() % APPEND_PROB == 0) { +		if (tdb_append(db, key, data) != 0) { +			fatal(db, "tdb_append failed"); +		} +		goto next; +	} +#endif + +#if LOCKSTORE_PROB +	if (random() % LOCKSTORE_PROB == 0) { +		tdb_chainlock(db, key); +		if (tdb_fetch(db, key, &data) != TDB_SUCCESS) { +			data.dsize = 0; +			data.dptr = NULL; +		} +		if (tdb_store(db, key, data, TDB_REPLACE) != 0) { +			fatal(db, "tdb_store failed"); +		} +		if (data.dptr) free(data.dptr); +		tdb_chainunlock(db, key); +		goto next; +	} +#endif + +#if TRAVERSE_PROB +	/* FIXME: recursive traverses break transactions? */ +	if (in_traverse == 0 && random() % TRAVERSE_PROB == 0) { +		in_traverse++; +		tdb_traverse(db, modify_traverse, NULL); +		in_traverse--; +		goto next; +	} +#endif + +	if (tdb_fetch(db, key, &data) == TDB_SUCCESS) +		free(data.dptr); + +next: +	free(k); +	free(d); +} + +static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, +                       void *state) +{ +	tdb_delete(tdb, key); +	return 0; +} + +static void usage(void) +{ +	printf("Usage: tdbtorture" +#if TRANSACTION_PROB +	       " [-t]" +#endif +	       " [-k] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-S]\n"); +	exit(0); +} + +static void send_count_and_suicide(int sig) +{ +	/* This ensures our successor can continue where we left off. */ +	if (write(count_pipe, &loopnum, sizeof(loopnum)) != sizeof(loopnum)) +		exit(2); +	/* This gives a unique signature. 
*/ +	kill(getpid(), SIGUSR2); +} + +static int run_child(int i, int seed, unsigned num_loops, unsigned start, +		     int tdb_flags) +{ +	struct sigaction act = { .sa_sigaction = segv_handler, +				 .sa_flags = SA_SIGINFO }; +	sigaction(11, &act, NULL); + +	db = tdb_open("torture.tdb", tdb_flags, O_RDWR | O_CREAT, 0600, +		      &log_attr); +	if (!db) { +		fatal(NULL, "db open failed"); +	} + +#if 0 +	if (i == 0) { +		printf("pid %i\n", getpid()); +		sleep(9); +	} else +		sleep(10); +#endif + +	srand(seed + i); +	srandom(seed + i); + +	/* Set global, then we're ready to handle being killed. */ +	loopnum = start; +	signal(SIGUSR1, send_count_and_suicide); + +	for (;loopnum<num_loops && error_count == 0;loopnum++) { +		addrec_db(); +	} + +	if (error_count == 0) { +		tdb_traverse(db, NULL, NULL); +#if TRANSACTION_PROB +		if (always_transaction) { +			while (in_transaction) { +				tdb_transaction_cancel(db); +				in_transaction--; +			} +			if (tdb_transaction_start(db) != 0) +				fatal(db, "tdb_transaction_start failed"); +		} +#endif +		tdb_traverse(db, traverse_fn, NULL); +		tdb_traverse(db, traverse_fn, NULL); + +#if TRANSACTION_PROB +		if (always_transaction) { +			if (tdb_transaction_commit(db) != 0) +				fatal(db, "tdb_transaction_commit failed"); +		} +#endif +	} + +	tdb_close(db); + +	return (error_count < 100 ? 
error_count : 100); +} + +int main(int argc, char * const *argv) +{ +	int i, seed = -1; +	int num_loops = 5000; +	int num_procs = 3; +	int c, pfds[2]; +	extern char *optarg; +	pid_t *pids; +	int kill_random = 0; +	int *done; +	int tdb_flags = TDB_DEFAULT; + +	log_attr.base.attr = TDB_ATTRIBUTE_LOG; +	log_attr.base.next = &seed_attr; +	log_attr.log.fn = tdb_log; +	seed_attr.base.attr = TDB_ATTRIBUTE_SEED; + +	while ((c = getopt(argc, argv, "n:l:s:thkS")) != -1) { +		switch (c) { +		case 'n': +			num_procs = strtol(optarg, NULL, 0); +			break; +		case 'l': +			num_loops = strtol(optarg, NULL, 0); +			break; +		case 's': +			seed = strtol(optarg, NULL, 0); +			break; +		case 'S': +			tdb_flags = TDB_NOSYNC; +			break; +		case 't': +#if TRANSACTION_PROB +			always_transaction = 1; +#else +			fprintf(stderr, "Transactions not supported\n"); +			usage(); +#endif +			break; +		case 'k': +			kill_random = 1; +			break; +		default: +			usage(); +		} +	} + +	unlink("torture.tdb"); + +	if (seed == -1) { +		seed = (getpid() + time(NULL)) & 0x7FFFFFFF; +	} +	seed_attr.seed.seed = (((uint64_t)seed) << 32) | seed; + +	if (num_procs == 1 && !kill_random) { +		/* Don't fork for this case, makes debugging easier. */ +		error_count = run_child(0, seed, num_loops, 0, tdb_flags); +		goto done; +	} + +	pids = (pid_t *)calloc(sizeof(pid_t), num_procs); +	done = (int *)calloc(sizeof(int), num_procs); + +	if (pipe(pfds) != 0) { +		perror("Creating pipe"); +		exit(1); +	} +	count_pipe = pfds[1]; + +	for (i=0;i<num_procs;i++) { +		if ((pids[i]=fork()) == 0) { +			close(pfds[0]); +			if (i == 0) { +				printf("testing with %d processes, %d loops, seed=%d%s\n", +				       num_procs, num_loops, seed, +#if TRANSACTION_PROB +				       always_transaction ? 
" (all within transactions)" : "" +#else +				       "" +#endif +					); +			} +			exit(run_child(i, seed, num_loops, 0, tdb_flags)); +		} +	} + +	while (num_procs) { +		int status, j; +		pid_t pid; + +		if (error_count != 0) { +			/* try and stop the test on any failure */ +			for (j=0;j<num_procs;j++) { +				if (pids[j] != 0) { +					kill(pids[j], SIGTERM); +				} +			} +		} + +		pid = waitpid(-1, &status, kill_random ? WNOHANG : 0); +		if (pid == 0) { +			struct timespec ts; + +			/* Sleep for 1/10 second. */ +			ts.tv_sec = 0; +			ts.tv_nsec = 100000000; +			nanosleep(&ts, NULL); + +			/* Kill someone. */ +			kill(pids[random() % num_procs], SIGUSR1); +			continue; +		} + +		if (pid == -1) { +			perror("failed to wait for child\n"); +			exit(1); +		} + +		for (j=0;j<num_procs;j++) { +			if (pids[j] == pid) break; +		} +		if (j == num_procs) { +			printf("unknown child %d exited!?\n", (int)pid); +			exit(1); +		} +		if (WIFSIGNALED(status)) { +			if (WTERMSIG(status) == SIGUSR2 +			    || WTERMSIG(status) == SIGUSR1) { +				/* SIGUSR2 means they wrote to pipe. 
*/ +				if (WTERMSIG(status) == SIGUSR2) { +					if (read(pfds[0], &done[j], +						 sizeof(done[j])) +					    != sizeof(done[j])) +						err(1, +						    "Short read from child?"); +				} +				pids[j] = fork(); +				if (pids[j] == 0) +					exit(run_child(j, seed, num_loops, +						       done[j], tdb_flags)); +				printf("Restarting child %i for %u-%u\n", +				       j, done[j], num_loops); +				continue; +			} +			printf("child %d exited with signal %d\n", +			       (int)pid, WTERMSIG(status)); +			error_count++; +		} else { +			if (WEXITSTATUS(status) != 0) { +				printf("child %d exited with status %d\n", +				       (int)pid, WEXITSTATUS(status)); +				error_count++; +			} +		} +		/* Drop slot j. done[] is indexed in parallel with pids[], so +		   shift it too; otherwise a child restarted after a bare +		   SIGUSR1 would resume from another child's loop count. */ +		memmove(&pids[j], &pids[j+1], +			(num_procs - j - 1)*sizeof(pids[0])); +		memmove(&done[j], &done[j+1], +			(num_procs - j - 1)*sizeof(done[0])); +		num_procs--; +	} + +	free(pids); +	free(done); + +done: +	if (error_count == 0) { +		db = tdb_open("torture.tdb", TDB_DEFAULT, O_RDWR | O_CREAT, +			      0600, &log_attr); +		if (!db) { +			fatal(db, "db open failed"); +			exit(1); +		} +		if (tdb_check(db, NULL, NULL) != 0) { +			fatal(db, "db check failed"); +			exit(1); +		} +		tdb_close(db); +		printf("OK\n"); +	} + +	return error_count; +} diff --git a/lib/tdb2/transaction.c b/lib/tdb2/transaction.c new file mode 100644 index 0000000000..b13223bc2e --- /dev/null +++ b/lib/tdb2/transaction.c @@ -0,0 +1,1308 @@ + /* +   Unix SMB/CIFS implementation. + +   trivial database library + +   Copyright (C) Andrew Tridgell              2005 +   Copyright (C) Rusty Russell                2010 + +     ** NOTE! The following LGPL license applies to the tdb +     ** library. This does NOT imply that all of Samba is released +     ** under the LGPL + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. 
+ +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "private.h" +#define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0) + +/* +  transaction design: + +  - only allow a single transaction at a time per database. This makes +    using the transaction API simpler, as otherwise the caller would +    have to cope with temporary failures in transactions that conflict +    with other current transactions + +  - keep the transaction recovery information in the same file as the +    database, using a special 'transaction recovery' record pointed at +    by the header. This removes the need for extra journal files as +    used by some other databases + +  - dynamically allocate the transaction recovery record, re-using it +    for subsequent transactions. If a larger record is needed then +    tdb_free() the old record to place it on the normal tdb freelist +    before allocating the new record + +  - during transactions, keep a linked list of all writes that have +    been performed by intercepting all tdb_write() calls. The hooked +    transaction versions of tdb_read() and tdb_write() check this +    linked list and try to use the elements of the list in preference +    to the real database. + +  - don't allow any locks to be held when a transaction starts, +    otherwise we can end up with deadlock (plus lack of lock nesting +    in POSIX locks would mean the lock is lost) + +  - if the caller gains a lock during the transaction but doesn't +    release it then fail the commit + +  - allow for nested calls to tdb_transaction_start(), re-using the +    existing transaction record. 
If the inner transaction is canceled +    then a subsequent commit will fail + +  - keep a mirrored copy of the tdb hash chain heads to allow for the +    fast hash heads scan on traverse, updating the mirrored copy in +    the transaction version of tdb_write + +  - allow callers to mix transaction and non-transaction use of tdb, +    although once a transaction is started then an exclusive lock is +    gained until the transaction is committed or canceled + +  - the commit strategy involves first saving away all modified data +    into a linearised buffer in the transaction recovery area, then +    marking the transaction recovery area with a magic value to +    indicate a valid recovery record. In total 4 fsync/msync calls are +    needed per commit to prevent race conditions. It might be possible +    to reduce this to 3 or even 2 with some more work. + +  - check for a valid recovery record on open of the tdb, while the +    open lock is held. Automatically recover from the transaction +    recovery area if needed, then continue with the open as +    usual. This allows for smooth crash recovery with no administrator +    intervention. + +  - if TDB_NOSYNC is passed to flags in tdb_open then transactions are +    still available, but no transaction recovery area is used and no +    fsync/msync calls are made. +*/ + +/* +  hold the context of any current transaction +*/ +struct tdb_transaction { +	/* the original io methods - used to do IOs to the real db */ +	const struct tdb_methods *io_methods; + +	/* the list of transaction blocks. When a block is first +	   written to, it gets created in this list */ +	uint8_t **blocks; +	size_t num_blocks; +	size_t last_block_size; /* number of valid bytes in the last block */ + +	/* non-zero when an internal transaction error has +	   occurred. 
All write operations will then fail until the +	   transaction is ended */ +	int transaction_error; + +	/* when inside a transaction we need to keep track of any +	   nested tdb_transaction_start() calls, as these are allowed, +	   but don't create a new transaction */ +	unsigned int nesting; + +	/* set when a prepare has already occurred */ +	bool prepared; +	tdb_off_t magic_offset; + +	/* old file size before transaction */ +	tdb_len_t old_map_size; +}; + +/* This doesn't really need to be pagesize, but we use it for similar reasons. */ +#define PAGESIZE 65536 + +/* +  read while in a transaction. We need to check first if the data is in our list +  of transaction elements, then if not do a real read +*/ +static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off, +				       void *buf, tdb_len_t len) +{ +	size_t blk; +	enum TDB_ERROR ecode; + +	/* break it down into block sized ops */ +	while (len + (off % PAGESIZE) > PAGESIZE) { +		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE); +		ecode = transaction_read(tdb, off, buf, len2); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} +		len -= len2; +		off += len2; +		buf = (void *)(len2 + (char *)buf); +	} + +	if (len == 0) { +		return TDB_SUCCESS; +	} + +	blk = off / PAGESIZE; + +	/* see if we have it in the block list */ +	if (tdb->transaction->num_blocks <= blk || +	    tdb->transaction->blocks[blk] == NULL) { +		/* nope, do a real read */ +		ecode = tdb->transaction->io_methods->tread(tdb, off, buf, len); +		if (ecode != TDB_SUCCESS) { +			goto fail; +		} +		return 0; +	} + +	/* it is in the block list. 
Now check for the last block */ +	if (blk == tdb->transaction->num_blocks-1) { +		if (len > tdb->transaction->last_block_size) { +			ecode = TDB_ERR_IO; +			goto fail; +		} +	} + +	/* now copy it out of this block */ +	memcpy(buf, tdb->transaction->blocks[blk] + (off % PAGESIZE), len); +	return TDB_SUCCESS; + +fail: +	tdb->transaction->transaction_error = 1; +	return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +			  "transaction_read: failed at off=%zu len=%zu", +			  (size_t)off, (size_t)len); +} + + +/* +  write while in a transaction +*/ +static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off, +					const void *buf, tdb_len_t len) +{ +	size_t blk; +	enum TDB_ERROR ecode; + +	/* Only a commit is allowed on a prepared transaction */ +	if (tdb->transaction->prepared) { +		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR, +				   "transaction_write: transaction already" +				   " prepared, write not allowed"); +		goto fail; +	} + +	/* break it up into block sized chunks */ +	while (len + (off % PAGESIZE) > PAGESIZE) { +		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE); +		ecode = transaction_write(tdb, off, buf, len2); +		if (ecode != TDB_SUCCESS) { +			return -1; +		} +		len -= len2; +		off += len2; +		if (buf != NULL) { +			buf = (const void *)(len2 + (const char *)buf); +		} +	} + +	if (len == 0) { +		return TDB_SUCCESS; +	} + +	blk = off / PAGESIZE; +	off = off % PAGESIZE; + +	if (tdb->transaction->num_blocks <= blk) { +		uint8_t **new_blocks; +		/* expand the blocks array */ +		if (tdb->transaction->blocks == NULL) { +			new_blocks = (uint8_t **)malloc( +				(blk+1)*sizeof(uint8_t *)); +		} else { +			new_blocks = (uint8_t **)realloc( +				tdb->transaction->blocks, +				(blk+1)*sizeof(uint8_t *)); +		} +		if (new_blocks == NULL) { +			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +					   "transaction_write:" +					   " failed to allocate"); +			goto fail; +		} +		memset(&new_blocks[tdb->transaction->num_blocks], 0, +		       (1+(blk - 
tdb->transaction->num_blocks))*sizeof(uint8_t *)); +		tdb->transaction->blocks = new_blocks; +		tdb->transaction->num_blocks = blk+1; +		tdb->transaction->last_block_size = 0; +	} + +	/* allocate and fill a block? */ +	if (tdb->transaction->blocks[blk] == NULL) { +		tdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1); +		if (tdb->transaction->blocks[blk] == NULL) { +			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +					   "transaction_write:" +					   " failed to allocate"); +			goto fail; +		} +		if (tdb->transaction->old_map_size > blk * PAGESIZE) { +			tdb_len_t len2 = PAGESIZE; +			if (len2 + (blk * PAGESIZE) > tdb->transaction->old_map_size) { +				len2 = tdb->transaction->old_map_size - (blk * PAGESIZE); +			} +			ecode = tdb->transaction->io_methods->tread(tdb, +					blk * PAGESIZE, +					tdb->transaction->blocks[blk], +					len2); +			if (ecode != TDB_SUCCESS) { +				ecode = tdb_logerr(tdb, ecode, +						   TDB_LOG_ERROR, +						   "transaction_write:" +						   " failed to" +						   " read old block: %s", +						   strerror(errno)); +				SAFE_FREE(tdb->transaction->blocks[blk]); +				goto fail; +			} +			if (blk == tdb->transaction->num_blocks-1) { +				tdb->transaction->last_block_size = len2; +			} +		} +	} + +	/* overwrite part of an existing block */ +	if (buf == NULL) { +		memset(tdb->transaction->blocks[blk] + off, 0, len); +	} else { +		memcpy(tdb->transaction->blocks[blk] + off, buf, len); +	} +	if (blk == tdb->transaction->num_blocks-1) { +		if (len + off > tdb->transaction->last_block_size) { +			tdb->transaction->last_block_size = len + off; +		} +	} + +	return TDB_SUCCESS; + +fail: +	tdb->transaction->transaction_error = 1; +	return ecode; +} + + +/* +  write while in a transaction - this variant never expands the transaction blocks, it only +  updates existing blocks. 
This means it cannot change the recovery size +*/ +static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off, +				       const void *buf, tdb_len_t len) +{ +	size_t blk; + +	/* break it up into block sized chunks */ +	while (len + (off % PAGESIZE) > PAGESIZE) { +		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE); +		transaction_write_existing(tdb, off, buf, len2); +		len -= len2; +		off += len2; +		if (buf != NULL) { +			buf = (const void *)(len2 + (const char *)buf); +		} +	} + +	if (len == 0) { +		return; +	} + +	blk = off / PAGESIZE; +	off = off % PAGESIZE; + +	if (tdb->transaction->num_blocks <= blk || +	    tdb->transaction->blocks[blk] == NULL) { +		return; +	} + +	if (blk == tdb->transaction->num_blocks-1 && +	    off + len > tdb->transaction->last_block_size) { +		if (off >= tdb->transaction->last_block_size) { +			return; +		} +		len = tdb->transaction->last_block_size - off; +	} + +	/* overwrite part of an existing block */ +	memcpy(tdb->transaction->blocks[blk] + off, buf, len); +} + + +/* +  out of bounds check during a transaction +*/ +static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, tdb_off_t len, +				      bool probe) +{ +	if (len <= tdb->file->map_size) { +		return TDB_SUCCESS; +	} +	if (!probe) { +		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +			   "tdb_oob len %lld beyond transaction size %lld", +			   (long long)len, +			   (long long)tdb->file->map_size); +	} +	return TDB_ERR_IO; +} + +/* +  transaction version of tdb_expand(). 
+*/ +static enum TDB_ERROR transaction_expand_file(struct tdb_context *tdb, +					      tdb_off_t addition) +{ +	enum TDB_ERROR ecode; + +	/* add a write to the transaction elements, so subsequent +	   reads see the zero data */ +	ecode = transaction_write(tdb, tdb->file->map_size, NULL, addition); +	if (ecode == TDB_SUCCESS) { +		tdb->file->map_size += addition; +	} +	return ecode; +} + +static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off, +				size_t len, bool write_mode) +{ +	size_t blk = off / PAGESIZE, end_blk; + +	/* This is wrong for zero-length blocks, but will fail gracefully */ +	end_blk = (off + len - 1) / PAGESIZE; + +	/* Can only do direct if in single block and we've already copied. */ +	if (write_mode) { +		tdb->stats.transaction_write_direct++; +		if (blk != end_blk +		    || blk >= tdb->transaction->num_blocks +		    || tdb->transaction->blocks[blk] == NULL) { +			tdb->stats.transaction_write_direct_fail++; +			return NULL; +		} +		return tdb->transaction->blocks[blk] + off % PAGESIZE; +	} + +	tdb->stats.transaction_read_direct++; +	/* Single which we have copied? */ +	if (blk == end_blk +	    && blk < tdb->transaction->num_blocks +	    && tdb->transaction->blocks[blk]) +		return tdb->transaction->blocks[blk] + off % PAGESIZE; + +	/* Otherwise must be all not copied. 
*/ +	while (blk <= end_blk) { +		if (blk >= tdb->transaction->num_blocks) +			break; +		if (tdb->transaction->blocks[blk]) { +			tdb->stats.transaction_read_direct_fail++; +			return NULL; +		} +		blk++; +	} +	return tdb->transaction->io_methods->direct(tdb, off, len, false); +} + +static const struct tdb_methods transaction_methods = { +	transaction_read, +	transaction_write, +	transaction_oob, +	transaction_expand_file, +	transaction_direct, +}; + +/* +  sync to disk +*/ +static enum TDB_ERROR transaction_sync(struct tdb_context *tdb, +				       tdb_off_t offset, tdb_len_t length) +{ +	if (tdb->flags & TDB_NOSYNC) { +		return TDB_SUCCESS; +	} + +	if (fsync(tdb->file->fd) != 0) { +		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +				  "tdb_transaction: fsync failed: %s", +				  strerror(errno)); +	} +#ifdef MS_SYNC +	if (tdb->file->map_ptr) { +		tdb_off_t moffset = offset & ~(getpagesize()-1); +		if (msync(moffset + (char *)tdb->file->map_ptr, +			  length + (offset - moffset), MS_SYNC) != 0) { +			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, +					  "tdb_transaction: msync failed: %s", +					  strerror(errno)); +		} +	} +#endif +	return TDB_SUCCESS; +} + + +static void _tdb_transaction_cancel(struct tdb_context *tdb) +{ +	int i; +	enum TDB_ERROR ecode; + +	if (tdb->transaction == NULL) { +		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, +			   "tdb_transaction_cancel: no transaction"); +		return; +	} + +	if (tdb->transaction->nesting != 0) { +		tdb->transaction->transaction_error = 1; +		tdb->transaction->nesting--; +		return; +	} + +	tdb->file->map_size = tdb->transaction->old_map_size; + +	/* free all the transaction blocks */ +	for (i=0;i<tdb->transaction->num_blocks;i++) { +		if (tdb->transaction->blocks[i] != NULL) { +			free(tdb->transaction->blocks[i]); +		} +	} +	SAFE_FREE(tdb->transaction->blocks); + +	if (tdb->transaction->magic_offset) { +		const struct tdb_methods *methods = tdb->transaction->io_methods; +		uint64_t invalid = 
TDB_RECOVERY_INVALID_MAGIC; + +		/* remove the recovery marker */ +		ecode = methods->twrite(tdb, tdb->transaction->magic_offset, +					&invalid, sizeof(invalid)); +		if (ecode == TDB_SUCCESS) +			ecode = transaction_sync(tdb, +						 tdb->transaction->magic_offset, +						 sizeof(invalid)); +		if (ecode != TDB_SUCCESS) { +			tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				   "tdb_transaction_cancel: failed to remove" +				   " recovery magic"); +		} +	} + +	if (tdb->file->allrecord_lock.count) +		tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype); + +	/* restore the normal io methods */ +	tdb->methods = tdb->transaction->io_methods; + +	tdb_transaction_unlock(tdb, F_WRLCK); + +	if (tdb_has_open_lock(tdb)) +		tdb_unlock_open(tdb, F_WRLCK); + +	SAFE_FREE(tdb->transaction); +} + +/* +  start a tdb transaction. No token is returned, as only a single +  transaction is allowed to be pending per tdb_context +*/ +enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb) +{ +	enum TDB_ERROR ecode; + +	tdb->stats.transactions++; +	/* some sanity checks */ +	if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) { +		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, +						    TDB_LOG_USE_ERROR, +						    "tdb_transaction_start:" +						    " cannot start a" +						    " transaction on a " +						    "read-only or internal db"); +	} + +	/* cope with nested tdb_transaction_start() calls */ +	if (tdb->transaction != NULL) { +		if (!(tdb->flags & TDB_ALLOW_NESTING)) { +			return tdb->last_error +				= tdb_logerr(tdb, TDB_ERR_IO, +					     TDB_LOG_USE_ERROR, +					     "tdb_transaction_start:" +					     " already inside transaction"); +		} +		tdb->transaction->nesting++; +		tdb->stats.transaction_nest++; +		return 0; +	} + +	if (tdb_has_hash_locks(tdb)) { +		/* the caller must not have any locks when starting a +		   transaction as otherwise we'll be screwed by lack +		   of nested locks in POSIX */ +		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, +						   
 TDB_LOG_USE_ERROR, +						    "tdb_transaction_start:" +						    " cannot start a" +						    " transaction with locks" +						    " held"); +	} + +	tdb->transaction = (struct tdb_transaction *) +		calloc(sizeof(struct tdb_transaction), 1); +	if (tdb->transaction == NULL) { +		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, +						    TDB_LOG_ERROR, +						    "tdb_transaction_start:" +						    " cannot allocate"); +	} + +	/* get the transaction write lock. This is a blocking lock. As +	   discussed with Volker, there are a number of ways we could +	   make this async, which we will probably do in the future */ +	ecode = tdb_transaction_lock(tdb, F_WRLCK); +	if (ecode != TDB_SUCCESS) { +		SAFE_FREE(tdb->transaction->blocks); +		SAFE_FREE(tdb->transaction); +		return tdb->last_error = ecode; +	} + +	/* get a read lock over entire file. This is upgraded to a write +	   lock during the commit */ +	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true); +	if (ecode != TDB_SUCCESS) { +		goto fail_allrecord_lock; +	} + +	/* make sure we know about any file expansions already done by +	   anyone else */ +	tdb->methods->oob(tdb, tdb->file->map_size + 1, true); +	tdb->transaction->old_map_size = tdb->file->map_size; + +	/* finally hook the io methods, replacing them with +	   transaction specific methods */ +	tdb->transaction->io_methods = tdb->methods; +	tdb->methods = &transaction_methods; +	return tdb->last_error = TDB_SUCCESS; + +fail_allrecord_lock: +	tdb_transaction_unlock(tdb, F_WRLCK); +	SAFE_FREE(tdb->transaction->blocks); +	SAFE_FREE(tdb->transaction); +	return tdb->last_error = ecode; +} + + +/* +  cancel the current transaction +*/ +void tdb_transaction_cancel(struct tdb_context *tdb) +{ +	tdb->stats.transaction_cancel++; +	_tdb_transaction_cancel(tdb); +} + +/* +  work out how much space the linearised recovery data will consume (worst case) +*/ +static tdb_len_t tdb_recovery_size(struct tdb_context *tdb) +{ +	tdb_len_t recovery_size = 0; +	int 
i; + +	recovery_size = 0; +	for (i=0;i<tdb->transaction->num_blocks;i++) { +		/* widen before multiplying: i and PAGESIZE are both int, so +		   the product would overflow once the block offset +		   reaches 2GB */ +		if ((tdb_off_t)i * PAGESIZE >= tdb->transaction->old_map_size) { +			break; +		} +		if (tdb->transaction->blocks[i] == NULL) { +			continue; +		} +		recovery_size += 2*sizeof(tdb_off_t); +		if (i == tdb->transaction->num_blocks-1) { +			recovery_size += tdb->transaction->last_block_size; +		} else { +			recovery_size += PAGESIZE; +		} +	} + +	return recovery_size; +} + +static enum TDB_ERROR tdb_recovery_area(struct tdb_context *tdb, +					const struct tdb_methods *methods, +					tdb_off_t *recovery_offset, +					struct tdb_recovery_record *rec) +{ +	enum TDB_ERROR ecode; + +	*recovery_offset = tdb_read_off(tdb, +					offsetof(struct tdb_header, recovery)); +	if (TDB_OFF_IS_ERR(*recovery_offset)) { +		return *recovery_offset; +	} + +	if (*recovery_offset == 0) { +		rec->max_len = 0; +		return TDB_SUCCESS; +	} + +	ecode = methods->tread(tdb, *recovery_offset, rec, sizeof(*rec)); +	if (ecode != TDB_SUCCESS) +		return ecode; + +	tdb_convert(tdb, rec, sizeof(*rec)); +	/* ignore invalid recovery regions: can happen in crash */ +	if (rec->magic != TDB_RECOVERY_MAGIC && +	    rec->magic != TDB_RECOVERY_INVALID_MAGIC) { +		*recovery_offset = 0; +		rec->max_len = 0; +	} +	return TDB_SUCCESS; +} + +/* Returns the number of leading bytes (at most length) that are equal +   in new and old. */ +static unsigned int same(const unsigned char *new, +			 const unsigned char *old, +			 unsigned int length) +{ +	unsigned int i; + +	for (i = 0; i < length; i++) { +		if (new[i] != old[i]) +			break; +	} +	return i; +} + +/* Returns the length of the leading region to treat as changed: it ends +   where a run of at least min_same equal bytes begins, or at length if +   there is no such run. *samelen is set to the length of that equal run, +   or 0 if none was found. */ +static unsigned int different(const unsigned char *new, +			      const unsigned char *old, +			      unsigned int length, +			      unsigned int min_same, +			      unsigned int *samelen) +{ +	unsigned int i; + +	*samelen = 0; +	for (i = 0; i < length; i++) { +		if (new[i] == old[i]) { +			(*samelen)++; +		} else { +			if (*samelen >= min_same) { +				return i - *samelen; +			} +			*samelen = 0; +		} +	} + +	if (*samelen < min_same) +		*samelen = 0; +	return length - *samelen; +} + +/* Allocates 
recovery blob, without tdb_recovery_record at head set up. */ +static struct tdb_recovery_record *alloc_recovery(struct tdb_context *tdb, +						  tdb_len_t *len) +{ +	struct tdb_recovery_record *rec; +	size_t i; +	enum TDB_ERROR ecode; +	unsigned char *p; +	const struct tdb_methods *old_methods = tdb->methods; + +	rec = malloc(sizeof(*rec) + tdb_recovery_size(tdb)); +	if (!rec) { +		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +			   "transaction_setup_recovery:" +			   " cannot allocate"); +		return TDB_ERR_PTR(TDB_ERR_OOM); +	} + +	/* We temporarily revert to the old I/O methods, so we can use +	 * tdb_access_read */ +	tdb->methods = tdb->transaction->io_methods; + +	/* build the recovery data into a single blob to allow us to do a single +	   large write, which should be more efficient */ +	p = (unsigned char *)(rec + 1); +	for (i=0;i<tdb->transaction->num_blocks;i++) { +		tdb_off_t offset; +		tdb_len_t length; +		unsigned int off; +		const unsigned char *buffer; + +		if (tdb->transaction->blocks[i] == NULL) { +			continue; +		} + +		offset = i * PAGESIZE; +		length = PAGESIZE; +		if (i == tdb->transaction->num_blocks-1) { +			length = tdb->transaction->last_block_size; +		} + +		if (offset >= tdb->transaction->old_map_size) { +			continue; +		} + +		if (offset + length > tdb->file->map_size) { +			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +					   "tdb_transaction_setup_recovery:" +					   " transaction data over new region" +					   " boundary"); +			goto fail; +		} +		if (offset + length > tdb->transaction->old_map_size) { +			/* Short read at EOF. */ +			length = tdb->transaction->old_map_size - offset; +		} +		buffer = tdb_access_read(tdb, offset, length, false); +		if (TDB_PTR_IS_ERR(buffer)) { +			ecode = TDB_PTR_ERR(buffer); +			goto fail; +		} + +		/* Skip over anything the same at the start. 
*/ +		off = same(tdb->transaction->blocks[i], buffer, length); +		offset += off; + +		while (off < length) { +			tdb_len_t len; +			unsigned int samelen; + +			len = different(tdb->transaction->blocks[i] + off, +					buffer + off, length - off, +					sizeof(offset) + sizeof(len) + 1, +					&samelen); + +			memcpy(p, &offset, sizeof(offset)); +			memcpy(p + sizeof(offset), &len, sizeof(len)); +			tdb_convert(tdb, p, sizeof(offset) + sizeof(len)); +			p += sizeof(offset) + sizeof(len); +			memcpy(p, buffer + off, len); +			p += len; +			off += len + samelen; +			offset += len + samelen; +		} +		tdb_access_release(tdb, buffer); +	} + +	*len = p - (unsigned char *)(rec + 1); +	tdb->methods = old_methods; +	return rec; + +fail: +	free(rec); +	tdb->methods = old_methods; +	return TDB_ERR_PTR(ecode); +} + +static tdb_off_t create_recovery_area(struct tdb_context *tdb, +				      tdb_len_t rec_length, +				      struct tdb_recovery_record *rec) +{ +	tdb_off_t off, recovery_off; +	tdb_len_t addition; +	enum TDB_ERROR ecode; +	const struct tdb_methods *methods = tdb->transaction->io_methods; + +	/* round up to a multiple of page size. Overallocate, since each +	 * such allocation forces us to expand the file. */ +	rec->max_len +		= (((sizeof(*rec) + rec_length + rec_length / 2) +		    + PAGESIZE-1) & ~(PAGESIZE-1)) +		- sizeof(*rec); +	off = tdb->file->map_size; + +	/* Restore ->map_size before calling underlying expand_file. 
+	   Also so that we don't try to expand the file again in the +	   transaction commit, which would destroy the recovery +	   area */ +	addition = (tdb->file->map_size - tdb->transaction->old_map_size) + +		sizeof(*rec) + rec->max_len; +	tdb->file->map_size = tdb->transaction->old_map_size; +	tdb->stats.transaction_expand_file++; +	ecode = methods->expand_file(tdb, addition); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_recovery_allocate:" +				  " failed to create recovery area"); +	} + +	/* we have to reset the old map size so that we don't try to +	   expand the file again in the transaction commit, which +	   would destroy the recovery area */ +	tdb->transaction->old_map_size = tdb->file->map_size; + +	/* write the recovery header offset and sync - we can sync without a race here +	   as the magic ptr in the recovery record has not been set */ +	recovery_off = off; +	tdb_convert(tdb, &recovery_off, sizeof(recovery_off)); +	ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery), +				&recovery_off, sizeof(tdb_off_t)); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_recovery_allocate:" +				  " failed to write recovery head"); +	} +	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery), +				   &recovery_off, +				   sizeof(tdb_off_t)); +	return off; +} + +/* +  setup the recovery data that will be used on a crash during commit +*/ +static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb) +{ +	tdb_len_t recovery_size = 0; +	tdb_off_t recovery_off = 0; +	tdb_off_t old_map_size = tdb->transaction->old_map_size; +	struct tdb_recovery_record *recovery; +	const struct tdb_methods *methods = tdb->transaction->io_methods; +	uint64_t magic; +	enum TDB_ERROR ecode; + +	recovery = alloc_recovery(tdb, &recovery_size); +	if (TDB_PTR_IS_ERR(recovery)) +		return TDB_PTR_ERR(recovery); + +	ecode = tdb_recovery_area(tdb, methods, &recovery_off, 
recovery); +	if (ecode) { +		free(recovery); +		return ecode; +	} + +	if (recovery->max_len < recovery_size) { +		/* Not large enough. Free up old recovery area. */ +		if (recovery_off) { +			tdb->stats.frees++; +			ecode = add_free_record(tdb, recovery_off, +						sizeof(*recovery) +						+ recovery->max_len, +						TDB_LOCK_WAIT, true); +			free(recovery); +			if (ecode != TDB_SUCCESS) { +				return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +						  "tdb_recovery_allocate:" +						  " failed to free previous" +						  " recovery area"); +			} + +			/* Refresh recovery after add_free_record above. */ +			recovery = alloc_recovery(tdb, &recovery_size); +			if (TDB_PTR_IS_ERR(recovery)) +				return TDB_PTR_ERR(recovery); +		} + +		recovery_off = create_recovery_area(tdb, recovery_size, +						    recovery); +		if (TDB_OFF_IS_ERR(recovery_off)) { +			free(recovery); +			return recovery_off; +		} +	} + +	/* Now we know size, convert rec header. */ +	recovery->magic = TDB_RECOVERY_INVALID_MAGIC; +	recovery->len = recovery_size; +	recovery->eof = old_map_size; +	tdb_convert(tdb, recovery, sizeof(*recovery)); + +	/* write the recovery data to the recovery area. recovery_size +	   counts only the payload that follows the record header, so the +	   header size must be added or the tail of the payload is lost */ +	ecode = methods->twrite(tdb, recovery_off, recovery, +				sizeof(*recovery) + recovery_size); +	if (ecode != TDB_SUCCESS) { +		free(recovery); +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_transaction_setup_recovery:" +				  " failed to write recovery data"); +	} +	transaction_write_existing(tdb, recovery_off, recovery, +				   sizeof(*recovery) + recovery_size); + +	free(recovery); + +	/* as we don't have ordered writes, we have to sync the recovery +	   data before we update the magic to indicate that the recovery +	   data is present */ +	ecode = transaction_sync(tdb, recovery_off, +				 sizeof(*recovery) + recovery_size); +	if (ecode != TDB_SUCCESS) +		return ecode; + +	magic = TDB_RECOVERY_MAGIC; +	tdb_convert(tdb, &magic, sizeof(magic)); + +	tdb->transaction->magic_offset +		= recovery_off + offsetof(struct tdb_recovery_record, magic); + +	ecode = methods->twrite(tdb, 
tdb->transaction->magic_offset, +				&magic, sizeof(magic)); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_transaction_setup_recovery:" +				  " failed to write recovery magic"); +	} +	transaction_write_existing(tdb, tdb->transaction->magic_offset, +				   &magic, sizeof(magic)); + +	/* ensure the recovery magic marker is on disk */ +	return transaction_sync(tdb, tdb->transaction->magic_offset, +				sizeof(magic)); +} + +static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb) +{ +	const struct tdb_methods *methods; +	enum TDB_ERROR ecode; + +	if (tdb->transaction == NULL) { +		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, +				  "tdb_transaction_prepare_commit:" +				  " no transaction"); +	} + +	if (tdb->transaction->prepared) { +		_tdb_transaction_cancel(tdb); +		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, +				  "tdb_transaction_prepare_commit:" +				  " transaction already prepared"); +	} + +	if (tdb->transaction->transaction_error) { +		_tdb_transaction_cancel(tdb); +		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR, +				  "tdb_transaction_prepare_commit:" +				  " transaction error pending"); +	} + + +	if (tdb->transaction->nesting != 0) { +		return TDB_SUCCESS; +	} + +	/* check for a null transaction */ +	if (tdb->transaction->blocks == NULL) { +		return TDB_SUCCESS; +	} + +	methods = tdb->transaction->io_methods; + +	/* upgrade the main transaction lock region to a write lock */ +	ecode = tdb_allrecord_upgrade(tdb); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	/* get the open lock - this prevents new users attaching to the database +	   during the commit */ +	ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	/* Since we have whole db locked, we don't need the expansion lock. 
*/ +	if (!(tdb->flags & TDB_NOSYNC)) { +		/* Sets up tdb->transaction->recovery and +		 * tdb->transaction->magic_offset. */ +		ecode = transaction_setup_recovery(tdb); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} +	} + +	tdb->transaction->prepared = true; + +	/* expand the file to the new size if needed */ +	if (tdb->file->map_size != tdb->transaction->old_map_size) { +		tdb_len_t add; + +		add = tdb->file->map_size - tdb->transaction->old_map_size; +		/* Restore original map size for tdb_expand_file */ +		tdb->file->map_size = tdb->transaction->old_map_size; +		ecode = methods->expand_file(tdb, add); +		if (ecode != TDB_SUCCESS) { +			return ecode; +		} +	} + +	/* Keep the open lock until the actual commit */ +	return TDB_SUCCESS; +} + +/* +   prepare to commit the current transaction +*/ +enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb) +{ +	return _tdb_transaction_prepare_commit(tdb); +} + +/* +  commit the current transaction +*/ +enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb) +{ +	const struct tdb_methods *methods; +	int i; +	enum TDB_ERROR ecode; + +	if (tdb->transaction == NULL) { +		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, +						    TDB_LOG_USE_ERROR, +						    "tdb_transaction_commit:" +						    " no transaction"); +	} + +	tdb_trace(tdb, "tdb_transaction_commit"); + +	if (tdb->transaction->nesting != 0) { +		tdb->transaction->nesting--; +		return tdb->last_error = TDB_SUCCESS; +	} + +	/* check for a null transaction */ +	if (tdb->transaction->blocks == NULL) { +		_tdb_transaction_cancel(tdb); +		return tdb->last_error = TDB_SUCCESS; +	} + +	if (!tdb->transaction->prepared) { +		ecode = _tdb_transaction_prepare_commit(tdb); +		if (ecode != TDB_SUCCESS) { +			_tdb_transaction_cancel(tdb); +			return tdb->last_error = ecode; +		} +	} + +	methods = tdb->transaction->io_methods; + +	/* perform all the writes */ +	for (i=0;i<tdb->transaction->num_blocks;i++) { +		tdb_off_t offset; +		tdb_len_t 
length; + +		if (tdb->transaction->blocks[i] == NULL) { +			continue; +		} + +		offset = i * PAGESIZE; +		length = PAGESIZE; +		if (i == tdb->transaction->num_blocks-1) { +			length = tdb->transaction->last_block_size; +		} + +		ecode = methods->twrite(tdb, offset, +					tdb->transaction->blocks[i], length); +		if (ecode != TDB_SUCCESS) { +			/* we've overwritten part of the data and +			   possibly expanded the file, so we need to +			   run the crash recovery code */ +			tdb->methods = methods; +			tdb_transaction_recover(tdb); + +			_tdb_transaction_cancel(tdb); + +			return tdb->last_error = ecode; +		} +		SAFE_FREE(tdb->transaction->blocks[i]); +	} + +	SAFE_FREE(tdb->transaction->blocks); +	tdb->transaction->num_blocks = 0; + +	/* ensure the new data is on disk */ +	ecode = transaction_sync(tdb, 0, tdb->file->map_size); +	if (ecode != TDB_SUCCESS) { +		return tdb->last_error = ecode; +	} + +	/* +	  TODO: maybe write to some dummy hdr field, or write to magic +	  offset without mmap, before the last sync, instead of the +	  utime() call +	*/ + +	/* on some systems (like Linux 2.6.x) changes via mmap/msync +	   don't change the mtime of the file, this means the file may +	   not be backed up (as tdb rounding to block sizes means that +	   file size changes are quite rare too). The following forces +	   mtime changes when a transaction completes */ +#if HAVE_UTIME +	utime(tdb->name, NULL); +#endif + +	/* use a transaction cancel to free memory and remove the +	   transaction locks: it "restores" map_size, too. */ +	tdb->transaction->old_map_size = tdb->file->map_size; +	_tdb_transaction_cancel(tdb); + +	return tdb->last_error = TDB_SUCCESS; +} + + +/* +  recover from an aborted transaction. 
Must be called with exclusive +  database write access already established (including the open +  lock to prevent new processes attaching) +*/ +enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb) +{ +	tdb_off_t recovery_head, recovery_eof; +	unsigned char *data, *p; +	struct tdb_recovery_record rec; +	enum TDB_ERROR ecode; + +	/* find the recovery area */ +	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); +	if (TDB_OFF_IS_ERR(recovery_head)) { +		return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " failed to read recovery head"); +	} + +	if (recovery_head == 0) { +		/* we have never allocated a recovery record */ +		return TDB_SUCCESS; +	} + +	/* read the recovery record */ +	ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " failed to read recovery record"); +	} + +	if (rec.magic != TDB_RECOVERY_MAGIC) { +		/* there is no valid recovery data */ +		return TDB_SUCCESS; +	} + +	if (tdb->read_only) { +		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " attempt to recover read only database"); +	} + +	recovery_eof = rec.eof; + +	data = (unsigned char *)malloc(rec.len); +	if (data == NULL) { +		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " failed to allocate recovery data"); +	} + +	/* read the full recovery data */ +	ecode = tdb->methods->tread(tdb, recovery_head + sizeof(rec), data, +				    rec.len); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " failed to read recovery data"); +	} + +	/* recover the file data */ +	p = data; +	while (p+sizeof(tdb_off_t)+sizeof(tdb_len_t) < data + rec.len) { +		tdb_off_t ofs; +		tdb_len_t len; +		tdb_convert(tdb, p, sizeof(ofs) + sizeof(len)); +		
memcpy(&ofs, p, sizeof(ofs)); +		memcpy(&len, p + sizeof(ofs), sizeof(len)); +		p += sizeof(ofs) + sizeof(len); + +		ecode = tdb->methods->twrite(tdb, ofs, p, len); +		if (ecode != TDB_SUCCESS) { +			free(data); +			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +					  "tdb_transaction_recover:" +					  " failed to recover %zu bytes" +					  " at offset %zu", +					  (size_t)len, (size_t)ofs); +		} +		p += len; +	} + +	free(data); + +	ecode = transaction_sync(tdb, 0, tdb->file->map_size); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " failed to sync recovery"); +	} + +	/* if the recovery area is after the recovered eof then remove it */ +	if (recovery_eof <= recovery_head) { +		ecode = tdb_write_off(tdb, offsetof(struct tdb_header, +						    recovery), +				      0); +		if (ecode != TDB_SUCCESS) { +			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +					  "tdb_transaction_recover:" +					  " failed to remove recovery head"); +		} +	} + +	/* remove the recovery magic */ +	ecode = tdb_write_off(tdb, +			      recovery_head +			      + offsetof(struct tdb_recovery_record, magic), +			      TDB_RECOVERY_INVALID_MAGIC); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " failed to remove recovery magic"); +	} + +	ecode = transaction_sync(tdb, 0, recovery_eof); +	if (ecode != TDB_SUCCESS) { +		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, +				  "tdb_transaction_recover:" +				  " failed to sync2 recovery"); +	} + +	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, +		   "tdb_transaction_recover: recovered %zu byte database", +		   (size_t)recovery_eof); + +	/* all done */ +	return TDB_SUCCESS; +} + +tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb) +{ +	tdb_off_t recovery_head; +	struct tdb_recovery_record rec; +	enum TDB_ERROR ecode; + +	/* find the recovery area */ +	recovery_head = tdb_read_off(tdb, offsetof(struct 
tdb_header,recovery)); +	if (TDB_OFF_IS_ERR(recovery_head)) { +		return recovery_head; +	} + +	if (recovery_head == 0) { +		/* we have never allocated a recovery record */ +		return false; +	} + +	/* read the recovery record */ +	ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)); +	if (ecode != TDB_SUCCESS) { +		return ecode; +	} + +	return (rec.magic == TDB_RECOVERY_MAGIC); +} diff --git a/lib/tdb2/traverse.c b/lib/tdb2/traverse.c new file mode 100644 index 0000000000..179e095142 --- /dev/null +++ b/lib/tdb2/traverse.c @@ -0,0 +1,99 @@ + /* +   Trivial Database 2: traverse function. +   Copyright (C) Rusty Russell 2010 + +   This library is free software; you can redistribute it and/or +   modify it under the terms of the GNU Lesser General Public +   License as published by the Free Software Foundation; either +   version 3 of the License, or (at your option) any later version. + +   This library is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public +   License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+*/ +#include "private.h" +#include <ccan/likely/likely.h> + +int64_t tdb_traverse_(struct tdb_context *tdb, +		      int (*fn)(struct tdb_context *, +				TDB_DATA, TDB_DATA, void *), +		      void *p) +{ +	enum TDB_ERROR ecode; +	struct traverse_info tinfo; +	struct tdb_data k, d; +	int64_t count = 0; + +	k.dptr = NULL; +	for (ecode = first_in_hash(tdb, &tinfo, &k, &d.dsize); +	     ecode == TDB_SUCCESS; +	     ecode = next_in_hash(tdb, &tinfo, &k, &d.dsize)) { +		d.dptr = k.dptr + k.dsize; + +		count++; +		if (fn && fn(tdb, k, d, p)) { +			free(k.dptr); +			tdb->last_error = TDB_SUCCESS; +			return count; +		} +		free(k.dptr); +	} + +	if (ecode != TDB_ERR_NOEXIST) { +		return tdb->last_error = ecode; +	} +	tdb->last_error = TDB_SUCCESS; +	return count; +} + +enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key) +{ +	struct traverse_info tinfo; + +	return tdb->last_error = first_in_hash(tdb, &tinfo, key, NULL); +} + +/* We lock twice, not very efficient.  We could keep last key & tinfo cached. */ +enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key) +{ +	struct traverse_info tinfo; +	struct hash_info h; +	struct tdb_used_record rec; + +	tinfo.prev = find_and_lock(tdb, *key, F_RDLCK, &h, &rec, &tinfo); +	free(key->dptr); +	if (TDB_OFF_IS_ERR(tinfo.prev)) { +		return tdb->last_error = tinfo.prev; +	} +	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); + +	return tdb->last_error = next_in_hash(tdb, &tinfo, key, NULL); +} + +static int wipe_one(struct tdb_context *tdb, +		    TDB_DATA key, TDB_DATA data, enum TDB_ERROR *ecode) +{ +	*ecode = tdb_delete(tdb, key); +	return (*ecode != TDB_SUCCESS); +} + +enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb) +{ +	enum TDB_ERROR ecode; +	int64_t count; + +	ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false); +	if (ecode != TDB_SUCCESS) +		return tdb->last_error = ecode; + +	/* FIXME: Be smarter. 
*/ +	count = tdb_traverse(tdb, wipe_one, &ecode); +	if (count < 0) +		ecode = count; +	tdb_allrecord_unlock(tdb, F_WRLCK); +	return tdb->last_error = ecode; +}  | 
