300 files changed, 45975 insertions, 1337 deletions
diff --git a/lib/addns/dnsgss.c b/lib/addns/dnsgss.c
index c9037417da..19b734a6a3 100644
--- a/lib/addns/dnsgss.c
+++ b/lib/addns/dnsgss.c
@@ -92,7 +92,7 @@ static DNS_ERROR dns_negotiate_gss_ctx_int( TALLOC_CTX *mem_ctx,
 	DNS_ERROR err;
 
 	gss_OID_desc krb5_oid_desc =
-		{ 9, (char *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02" };
+		{ 9, (const char *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02" };
 
 	*ctx = GSS_C_NO_CONTEXT;
 	input_ptr = NULL;
@@ -222,7 +222,7 @@ DNS_ERROR dns_negotiate_sec_ctx( const char *target_realm,
 	gss_name_t targ_name;
 
 	gss_OID_desc nt_host_oid_desc =
-		{10, (char *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02\x01"};
+		{10, (const char *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02\x01"};
 
 	TALLOC_CTX *mem_ctx;
 
diff --git a/lib/addns/dnsmarshall.c b/lib/addns/dnsmarshall.c
index 5530290c57..59d6470f34 100644
--- a/lib/addns/dnsmarshall.c
+++ b/lib/addns/dnsmarshall.c
@@ -39,7 +39,7 @@ struct dns_buffer *dns_create_buffer(TALLOC_CTX *mem_ctx)
 	 */
 	result->size = 2;
 
-	if (!(result->data = TALLOC_ARRAY(result, uint8, result->size))) {
+	if (!(result->data = talloc_array(result, uint8, result->size))) {
 		TALLOC_FREE(result);
 		return NULL;
 	}
@@ -78,7 +78,7 @@ void dns_marshall_buffer(struct dns_buffer *buf, const uint8 *data,
 
 		new_size += (64 - (new_size % 64));
 
-		if (!(new_data = TALLOC_REALLOC_ARRAY(buf, buf->data, uint8,
+		if (!(new_data = talloc_realloc(buf, buf->data, uint8,
 						      new_size))) {
 			buf->error = ERROR_DNS_NO_MEMORY;
 			return;
@@ -223,7 +223,7 @@ static void dns_unmarshall_label(TALLOC_CTX *mem_ctx,
 
 	label->len = len;
 
-	if (!(label->label = TALLOC_ARRAY(label, char, len+1))) {
+	if (!(label->label = talloc_array(label, char, len+1))) {
 		buf->error = ERROR_DNS_NO_MEMORY;
 		goto error;
 	}
@@ -329,7 +329,7 @@ static void dns_unmarshall_rr(TALLOC_CTX *mem_ctx,
 	if (!(ERR_DNS_IS_OK(buf->error))) return;
 
 	if (r->data_length != 0) {
-		if (!(r->data = TALLOC_ARRAY(r, uint8, r->data_length))) {
+		if (!(r->data = talloc_array(r, uint8, r->data_length))) {
 			buf->error = ERROR_DNS_NO_MEMORY;
 			return;
 		}
@@ -390,7 +390,7 @@ DNS_ERROR dns_unmarshall_request(TALLOC_CTX *mem_ctx,
 	uint16 i;
 	DNS_ERROR err;
 
-	if (!(req = TALLOC_ZERO_P(mem_ctx, struct dns_request))) {
+	if (!(req = talloc_zero(mem_ctx, struct dns_request))) {
 		return ERROR_DNS_NO_MEMORY;
 	}
 
@@ -406,22 +406,22 @@ DNS_ERROR dns_unmarshall_request(TALLOC_CTX *mem_ctx,
 	err = ERROR_DNS_NO_MEMORY;
 
 	if ((req->num_questions != 0) &&
-	    !(req->questions = TALLOC_ARRAY(req, struct dns_question *,
+	    !(req->questions = talloc_array(req, struct dns_question *,
 					    req->num_questions))) {
 		goto error;
 	}
 	if ((req->num_answers != 0) &&
-	    !(req->answers = TALLOC_ARRAY(req, struct dns_rrec *,
+	    !(req->answers = talloc_array(req, struct dns_rrec *,
 					  req->num_answers))) {
 		goto error;
 	}
 	if ((req->num_auths != 0) &&
-	    !(req->auths = TALLOC_ARRAY(req, struct dns_rrec *,
+	    !(req->auths = talloc_array(req, struct dns_rrec *,
 					req->num_auths))) {
 		goto error;
 	}
 	if ((req->num_additionals != 0) &&
-	    !(req->additionals = TALLOC_ARRAY(req, struct dns_rrec *,
+	    !(req->additionals = talloc_array(req, struct dns_rrec *,
 					      req->num_additionals))) {
 		goto error;
 	}
diff --git a/lib/addns/dnsrecord.c b/lib/addns/dnsrecord.c
index 559c2644d4..2240d08fb9 100644
--- a/lib/addns/dnsrecord.c
+++ b/lib/addns/dnsrecord.c
@@ -31,8 +31,8 @@ DNS_ERROR dns_create_query( TALLOC_CTX *mem_ctx, const char *name,
 	struct dns_question *q;
 	DNS_ERROR err;
 
-	if (!(req = TALLOC_ZERO_P(mem_ctx, struct dns_request)) ||
-	    !(req->questions = TALLOC_ARRAY(req, struct dns_question *, 1)) ||
+	if (!(req = talloc_zero(mem_ctx, struct dns_request)) ||
+	    !(req->questions = talloc_array(req, struct dns_question *, 1)) ||
 	    !(req->questions[0] = talloc(req->questions,
 					 struct dns_question))) {
 		TALLOC_FREE(req);
@@ -64,8 +64,8 @@ DNS_ERROR dns_create_update( TALLOC_CTX *mem_ctx, const char *name,
 	struct dns_zone *z;
 	DNS_ERROR err;
 
-	if (!(req = TALLOC_ZERO_P(mem_ctx, struct dns_update_request)) ||
-	    !(req->zones = TALLOC_ARRAY(req, struct dns_zone *, 1)) ||
+	if (!(req = talloc_zero(mem_ctx, struct dns_update_request)) ||
+	    !(req->zones = talloc_array(req, struct dns_zone *, 1)) ||
 	    !(req->zones[0] = talloc(req->zones, struct dns_zone))) {
 		TALLOC_FREE(req);
 		return ERROR_DNS_NO_MEMORY;
@@ -131,8 +131,8 @@ DNS_ERROR dns_create_a_record(TALLOC_CTX *mem_ctx, const char *host,
 		return ERROR_DNS_SUCCESS;
 	}
 
-	ip = ((struct sockaddr_in *)pss)->sin_addr;
-	if (!(data = (uint8 *)TALLOC_MEMDUP(mem_ctx, (const void *)&ip.s_addr,
+	ip = ((const struct sockaddr_in *)pss)->sin_addr;
+	if (!(data = (uint8 *)talloc_memdup(mem_ctx, (const void *)&ip.s_addr,
 					    sizeof(ip.s_addr)))) {
 		return ERROR_DNS_NO_MEMORY;
 	}
@@ -240,7 +240,7 @@ DNS_ERROR dns_unmarshall_tkey_record(TALLOC_CTX *mem_ctx, struct dns_rrec *rec,
 	if (!ERR_DNS_IS_OK(buf.error)) goto error;
 
 	if (tkey->key_length) {
-		if (!(tkey->key = TALLOC_ARRAY(tkey, uint8, tkey->key_length))) {
+		if (!(tkey->key = talloc_array(tkey, uint8, tkey->key_length))) {
 			buf.error = ERROR_DNS_NO_MEMORY;
 			goto error;
 		}
@@ -308,7 +308,7 @@ DNS_ERROR dns_add_rrec(TALLOC_CTX *mem_ctx, struct dns_rrec *rec,
 {
 	struct dns_rrec **new_records;
 
-	if (!(new_records = TALLOC_REALLOC_ARRAY(mem_ctx, *records,
+	if (!(new_records = talloc_realloc(mem_ctx, *records,
 						 struct dns_rrec *,
 						 (*num_records)+1))) {
 		return ERROR_DNS_NO_MEMORY;
diff --git a/lib/addns/dnssock.c b/lib/addns/dnssock.c
index 42b4e2d40f..aaeb3f03fa 100644
--- a/lib/addns/dnssock.c
+++ b/lib/addns/dnssock.c
@@ -250,7 +250,7 @@ static DNS_ERROR dns_receive_tcp(TALLOC_CTX *mem_ctx,
 	DNS_ERROR err;
 	uint16 len;
 
-	if (!(buf = TALLOC_ZERO_P(mem_ctx, struct dns_buffer))) {
+	if (!(buf = talloc_zero(mem_ctx, struct dns_buffer))) {
 		return ERROR_DNS_NO_MEMORY;
 	}
 
@@ -262,7 +262,7 @@ static DNS_ERROR dns_receive_tcp(TALLOC_CTX *mem_ctx,
 	buf->size = ntohs(len);
 
 	if (buf->size) {
-		if (!(buf->data = TALLOC_ARRAY(buf, uint8, buf->size))) {
+		if (!(buf->data = talloc_array(buf, uint8, buf->size))) {
 			TALLOC_FREE(buf);
 			return ERROR_DNS_NO_MEMORY;
 		}
@@ -287,7 +287,7 @@ static DNS_ERROR dns_receive_udp(TALLOC_CTX *mem_ctx,
 	struct dns_buffer *buf;
 	ssize_t received;
 
-	if (!(buf = TALLOC_ZERO_P(mem_ctx, struct dns_buffer))) {
+	if (!(buf = talloc_zero(mem_ctx, struct dns_buffer))) {
 		return ERROR_DNS_NO_MEMORY;
 	}
 
@@ -295,7 +295,7 @@ static DNS_ERROR dns_receive_udp(TALLOC_CTX *mem_ctx,
 	 * UDP based DNS can only be 512 bytes
 	 */
 
-	if (!(buf->data = TALLOC_ARRAY(buf, uint8, 512))) {
+	if (!(buf->data = talloc_array(buf, uint8, 512))) {
 		TALLOC_FREE(buf);
 		return ERROR_DNS_NO_MEMORY;
 	}
diff --git a/lib/addns/dnsutils.c b/lib/addns/dnsutils.c
index 37b862c7f0..43305a9873 100644
--- a/lib/addns/dnsutils.c
+++ b/lib/addns/dnsutils.c
@@ -53,7 +53,7 @@ static DNS_ERROR LabelList( TALLOC_CTX *mem_ctx,
 		return ERROR_DNS_INVALID_NAME;
 	}
 
-	if (!(result = TALLOC_ZERO_P(mem_ctx, struct dns_domain_label))) {
+	if (!(result = talloc_zero(mem_ctx, struct dns_domain_label))) {
 		return ERROR_DNS_NO_MEMORY;
 	}
 
@@ -138,7 +138,7 @@ char *dns_generate_keyname( TALLOC_CTX *mem_ctx )
 	/*
 	 * uuid_unparse gives 36 bytes plus '\0'
 	 */
-	if (!(result = TALLOC_ARRAY(mem_ctx, char, 37))) {
+	if (!(result = talloc_array(mem_ctx, char, 37))) {
 		return NULL;
 	}
 
diff --git a/lib/async_req/async_sock.c b/lib/async_req/async_sock.c
index 86053d94e8..dfb1a1cdbd 100644
--- a/lib/async_req/async_sock.c
+++ b/lib/async_req/async_sock.c
@@ -386,6 +386,7 @@ struct writev_state {
 	int count;
 	size_t total_size;
 	uint16_t flags;
+	bool err_on_readability;
 };
 
 static void writev_trigger(struct tevent_req *req, void *private_data);
@@ -413,10 +414,8 @@ struct tevent_req *writev_send(TALLOC_CTX *mem_ctx, struct tevent_context *ev,
 	if (state->iov == NULL) {
 		goto fail;
 	}
-	state->flags = TEVENT_FD_WRITE;
-	if (err_on_readability) {
-		state->flags |= TEVENT_FD_READ;
-	}
+	state->flags = TEVENT_FD_WRITE|TEVENT_FD_READ;
+	state->err_on_readability = err_on_readability;
 
 	if (queue == NULL) {
 		struct tevent_fd *fde;
@@ -462,8 +461,35 @@ static void writev_handler(struct tevent_context *ev, struct tevent_fd *fde,
 	to_write = 0;
 
 	if ((state->flags & TEVENT_FD_READ) && (flags & TEVENT_FD_READ)) {
-		tevent_req_error(req, EPIPE);
-		return;
+		int ret, value;
+
+		if (state->err_on_readability) {
+			/* Readable and the caller wants an error on read. */
+			tevent_req_error(req, EPIPE);
+			return;
+		}
+
+		/* Might be an error. Check if there are bytes to read */
+		ret = ioctl(state->fd, FIONREAD, &value);
+		/* FIXME - should we also check
+		   for ret == 0 and value == 0 here ? */
+		if (ret == -1) {
+			/* There's an error. */
+			tevent_req_error(req, EPIPE);
+			return;
+		}
+		/* The fd will keep reporting TEVENT_FD_READ until the
+		   pending bytes are read, so if there was an error we
+		   wait for that read and pick the error up in the read
+		   callback function. Until then, remove TEVENT_FD_READ
+		   from the flags we're waiting for. */
+		state->flags &= ~TEVENT_FD_READ;
+		TEVENT_FD_NOT_READABLE(fde);
+
+		/* If not writable, we're done. */
+		if (!(flags & TEVENT_FD_WRITE)) {
+			return;
+		}
 	}
 
 	for (i=0; i<state->count; i++) {
diff --git a/lib/ccan/array_size/LICENSE b/lib/ccan/array_size/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/array_size/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/array_size/_info b/lib/ccan/array_size/_info
new file mode 100644
index 0000000000..af7ef1cfd2
--- /dev/null
+++ b/lib/ccan/array_size/_info
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * array_size - routine for safely deriving the size of a visible array.
+ *
+ * This provides a simple ARRAY_SIZE() macro, which (given a good compiler)
+ * will also break compile if you try to use it on a pointer.
+ *
+ * This can ensure your code is robust to changes, without needing a gratuitous
+ * macro or constant.
+ *
+ * Example:
+ *	// Outputs "Initialized 32 values"
+ *	#include <ccan/array_size/array_size.h>
+ *	#include <stdlib.h>
+ *	#include <stdio.h>
+ *
+ *	// We currently use 32 random values.
+ *	static unsigned int vals[32];
+ *
+ *	int main(void)
+ *	{
+ *		unsigned int i;
+ *		for (i = 0; i < ARRAY_SIZE(vals); i++)
+ *			vals[i] = random();
+ *		printf("Initialized %u values\n", i);
+ *		return 0;
+ *	}
+ *
+ * License: LGPL (2 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)	/* exactly one command-line argument is required */
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {	/* "depends" query: print one module path per line */
+		printf("ccan/build_assert\n");	/* the only ccan header array_size.h includes */
+		return 0;
+	}
+
+	return 1;	/* any other argument is rejected */
+}
diff --git a/lib/ccan/array_size/array_size.h b/lib/ccan/array_size/array_size.h
new file mode 100644
index 0000000000..0876945c5e
--- /dev/null
+++ b/lib/ccan/array_size/array_size.h
@@ -0,0 +1,25 @@
+#ifndef CCAN_ARRAY_SIZE_H
+#define CCAN_ARRAY_SIZE_H
+#include "config.h"
+#include <ccan/build_assert/build_assert.h>
+
+/**
+ * ARRAY_SIZE - get the number of elements in a visible array
+ * @arr: the array whose size you want.
+ *
+ * This does not work on pointers, or arrays declared as [], or
+ * function parameters.  With correct compiler support, such usage
+ * will cause a build error (see build_assert).
+ */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + _array_size_chk(arr))
+
+#if HAVE_BUILTIN_TYPES_COMPATIBLE_P && HAVE_TYPEOF
+/* Relies on two gcc extensions: typeof and __builtin_types_compatible_p.
+ * &(arr)[0] is a pointer, which is a different type from a real array. */
+#define _array_size_chk(arr)						\
+	BUILD_ASSERT_OR_ZERO(!__builtin_types_compatible_p(typeof(arr),	\
+							typeof(&(arr)[0])))
+#else	/* no compiler support: the check degrades to a no-op */
+#define _array_size_chk(arr) 0
+#endif
+#endif /* CCAN_ARRAY_SIZE_H */
diff --git a/lib/ccan/array_size/test/compile_fail-function-param.c b/lib/ccan/array_size/test/compile_fail-function-param.c
new file mode 100644
index 0000000000..cb64d98424
--- /dev/null
+++ b/lib/ccan/array_size/test/compile_fail-function-param.c
@@ -0,0 +1,24 @@
+#include <ccan/array_size/array_size.h>
+#include <stdlib.h>
+
+struct foo {
+	unsigned int a, b;
+};
+
+int check_parameter(const struct foo array[4]);	/* prototype precedes the definition below */
+int check_parameter(const struct foo array[4])	/* "array[4]" in a parameter list is really a pointer */
+{
+#ifdef FAIL	/* FAIL build: ARRAY_SIZE on a function parameter is meant to be rejected */
+	return (ARRAY_SIZE(array) == 4);
+#if !HAVE_TYPEOF || !HAVE_BUILTIN_TYPES_COMPATIBLE_P	/* without both, _array_size_chk is 0 */
+#error "Unfortunately we don't fail if _array_size_chk is a noop."
+#endif
+#else
+	return sizeof(array) == 4 * sizeof(struct foo);	/* sizeof(array) is the size of a pointer here */
+#endif
+}
+
+int main(int argc, char *argv[])
+{
+	return check_parameter(NULL);	/* NULL is accepted because the array parameter is a pointer */
+}
diff --git a/lib/ccan/array_size/test/compile_fail.c b/lib/ccan/array_size/test/compile_fail.c
new file mode 100644
index 0000000000..37d315f219
--- /dev/null
+++ b/lib/ccan/array_size/test/compile_fail.c
@@ -0,0 +1,14 @@
+#include <ccan/array_size/array_size.h>
+
+int main(int argc, char *argv[8])	/* the [8] bound is ignored: argv is still a pointer */
+{
+	char array[100];
+#ifdef FAIL	/* FAIL build: ARRAY_SIZE(argv) is the use that is meant to be rejected */
+	return ARRAY_SIZE(argv) + ARRAY_SIZE(array);
+#if !HAVE_TYPEOF || !HAVE_BUILTIN_TYPES_COMPATIBLE_P	/* without both, _array_size_chk is 0 */
+#error "Unfortunately we don't fail if _array_size_chk is a noop."
+#endif
+#else
+	return ARRAY_SIZE(array);	/* valid use on a real array */
+#endif
+}
diff --git a/lib/ccan/array_size/test/run.c b/lib/ccan/array_size/test/run.c
new file mode 100644
index 0000000000..37b4200b44
--- /dev/null
+++ b/lib/ccan/array_size/test/run.c
@@ -0,0 +1,33 @@
+#include <ccan/array_size/array_size.h>
+#include <ccan/tap/tap.h>
+
+static char array1[1];
+static int array2[2];
+static unsigned long array3[3][5];	/* 2-D: ARRAY_SIZE counts the outer dimension (3) */
+struct foo {
+	unsigned int a, b;
+	char string[100];
+};
+static struct foo array4[4];	/* array of structs */
+
+/* Make sure ARRAY_SIZE() can be used in static initializers. */
+static int array1_size = ARRAY_SIZE(array1);
+static int array2_size = ARRAY_SIZE(array2);
+static int array3_size = ARRAY_SIZE(array3);
+static int array4_size = ARRAY_SIZE(array4);
+
+int main(int argc, char *argv[])
+{
+	plan_tests(8);	/* matches the eight ok1() checks below */
+	ok1(array1_size == 1);	/* values captured by the static initializers */
+	ok1(array2_size == 2);
+	ok1(array3_size == 3);
+	ok1(array4_size == 4);
+
+	ok1(ARRAY_SIZE(array1) == 1);	/* same checks with the macro expanded in place */
+	ok1(ARRAY_SIZE(array2) == 2);
+	ok1(ARRAY_SIZE(array3) == 3);
+	ok1(ARRAY_SIZE(array4) == 4);
+
+	return exit_status();	/* NOTE(review): presumably the tap library's overall result -- confirm */
+}
diff --git a/lib/ccan/asearch/LICENSE b/lib/ccan/asearch/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/asearch/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/asearch/_info b/lib/ccan/asearch/_info
new file mode 100644
index 0000000000..857475016f
--- /dev/null
+++ b/lib/ccan/asearch/_info
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * asearch - typesafe binary search (bsearch)
+ *
+ * An ordered array of objects can be efficiently searched using a binary
+ * search algorithm; the time taken is around log(number of elements).
+ *
+ * This version uses macros to be typesafe on platforms which support it.
+ *
+ * License: LGPL
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Example:
+ *	#include <ccan/asearch/asearch.h>
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *
+ *	static int cmp(const char *key, char *const *elem)
+ *	{
+ *		return strcmp(key, *elem);
+ *	}
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		char **p;
+ *
+ *		if (argc < 2) {
+ *			fprintf(stderr, "Usage: %s <key> <list>...\n"
+ *				"Print position of key in (sorted) list\n",
+ *				argv[0]);
+ *			exit(1);
+ *		}
+ *
+ *		p = asearch(argv[1], &argv[2], argc-2, cmp);
+ *		if (!p) {
+ *			printf("Not found!\n");
+ *			return 1;
+ *		}
+ *		printf("%u\n", (unsigned int)(p - &argv[2]));
+ *		return 0;
+ *	}
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)	/* exactly one query argument is required */
+		return 1;
+	/* "depends" query: print the ccan modules listed as dependencies, one per line. */
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/typesafe_cb\n");
+		printf("ccan/array_size\n");
+		return 0;
+	}
+	/* Any other query is not recognised here and is reported as failure. */
+	return 1;
+}
diff --git a/lib/ccan/asearch/asearch.h b/lib/ccan/asearch/asearch.h
new file mode 100644
index 0000000000..d252284e7d
--- /dev/null
+++ b/lib/ccan/asearch/asearch.h
@@ -0,0 +1,37 @@
+#ifndef CCAN_ASEARCH_H
+#define CCAN_ASEARCH_H
+#include <stdlib.h>
+#include <ccan/typesafe_cb/typesafe_cb.h>
+
+/**
+ * asearch - search an array of elements
+ * @key: pointer to item being searched for
+ * @base: pointer to the (already sorted) array to search
+ * @num: number of elements
+ * @cmp: pointer to comparison function, called with (key, pointer to element)
+ *
+ * This function does a binary search (via bsearch()) on the given array,
+ * whose contents should already be in ascending sorted order under @cmp.
+ * It evaluates to a pointer to a matching element, or NULL if none matches.
+ *
+ * Note that the key need not have the same type as the elements in
+ * the array, e.g. key could be a string and the comparison function
+ * could compare the string with the struct's name field.  However, if
+ * the key and elements in the array are of the same type, you can use
+ * the same comparison function for both sort() and asearch().
+ */
+#if HAVE_TYPEOF /* typed variant: result is cast to a pointer to *base's type */
+#define asearch(key, base, num, cmp)					\
+	((__typeof__(*(base))*)(bsearch((key), (base), (num), sizeof(*(base)), \
+		typesafe_cb_cast(int (*)(const void *, const void *),	\
+				 int (*)(const __typeof__(*(key)) *,	\
+					 const __typeof__(*(base)) *),	\
+				 (cmp)))))
+
+#else /* no typeof: cmp is cast unchecked and the result stays bsearch()'s void * */
+#define asearch(key, base, num, cmp)				\
+	(bsearch((key), (base), (num), sizeof(*(base)),		\
+		 (int (*)(const void *, const void *))(cmp)))
+#endif /* HAVE_TYPEOF */
+
+#endif /* CCAN_ASEARCH_H */
diff --git a/lib/ccan/asearch/test/compile_fail-return-value-const.c b/lib/ccan/asearch/test/compile_fail-return-value-const.c
new file mode 100644
index 0000000000..2edee93501
--- /dev/null
+++ b/lib/ccan/asearch/test/compile_fail-return-value-const.c
@@ -0,0 +1,25 @@
+#include <ccan/asearch/asearch.h>
+#include <ccan/array_size/array_size.h>
+#include <string.h>
+
+static int cmp(const char *key, const char *const *elem)
+{
+	return strcmp(key, *elem);	/* asearch() callback: key string vs. the string held in one array slot */
+}
+
+int main(void)
+{
+	const char key[] = "key";
+	const char *elems[] = { "a", "big", "list", "of", "things" };
+	/* FAIL build: p drops the const of elems' element type, so the asearch() assignment is expected not to compile. */
+#ifdef FAIL
+	char **p;
+#if !HAVE_TYPEOF
+#error "Unfortunately we don't fail if no typeof."
+#endif
+#else
+	const char **p;	/* matches the element type of elems */
+#endif
+	p = asearch(key, elems, ARRAY_SIZE(elems), cmp);
+	return p ? 0 : 1;	/* exit status 0 only when the key was found */
+}
diff --git a/lib/ccan/asearch/test/compile_fail-return-value.c b/lib/ccan/asearch/test/compile_fail-return-value.c
new file mode 100644
index 0000000000..4aef5327a8
--- /dev/null
+++ b/lib/ccan/asearch/test/compile_fail-return-value.c
@@ -0,0 +1,22 @@
+#include <ccan/asearch/asearch.h>
+
+static int cmp(const char *key, char *const *elem)
+{
+	return 0;	/* stub comparison: always reports a match; only the signature matters to this compile test */
+}
+
+int main(int argc, char **argv)
+{
+	const char key[] = "key";
+	/* FAIL build: int ** is not a pointer to argv's element type, so the asearch() assignment is expected not to compile. */
+#ifdef FAIL
+	int **p;
+#if !HAVE_TYPEOF
+#error "Unfortunately we don't fail if no typeof."
+#endif
+#else
+	char **p;	/* matches the element type of argv */
+#endif
+	p = asearch(key, argv+1, argc-1, cmp);	/* searches the arguments after argv[0] */
+	return p ? 0 : 1;	/* exit status 0 only when a match was returned */
+}
diff --git a/lib/ccan/asearch/test/run-strings.c b/lib/ccan/asearch/test/run-strings.c
new file mode 100644
index 0000000000..3ec453842f
--- /dev/null
+++ b/lib/ccan/asearch/test/run-strings.c
@@ -0,0 +1,22 @@
+#include <ccan/asearch/asearch.h>
+#include <ccan/array_size/array_size.h>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+
+static int cmp(const int *key, const char *const *elem)
+{
+	return *key - atoi(*elem);	/* key is an int; each element is a decimal string parsed with atoi() */
+}
+
+int main(void)
+{
+	const char *args[] = { "1", "4", "7", "9" };
+	int key = 7;
+	const char **p;
+	/* Single check: an int key must locate the string "7", which sits at index 2. */
+	plan_tests(1);
+	p = asearch(&key, args, ARRAY_SIZE(args), cmp);
+	ok1(p == &args[2]);
+
+	return exit_status();
+}
diff --git a/lib/ccan/asearch/test/run.c b/lib/ccan/asearch/test/run.c
new file mode 100644
index 0000000000..2a896fccfe
--- /dev/null
+++ b/lib/ccan/asearch/test/run.c
@@ -0,0 +1,40 @@
+#include <ccan/asearch/asearch.h>
+#include <ccan/array_size/array_size.h>
+#include <ccan/tap/tap.h>
+#include <limits.h>
+
+static int test_cmp(const int *key, const int *elt)
+{
+	if (*key < *elt)	/* compare rather than subtract: the test data includes INT_MIN and INT_MAX */
+		return -1;
+	else if (*key > *elt)
+		return 1;
+	return 0;
+}
+
+int main(void)
+{
+	const int arr[] = { INT_MIN, 0, 1, 2, 3, 4, 5, 6, INT_MAX };
+	unsigned int start, num, i, total = 0;
+	int key;
+	/* 285 = over every (start, num) window below: 1 miss check + 2 checks per element in the window. */
+	plan_tests(285);
+	/* NOTE(review): num stops one short of the remaining length, so the last element is never inside a window. */
+	for (start = 0; start < ARRAY_SIZE(arr); start++) {
+		for (num = 0; num < ARRAY_SIZE(arr) - start; num++) {
+			key = 7;	/* 7 does not appear in arr, so this search must miss */
+			ok1(asearch(&key, &arr[start], num, test_cmp) == NULL);
+			total++;
+			for (i = start; i < start+num; i++) {	/* every element of the window must be found */
+				const int *ret;
+				key = arr[i];
+				ret = asearch(&key, &arr[start], num, test_cmp);
+				ok1(ret);
+				ok1(ret && *ret == key);
+				total++;	/* counts searches, not individual ok1() checks */
+			}
+		}
+	}
+	diag("Tested %u searches\n", total);
+	return exit_status();
+}
diff --git a/lib/ccan/build_assert/LICENSE b/lib/ccan/build_assert/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/build_assert/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/build_assert/_info b/lib/ccan/build_assert/_info
new file mode 100644
index 0000000000..0906af07e6
--- /dev/null
+++ b/lib/ccan/build_assert/_info
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * build_assert - routines for build-time assertions
+ *
+ * This code provides routines which will cause compilation to fail should some
+ * assertion be untrue: such failures are preferable to run-time assertions,
+ * but much more limited since they can only depend on compile-time constants.
+ *
+ * These assertions are most useful when two parts of the code must be kept in
+ * sync: it is better to avoid such cases if possible, but second best is to
+ * detect invalid changes at build time.
+ *
+ * For example, a tricky piece of code might rely on a certain element being at
+ * the start of the structure.  To ensure that future changes don't break it,
+ * you would catch such changes in your code like so:
+ *
+ * Example:
+ *	#include <stddef.h>
+ *	#include <ccan/build_assert/build_assert.h>
+ *
+ *	struct foo {
+ *		char string[5];
+ *		int x;
+ *	};
+ *
+ *	static char *foo_string(struct foo *foo)
+ *	{
+ *		// This trick requires that the string be first in the structure
+ *		BUILD_ASSERT(offsetof(struct foo, string) == 0);
+ *		return (char *)foo;
+ *	}
+ *
+ * License: LGPL (2 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0)
+		/* Nothing. */
+		return 0;
+
+	return 1;
+}
diff --git a/lib/ccan/build_assert/build_assert.h b/lib/ccan/build_assert/build_assert.h
new file mode 100644
index 0000000000..24e59c44cd
--- /dev/null
+++ b/lib/ccan/build_assert/build_assert.h
@@ -0,0 +1,39 @@
+#ifndef CCAN_BUILD_ASSERT_H
+#define CCAN_BUILD_ASSERT_H
+
+/**
+ * BUILD_ASSERT - assert a build-time dependency.
+ * @cond: the compile-time condition which must be true.
+ *
+ * Your compile will fail if the condition isn't true, or can't be evaluated
+ * by the compiler.  This can only be used within a function.
+ *
+ * Example:
+ *	#include <stddef.h>
+ *	...
+ *	static char *foo_to_char(struct foo *foo)
+ *	{
+ *		// This code needs string to be at start of foo.
+ *		BUILD_ASSERT(offsetof(struct foo, string) == 0);
+ *		return (char *)foo;
+ *	}
+ */
+#define BUILD_ASSERT(cond) \
+	do { (void) sizeof(char [1 - 2*!(cond)]); } while(0)
+
+/**
+ * BUILD_ASSERT_OR_ZERO - assert a build-time dependency, as an expression.
+ * @cond: the compile-time condition which must be true.
+ *
+ * Your compile will fail if the condition isn't true, or can't be evaluated
+ * by the compiler.  This can be used in an expression: its value is "0".
+ *
+ * Example:
+ *	#define foo_to_char(foo)					\
+ *		 ((char *)(foo)						\
+ *		  + BUILD_ASSERT_OR_ZERO(offsetof(struct foo, string) == 0))
+ */
+#define BUILD_ASSERT_OR_ZERO(cond) \
+	(sizeof(char [1 - 2*!(cond)]) - 1)
+
+#endif /* CCAN_BUILD_ASSERT_H */
diff --git a/lib/ccan/build_assert/test/compile_fail-expr.c b/lib/ccan/build_assert/test/compile_fail-expr.c
new file mode 100644
index 0000000000..109215b8aa
--- /dev/null
+++ b/lib/ccan/build_assert/test/compile_fail-expr.c
@@ -0,0 +1,10 @@
+#include <ccan/build_assert/build_assert.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	return BUILD_ASSERT_OR_ZERO(1 == 0);
+#else
+	return 0;
+#endif
+}
diff --git a/lib/ccan/build_assert/test/compile_fail.c b/lib/ccan/build_assert/test/compile_fail.c
new file mode 100644
index 0000000000..37d95eddc9
--- /dev/null
+++ b/lib/ccan/build_assert/test/compile_fail.c
@@ -0,0 +1,9 @@
+#include <ccan/build_assert/build_assert.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	BUILD_ASSERT(1 == 0);
+#endif
+	return 0;
+}
diff --git a/lib/ccan/build_assert/test/compile_ok.c b/lib/ccan/build_assert/test/compile_ok.c
new file mode 100644
index 0000000000..4105484d1f
--- /dev/null
+++ b/lib/ccan/build_assert/test/compile_ok.c
@@ -0,0 +1,7 @@
+#include <ccan/build_assert/build_assert.h>
+
+int main(int argc, char *argv[])
+{
+	BUILD_ASSERT(1 == 1);
+	return 0;
+}
diff --git a/lib/ccan/build_assert/test/run-BUILD_ASSERT_OR_ZERO.c b/lib/ccan/build_assert/test/run-BUILD_ASSERT_OR_ZERO.c
new file mode 100644
index 0000000000..4185821331
--- /dev/null
+++ b/lib/ccan/build_assert/test/run-BUILD_ASSERT_OR_ZERO.c
@@ -0,0 +1,9 @@
+#include <ccan/build_assert/build_assert.h>
+#include <ccan/tap/tap.h>
+
+int main(int argc, char *argv[])
+{
+	plan_tests(1);
+	ok1(BUILD_ASSERT_OR_ZERO(1 == 1) == 0);
+	return exit_status();
+}
diff --git a/lib/ccan/build_assert/test/run-EXPR_BUILD_ASSERT.c b/lib/ccan/build_assert/test/run-EXPR_BUILD_ASSERT.c
new file mode 100644
index 0000000000..91bbbbbf75
--- /dev/null
+++ b/lib/ccan/build_assert/test/run-EXPR_BUILD_ASSERT.c
@@ -0,0 +1,9 @@
+#include <ccan/build_assert/build_assert.h>
+#include <ccan/tap/tap.h>
+
+int main(int argc, char *argv[])
+{
+	plan_tests(1);	/* a single check: a true condition must yield 0 */
+	ok1(BUILD_ASSERT_OR_ZERO(1 == 1) == 0);	/* only macro build_assert.h has */
+	return exit_status();
+}
diff --git a/lib/ccan/cast/LICENSE b/lib/ccan/cast/LICENSE
new file mode 100644
index 0000000000..cca7fc278f
--- /dev/null
+++ b/lib/ccan/cast/LICENSE
@@ -0,0 +1,165 @@
+		   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/lib/ccan/cast/_info b/lib/ccan/cast/_info
new file mode 100644
index 0000000000..5f82a05b8f
--- /dev/null
+++ b/lib/ccan/cast/_info
@@ -0,0 +1,85 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * cast - routines for safer casting.
+ *
+ * Often you want to cast in a limited way, such as removing a const or
+ * switching between integer types.  However, normal casts will work on
+ * almost any type, making them dangerous when the code changes.
+ *
+ * These C++-inspired macros serve two purposes: they make it clear the
+ * exact reason for the cast, and they also (with some compilers) cause
+ * errors when misused.
+ *
+ * Based on Jan Engelhardt's libHX macros: http://libhx.sourceforge.net/
+ *
+ * Author: Jan Engelhardt
+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
+ * License: LGPL
+ *
+ * Example:
+ *	// Given "test" contains "3 t's in 'test string'
+ *	#include <ccan/cast/cast.h>
+ *	#include <stdint.h>
+ *	#include <stdio.h>
+ *
+ *	// Find char @orig in @str, if @repl, replace them.  Return number.
+ *	static size_t find_chars(char *str, char orig, char repl)
+ *	{
+ *		size_t i, count = 0;
+ *		for (i = 0; str[i]; i++) {
+ *			if (str[i] == orig) {
+ *				count++;
+ *				if (repl)
+ *					str[i] = repl;
+ *			}
+ *		}
+ *		return count;
+ *	}
+ *
+ *	// Terrible hash function.
+ *	static uint64_t hash_string(const unsigned char *str)
+ *	{
+ *		size_t i;
+ *		uint64_t hash = 0;
+ *		for (i = 0; str[i]; i++)
+ *			hash += str[i];
+ *		return hash;
+ *	}
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		uint64_t hash;
+ *
+ *		// find_chars wants a non-const string, but doesn't
+ *		// need it if repl == 0.
+ *		printf("%zu %c's in 'test string'\n",
+ *		       find_chars(cast_const(char *, "test string"),
+ *				  argv[1][0], 0),
+ *		       argv[1][0]);
+ *
+ *		// hash_string wants an unsigned char.
+ *		hash = hash_string(cast_signed(unsigned char *, argv[1]));
+ *
+ *		// Need a long long to hand to printf.
+ *		printf("Hash of '%s' = %llu\n", argv[1],
+ *		       cast_static(unsigned long long, hash));
+ *		return 0;
+ *	}
+ *
+ */
+int main(int argc, char *argv[])
+{
+	/* Expect exactly one argument */
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/build_assert\n");
+		return 0;
+	}
+
+	return 1;
+}
diff --git a/lib/ccan/cast/cast.h b/lib/ccan/cast/cast.h
new file mode 100644
index 0000000000..daebd85723
--- /dev/null
+++ b/lib/ccan/cast/cast.h
@@ -0,0 +1,129 @@
+#ifndef CCAN_CAST_H
+#define CCAN_CAST_H
+#include "config.h"
+#include <stdint.h>
+#include <ccan/build_assert/build_assert.h>
+
+/**
+ * cast_signed - cast a (const) char * to/from (const) signed/unsigned char *.
+ * @type: some char * variant.
+ * @expr: expression (of some char * variant) to cast.
+ *
+ * Some libraries insist on an unsigned char in various places; cast_signed
+ * makes sure (with suitable compiler) that the expression you are casting
+ * only differs in signed/unsigned, not in type or const-ness.
+ */
+#define cast_signed(type, expr)						\
+	((type)(expr)							\
+	 + BUILD_ASSERT_OR_ZERO(cast_sign_compatible(type, (expr))))
+
+/**
+ * cast_const - remove a const qualifier from a pointer.
+ * @type: some pointer type.
+ * @expr: expression to cast.
+ *
+ * This ensures that you are only removing the const qualifier from an
+ * expression.  The expression must otherwise match @type.
+ *
+ * If @type is a pointer to a pointer, you must use cast_const2 (etc).
+ *
+ * Example:
+ *	// Dumb open-coded strstr variant.
+ *	static char *find_needle(const char *haystack)
+ *	{
+ *		size_t i;
+ *		for (i = 0; i < strlen(haystack); i++)
+ *		if (memcmp("needle", haystack+i, strlen("needle")) == 0)
+ *			return cast_const(char *, haystack+i);
+ *		return NULL;
+ *	}
+ */
+#define cast_const(type, expr)						\
+	((type)((intptr_t)(expr)					\
+		+ BUILD_ASSERT_OR_ZERO(cast_const_compat1((expr), type))))
+
+/**
+ * cast_const2 - remove a const qualifier from a pointer to a pointer.
+ * @type: some pointer to pointer type.
+ * @expr: expression to cast.
+ *
+ * This ensures that you are only removing the const qualifier from an
+ * expression.  The expression must otherwise match @type.
+ */
+#define cast_const2(type, expr)						\
+	((type)((intptr_t)(expr)					\
+		+ BUILD_ASSERT_OR_ZERO(cast_const_compat2((expr), type))))
+
+/**
+ * cast_const3 - remove a const from a pointer to a pointer to a pointer.
+ * @type: some pointer to pointer to pointer type.
+ * @expr: expression to cast.
+ *
+ * This ensures that you are only removing the const qualifier from an
+ * expression.  The expression must otherwise match @type.
+ */
+#define cast_const3(type, expr)						\
+	((type)((intptr_t)(expr)					\
+		+ BUILD_ASSERT_OR_ZERO(cast_const_compat3((expr), type))))
+
+
+/**
+ * cast_static - explicit mimic of implicit cast.
+ * @type: some type.
+ * @expr: expression to cast.
+ *
+ * This ensures (where compound literals are available) that the cast is one
+ * the compiler would allow implicitly, such as a pointer to a similar const
+ * pointer, or between integral types.
+ */
+#if HAVE_COMPOUND_LITERALS
+#define cast_static(type, expr)			\
+	((struct { type x; }){(expr)}.x)
+#else
+#define cast_static(type, expr)			\
+	((type)(expr))
+#endif
+
+/* Herein lies the gcc magic to evoke compile errors. */
+#if HAVE_BUILTIN_CHOOSE_EXPR && HAVE_BUILTIN_TYPES_COMPATIBLE_P && HAVE_TYPEOF
+#define cast_sign_compatible(t, e) \
+  __builtin_choose_expr(						\
+	  __builtin_types_compatible_p(__typeof__(t), char *) ||	\
+	  __builtin_types_compatible_p(__typeof__(t), signed char *) || \
+	  __builtin_types_compatible_p(__typeof__(t), unsigned char *), \
+	  /* if type is not const qualified */				\
+	  __builtin_types_compatible_p(__typeof__(e), char *) ||	\
+	  __builtin_types_compatible_p(__typeof__(e), signed char *) || \
+	  __builtin_types_compatible_p(__typeof__(e), unsigned char *), \
+	  /* and if it is... */						\
+	  __builtin_types_compatible_p(__typeof__(e), const char *) ||	\
+	  __builtin_types_compatible_p(__typeof__(e), const signed char *) || \
+	  __builtin_types_compatible_p(__typeof__(e), const unsigned char *) ||\
+	  __builtin_types_compatible_p(__typeof__(e), char *) ||	\
+	  __builtin_types_compatible_p(__typeof__(e), signed char *) ||	\
+	  __builtin_types_compatible_p(__typeof__(e), unsigned char *)	\
+	  )
+
+#define cast_const_strip1(expr)			\
+	__typeof__(*(struct { int z; __typeof__(expr) x; }){0}.x)
+#define cast_const_strip2(expr) \
+	__typeof__(**(struct { int z; __typeof__(expr) x; }){0}.x)
+#define cast_const_strip3(expr) \
+	__typeof__(***(struct { int z; __typeof__(expr) x; }){0}.x)
+#define cast_const_compat1(expr, type)					\
+	__builtin_types_compatible_p(cast_const_strip1(expr),		\
+				     cast_const_strip1(type))
+#define cast_const_compat2(expr, type)					\
+	__builtin_types_compatible_p(cast_const_strip2(expr),		\
+				     cast_const_strip2(type))
+#define cast_const_compat3(expr, type)					\
+	__builtin_types_compatible_p(cast_const_strip3(expr),		\
+				     cast_const_strip3(type))
+#else
+#define cast_sign_compatible(type, expr)		\
+	(sizeof(*(type)0) == 1 && sizeof(*(expr)) == 1)
+#define cast_const_compat1(expr, type)		(1)
+#define cast_const_compat2(expr, type)		(1)
+#define cast_const_compat3(expr, type)		(1)
+#endif
+#endif /* CCAN_CAST_H */
diff --git a/lib/ccan/cast/test/compile_fail-cast_const.c b/lib/ccan/cast/test/compile_fail-cast_const.c
new file mode 100644
index 0000000000..277f3de1c4
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_const.c
@@ -0,0 +1,29 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+/* Note: this *isn't* sizeof(char) on all platforms. */
+struct char_struct {
+	char c;
+};
+
+int main(int argc, char *argv[])
+{
+	char *uc;
+	const
+#ifdef FAIL
+		struct char_struct
+#else
+		char
+#endif
+		*p = NULL;
+
+	uc = cast_const(char *, p);
+	(void) uc; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
+
+#ifdef FAIL
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_const can only use size"
+#endif
+#endif
diff --git a/lib/ccan/cast/test/compile_fail-cast_const2.c b/lib/ccan/cast/test/compile_fail-cast_const2.c
new file mode 100644
index 0000000000..e671e88eda
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_const2.c
@@ -0,0 +1,29 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+/* Note: this *isn't* sizeof(char) on all platforms. */
+struct char_struct {
+	char c;
+};
+
+int main(int argc, char *argv[])
+{
+	char **uc;
+	const
+#ifdef FAIL
+		struct char_struct
+#else
+		char
+#endif
+		**p = NULL;
+
+	uc = cast_const2(char **, p);
+	(void) uc; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
+
+#ifdef FAIL
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_const can only use size"
+#endif
+#endif
diff --git a/lib/ccan/cast/test/compile_fail-cast_const3.c b/lib/ccan/cast/test/compile_fail-cast_const3.c
new file mode 100644
index 0000000000..e958e2dde5
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_const3.c
@@ -0,0 +1,29 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+/* Note: this *isn't* sizeof(char) on all platforms. */
+struct char_struct {
+	char c;
+};
+
+int main(int argc, char *argv[])
+{
+	char ***uc;
+	const
+#ifdef FAIL
+		struct char_struct
+#else
+		char
+#endif
+		***p = NULL;
+
+	uc = cast_const3(char ***, p);
+	(void) uc; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
+
+#ifdef FAIL
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_const can only use size"
+#endif
+#endif
diff --git a/lib/ccan/cast/test/compile_fail-cast_signed-const.c b/lib/ccan/cast/test/compile_fail-cast_signed-const.c
new file mode 100644
index 0000000000..9971dc8eb3
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_signed-const.c
@@ -0,0 +1,22 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[])
+{
+	unsigned char *uc;
+#ifdef FAIL
+	const
+#endif
+	char
+		*p = NULL;
+
+	uc = cast_signed(unsigned char *, p);
+	(void) uc; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
+
+#ifdef FAIL
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_signed can only use size"
+#endif
+#endif
diff --git a/lib/ccan/cast/test/compile_fail-cast_signed-sizesame.c b/lib/ccan/cast/test/compile_fail-cast_signed-sizesame.c
new file mode 100644
index 0000000000..2bc40b2f46
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_signed-sizesame.c
@@ -0,0 +1,29 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+/* Note: this *isn't* sizeof(char) on all platforms. */
+struct char_struct {
+	char c;
+};
+
+int main(int argc, char *argv[])
+{
+	unsigned char *uc;
+#ifdef FAIL
+	struct char_struct
+#else
+	char
+#endif
+		*p = NULL;
+
+	uc = cast_signed(unsigned char *, p);
+
+	(void) uc; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
+
+#ifdef FAIL
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_signed can only use size"
+#endif
+#endif
diff --git a/lib/ccan/cast/test/compile_fail-cast_signed.c b/lib/ccan/cast/test/compile_fail-cast_signed.c
new file mode 100644
index 0000000000..66bcc0a1b5
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_signed.c
@@ -0,0 +1,17 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[])
+{
+	unsigned char *uc;
+#ifdef FAIL
+	int
+#else
+	char
+#endif
+		*p = NULL;
+
+	uc = cast_signed(unsigned char *, p);
+	(void) uc; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
diff --git a/lib/ccan/cast/test/compile_fail-cast_static-2.c b/lib/ccan/cast/test/compile_fail-cast_static-2.c
new file mode 100644
index 0000000000..8a12025384
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_static-2.c
@@ -0,0 +1,23 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[])
+{
+	char *c;
+#ifdef FAIL
+	long
+#else
+	char
+#endif
+		*p = 0;
+
+	c = cast_static(char *, p);
+	(void) c; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
+
+#ifdef FAIL
+#if !HAVE_COMPOUND_LITERALS
+#error "Unfortunately we don't fail if cast_static is a noop"
+#endif
+#endif
diff --git a/lib/ccan/cast/test/compile_fail-cast_static-3.c b/lib/ccan/cast/test/compile_fail-cast_static-3.c
new file mode 100644
index 0000000000..6296b75276
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_static-3.c
@@ -0,0 +1,21 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[])
+{
+	char *c;
+#ifdef FAIL
+	const
+#endif
+		char *p = 0;
+
+	c = cast_static(char *, p);
+	(void) c; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
+
+#ifdef FAIL
+#if !HAVE_COMPOUND_LITERALS
+#error "Unfortunately we don't fail if cast_static is a noop"
+#endif
+#endif
diff --git a/lib/ccan/cast/test/compile_fail-cast_static.c b/lib/ccan/cast/test/compile_fail-cast_static.c
new file mode 100644
index 0000000000..0f9e478047
--- /dev/null
+++ b/lib/ccan/cast/test/compile_fail-cast_static.c
@@ -0,0 +1,17 @@
+#include <ccan/cast/cast.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[])
+{
+	char c;
+#ifdef FAIL
+	char * /* FAIL build: x is declared as a char pointer */
+#else
+	long /* default build: x is declared as long */
+#endif
+		x = 0;
+
+	c = cast_static(char, x); /* NOTE(review): file name implies this line must not compile under FAIL - confirm against cast.h */
+	(void) c; /* Suppress unused-but-set-variable warning. */
+	return 0;
+}
diff --git a/lib/ccan/cast/test/compile_ok-cast_void.c b/lib/ccan/cast/test/compile_ok-cast_void.c
new file mode 100644
index 0000000000..c649d283b3
--- /dev/null
+++ b/lib/ccan/cast/test/compile_ok-cast_void.c
@@ -0,0 +1,12 @@
+#include <ccan/cast/cast.h>
+
+static void *remove_void(const void *p) /* returns p with its const qualifier removed */
+{
+	return cast_const(void *, p); /* exercises cast_const with void * as the target type */
+}
+
+int main(void)
+{
+	void *p = remove_void("foo"); /* a string literal supplies the pointer to convert */
+	return !p; /* exit status 0 when the returned pointer is non-NULL */
+}
diff --git a/lib/ccan/compiler/LICENSE b/lib/ccan/compiler/LICENSE
new file mode 100644
index 0000000000..cca7fc278f
--- /dev/null
+++ b/lib/ccan/compiler/LICENSE
@@ -0,0 +1,165 @@
+		   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/lib/ccan/compiler/_info b/lib/ccan/compiler/_info
new file mode 100644
index 0000000000..c55ba22f08
--- /dev/null
+++ b/lib/ccan/compiler/_info
@@ -0,0 +1,64 @@
+#include <string.h>
+#include <stdio.h>
+#include "config.h"
+
+/**
+ * compiler - macros for common compiler extensions
+ *
+ * Abstracts away some compiler hints.  Currently these include:
+ * - COLD
+ *	For functions not called in fast paths (aka. cold functions)
+ * - PRINTF_FMT
+ *	For functions which take printf-style parameters.
+ * - IDEMPOTENT
+ *	For functions which return the same value for the same parameters.
+ * - NEEDED
+ *	For functions and variables which must be emitted even if unused.
+ * - UNNEEDED
+ *	For functions and variables which need not be emitted if unused.
+ * - UNUSED
+ *	For parameters which are not used.
+ * - IS_COMPILE_CONSTANT
+ *	For using different tradeoffs for compiletime vs runtime evaluation.
+ *
+ * License: LGPL (3 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Example:
+ *	#include <ccan/compiler/compiler.h>
+ *	#include <stdio.h>
+ *	#include <stdarg.h>
+ *
+ *	// Example of a (slow-path) logging function.
+ *	static int log_threshold = 2;
+ *	static void COLD PRINTF_FMT(2,3)
+ *		logger(int level, const char *fmt, ...)
+ *	{
+ *		va_list ap;
+ *		va_start(ap, fmt);
+ *		if (level >= log_threshold)
+ *			vfprintf(stderr, fmt, ap);
+ *		va_end(ap);
+ *	}
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		if (argc != 1) {
+ *			logger(3, "Don't want %i arguments!\n", argc-1);
+ *			return 1;
+ *		}
+ *		return 0;
+ *	}
+ */
+int main(int argc, char *argv[])
+{
+	/* Expect exactly one argument (argc also counts the program name) */
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		return 0; /* "depends" is accepted; nothing is printed for it */
+	}
+
+	return 1; /* any other argument is rejected */
+}
diff --git a/lib/ccan/compiler/compiler.h b/lib/ccan/compiler/compiler.h
new file mode 100644
index 0000000000..74e0f1835c
--- /dev/null
+++ b/lib/ccan/compiler/compiler.h
@@ -0,0 +1,216 @@
+#ifndef CCAN_COMPILER_H
+#define CCAN_COMPILER_H
+#include "config.h"
+
+#ifndef COLD
+#if HAVE_ATTRIBUTE_COLD
+/**
+ * COLD - a function is unlikely to be called.
+ *
+ * Used to mark an unlikely code path and optimize appropriately.
+ * It is usually used on logging or error routines.
+ *
+ * Example:
+ * static void COLD moan(const char *reason)
+ * {
+ *	fprintf(stderr, "Error: %s (%s)\n", reason, strerror(errno));
+ * }
+ */
+#define COLD __attribute__((cold))
+#else /* !HAVE_ATTRIBUTE_COLD: COLD expands to nothing */
+#define COLD
+#endif /* HAVE_ATTRIBUTE_COLD */
+#endif /* COLD */
+
+#ifndef NORETURN
+#if HAVE_ATTRIBUTE_NORETURN
+/**
+ * NORETURN - a function does not return
+ *
+ * Used to mark a function which exits; useful for suppressing warnings.
+ *
+ * Example:
+ * static void NORETURN fail(const char *reason)
+ * {
+ *	fprintf(stderr, "Error: %s (%s)\n", reason, strerror(errno));
+ *	exit(1);
+ * }
+ */
+#define NORETURN __attribute__((noreturn))
+#else /* !HAVE_ATTRIBUTE_NORETURN: NORETURN expands to nothing */
+#define NORETURN
+#endif /* HAVE_ATTRIBUTE_NORETURN */
+#endif /* NORETURN */
+
+#ifndef PRINTF_FMT
+#if HAVE_ATTRIBUTE_PRINTF
+/**
+ * PRINTF_FMT - a function takes printf-style arguments
+ * @nfmt: the 1-based number of the function's format argument.
+ * @narg: the 1-based number of the function's first variable argument.
+ *
+ * This allows the compiler to check your parameters as it does for printf().
+ *
+ * Example:
+ * void PRINTF_FMT(2,3) my_printf(const char *prefix, const char *fmt, ...);
+ */
+#define PRINTF_FMT(nfmt, narg) \
+	__attribute__((format(__printf__, nfmt, narg)))
+#else /* !HAVE_ATTRIBUTE_PRINTF: PRINTF_FMT() expands to nothing */
+#define PRINTF_FMT(nfmt, narg)
+#endif /* HAVE_ATTRIBUTE_PRINTF */
+#endif /* PRINTF_FMT */
+
+#ifndef IDEMPOTENT
+#if HAVE_ATTRIBUTE_CONST
+/**
+ * IDEMPOTENT - a function's return depends only on its arguments
+ *
+ * This allows the compiler to assume that the function will return the exact
+ * same value for the exact same arguments.  This implies that the function
+ * must not use global variables, or dereference pointer arguments.
+ */
+#define IDEMPOTENT __attribute__((const))
+#else /* !HAVE_ATTRIBUTE_CONST: IDEMPOTENT expands to nothing */
+#define IDEMPOTENT
+#endif /* HAVE_ATTRIBUTE_CONST */
+#endif /* IDEMPOTENT */
+
+#if HAVE_ATTRIBUTE_UNUSED
+#ifndef UNNEEDED
+/**
+ * UNNEEDED - a variable/function may not be needed
+ *
+ * This suppresses warnings about unused variables or functions, but tells
+ * the compiler that if it is unused it need not emit it into the object code.
+ *
+ * Example:
+ * // With some preprocessor options, this is unnecessary.
+ * static UNNEEDED int counter;
+ *
+ * // With some preprocessor options, this is unnecessary.
+ * static UNNEEDED void add_to_counter(int add)
+ * {
+ *	counter += add;
+ * }
+ */
+#define UNNEEDED __attribute__((unused))
+#endif /* UNNEEDED */
+
+#ifndef NEEDED
+#if HAVE_ATTRIBUTE_USED
+/**
+ * NEEDED - a variable/function is needed
+ *
+ * This suppresses warnings about unused variables or functions, but tells
+ * the compiler that it must exist even if it (seems) unused.
+ *
+ * Example:
+ *	// Even if this is unused, these are vital for debugging.
+ *	static NEEDED int counter;
+ *	static NEEDED void dump_counter(void)
+ *	{
+ *		printf("Counter is %i\n", counter);
+ *	}
+ */
+#define NEEDED __attribute__((used))
+#else /* !HAVE_ATTRIBUTE_USED */
+/* Before the "used" attribute existed, unused functions and vars were always emitted. */
+#define NEEDED __attribute__((unused))
+#endif /* HAVE_ATTRIBUTE_USED */
+#endif /* NEEDED */
+
+#ifndef UNUSED
+/**
+ * UNUSED - a parameter is unused
+ *
+ * Some compilers (eg. gcc with -W or -Wunused) warn about unused
+ * function parameters.  This suppresses such warnings and indicates
+ * to the reader that it's deliberate.
+ *
+ * Example:
+ *	// This is used as a callback, so needs to have this prototype.
+ *	static int some_callback(void *unused UNUSED)
+ *	{
+ *		return 0;
+ *	}
+ */
+#define UNUSED __attribute__((unused))
+#endif /* UNUSED */
+#else
+#ifndef UNNEEDED
+#define UNNEEDED /* no attribute support: expands to nothing */
+#endif
+#ifndef NEEDED
+#define NEEDED /* no attribute support: expands to nothing */
+#endif
+#ifndef UNUSED
+#define UNUSED /* no attribute support: expands to nothing */
+#endif
+#endif
+
+#ifndef IS_COMPILE_CONSTANT
+#if HAVE_BUILTIN_CONSTANT_P
+/**
+ * IS_COMPILE_CONSTANT - does the compiler know the value of this expression?
+ * @expr: the expression to evaluate
+ *
+ * When an expression manipulation is complicated, it is usually better to
+ * implement it in a function.  However, if the expression being manipulated is
+ * known at compile time, it is better to have the compiler see the entire
+ * expression so it can simply substitute the result.
+ *
+ * This can be done using the IS_COMPILE_CONSTANT() macro.
+ *
+ * Example:
+ *	enum greek { ALPHA, BETA, GAMMA, DELTA, EPSILON };
+ *
+ *	// Out-of-line version.
+ *	const char *greek_name(enum greek greek);
+ *
+ *	// Inline version.
+ *	static inline const char *_greek_name(enum greek greek)
+ *	{
+ *		switch (greek) {
+ *		case ALPHA: return "alpha";
+ *		case BETA: return "beta";
+ *		case GAMMA: return "gamma";
+ *		case DELTA: return "delta";
+ *		case EPSILON: return "epsilon";
+ *		default: return "**INVALID**";
+ *		}
+ *	}
+ *
+ *	// Use inline if compiler knows answer.  Otherwise call function
+ *	// to avoid copies of the same code everywhere.
+ *	#define greek_name(g)						\
+ *		 (IS_COMPILE_CONSTANT(g) ? _greek_name(g) : greek_name(g))
+ */
+#define IS_COMPILE_CONSTANT(expr) __builtin_constant_p(expr)
+#else /* !HAVE_BUILTIN_CONSTANT_P */
+/* If we don't know, assume it's not. */
+#define IS_COMPILE_CONSTANT(expr) 0
+#endif /* HAVE_BUILTIN_CONSTANT_P */
+#endif /* IS_COMPILE_CONSTANT */
+
+#ifndef WARN_UNUSED_RESULT
+#if HAVE_WARN_UNUSED_RESULT
+/**
+ * WARN_UNUSED_RESULT - warn if a function return value is unused.
+ *
+ * Used to mark a function where it is extremely unlikely that the caller
+ * can safely ignore the result, eg realloc().
+ *
+ * Example:
+ * // buf param may be freed by this; need return value!
+ * static char *WARN_UNUSED_RESULT enlarge(char *buf, unsigned *size)
+ * {
+ *	return realloc(buf, (*size) *= 2);
+ * }
+ */
+#define WARN_UNUSED_RESULT __attribute__((warn_unused_result))
+#else /* !HAVE_WARN_UNUSED_RESULT: WARN_UNUSED_RESULT expands to nothing */
+#define WARN_UNUSED_RESULT
+#endif /* HAVE_WARN_UNUSED_RESULT */
+#endif /* WARN_UNUSED_RESULT */
+#endif /* CCAN_COMPILER_H */
diff --git a/lib/ccan/compiler/test/compile_fail-printf.c b/lib/ccan/compiler/test/compile_fail-printf.c
new file mode 100644
index 0000000000..8f34ae5a12
--- /dev/null
+++ b/lib/ccan/compiler/test/compile_fail-printf.c
@@ -0,0 +1,22 @@
+#include <ccan/compiler/compiler.h>
+
+static void PRINTF_FMT(2,3) my_printf(int x, const char *fmt, ...) /* fmt is argument 2, varargs start at 3 */
+{
+	/* Empty body: the function only carries the PRINTF_FMT(2,3) annotation. */
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i = 0;
+
+	my_printf(1, "Not a pointer "
+#ifdef FAIL
+		  "%p", /* FAIL build: %p is paired with the unsigned int i */
+#if !HAVE_ATTRIBUTE_PRINTF
+#error "Unfortunately we don't fail if !HAVE_ATTRIBUTE_PRINTF."
+#endif /* !HAVE_ATTRIBUTE_PRINTF */
+#else
+		  "%i", /* default build: %i is paired with the unsigned int i */
+#endif /* FAIL */
+		  i);
+	return 0;
+}
diff --git a/lib/ccan/compiler/test/run-is_compile_constant.c b/lib/ccan/compiler/test/run-is_compile_constant.c
new file mode 100644
index 0000000000..a66f2e13e6
--- /dev/null
+++ b/lib/ccan/compiler/test/run-is_compile_constant.c
@@ -0,0 +1,15 @@
+#include <ccan/compiler/compiler.h>
+#include <ccan/tap/tap.h>
+
+int main(int argc, char *argv[])
+{
+	plan_tests(2); /* two checks run below, whichever branch is compiled */
+
+	ok1(!IS_COMPILE_CONSTANT(argc)); /* argc is only known at run time */
+#if HAVE_BUILTIN_CONSTANT_P
+	ok1(IS_COMPILE_CONSTANT(7)); /* an integer literal must be reported as constant */
+#else
+	pass("If !HAVE_BUILTIN_CONSTANT_P, IS_COMPILE_CONSTANT always false");
+#endif /* HAVE_BUILTIN_CONSTANT_P */
+	return exit_status();
+}
diff --git a/lib/ccan/endian/LICENSE b/lib/ccan/endian/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/endian/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/endian/_info b/lib/ccan/endian/_info
new file mode 100644
index 0000000000..5d4d65ff74
--- /dev/null
+++ b/lib/ccan/endian/_info
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * endian - endian conversion macros for simple types
+ *
+ * Portable protocols (such as on-disk formats, or network protocols)
+ * are often defined to be a particular endian: little-endian (least
+ * significant bytes first) or big-endian (most significant bytes
+ * first).
+ *
+ * Similarly, some CPUs lay out values in memory in little-endian
+ * order (most commonly, Intel's 8086 and derivatives), or big-endian
+ * order (almost everyone else).
+ *
+ * This module provides conversion routines, inspired by the linux kernel.
+ *
+ * Example:
+ *	#include <stdio.h>
+ *	#include <err.h>
+ *	#include <ccan/endian/endian.h>
+ *
+ *	//
+ *	int main(int argc, char *argv[])
+ *	{
+ *		uint32_t value;
+ *
+ *		if (argc != 2)
+ *			errx(1, "Usage: %s <value>", argv[0]);
+ *
+ *		value = atoi(argv[1]);
+ *		printf("native:        %08x\n", value);
+ *		printf("little-endian: %08x\n", cpu_to_le32(value));
+ *		printf("big-endian:    %08x\n", cpu_to_be32(value));
+ *		printf("byte-reversed: %08x\n", bswap_32(value));
+ *		exit(0);
+ *	}
+ *
+ * License: LGPL (2 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)	/* exactly one argument is required */
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0)
+		/* Nothing is printed for "depends"; just report success. */
+		return 0;
+
+	return 1;	/* any other argument is rejected */
+}
diff --git a/lib/ccan/endian/endian.h b/lib/ccan/endian/endian.h
new file mode 100644
index 0000000000..baee60be14
--- /dev/null
+++ b/lib/ccan/endian/endian.h
@@ -0,0 +1,226 @@
+#ifndef CCAN_ENDIAN_H
+#define CCAN_ENDIAN_H
+#include <stdint.h>
+#include "config.h"
+
+#if HAVE_BYTESWAP_H
+#include <byteswap.h>
+#else
+/**
+ * bswap_16 - reverse bytes in a uint16_t value (fallback without byteswap.h).
+ * @val: value whose bytes to swap.
+ *
+ * Example:
+ *	// Output contains "1024 is 4 as two bytes reversed"
+ *	printf("1024 is %u as two bytes reversed\n", bswap_16(1024));
+ */
+static inline uint16_t bswap_16(uint16_t val)
+{
+	return ((val & (uint16_t)0x00ffU) << 8)	/* bits 0-7 -> bits 8-15 */
+		| ((val & (uint16_t)0xff00U) >> 8);	/* bits 8-15 -> bits 0-7 */
+}
+
+/**
+ * bswap_32 - reverse bytes in a uint32_t value (fallback without byteswap.h).
+ * @val: value whose bytes to swap.
+ *
+ * Example:
+ *	// Output contains "1024 is 262144 as four bytes reversed"
+ *	printf("1024 is %u as four bytes reversed\n", bswap_32(1024));
+ */
+static inline uint32_t bswap_32(uint32_t val)
+{
+	return ((val & (uint32_t)0x000000ffUL) << 24)	/* bits 0-7 -> bits 24-31 */
+		| ((val & (uint32_t)0x0000ff00UL) <<  8)	/* bits 8-15 -> bits 16-23 */
+		| ((val & (uint32_t)0x00ff0000UL) >>  8)	/* bits 16-23 -> bits 8-15 */
+		| ((val & (uint32_t)0xff000000UL) >> 24);	/* bits 24-31 -> bits 0-7 */
+}
+#endif /* !HAVE_BYTESWAP_H */
+
+#if !HAVE_BSWAP_64
+/**
+ * bswap_64 - reverse bytes in a uint64_t value (used when HAVE_BSWAP_64 is unset).
+ * @val: value whose bytes to swap.
+ *
+ * Example:
+ *	// Output contains "1024 is 1125899906842624 as eight bytes reversed"
+ *	printf("1024 is %llu as eight bytes reversed\n",
+ *		(unsigned long long)bswap_64(1024));
+ */
+static inline uint64_t bswap_64(uint64_t val)
+{
+	/* Bytes are mirrored pairwise: 0<->7, 1<->6, 2<->5, 3<->4 (0 = least significant). */
+	return ((val & (uint64_t)0x00000000000000ffULL) << 56)
+		| ((val & (uint64_t)0x000000000000ff00ULL) << 40)
+		| ((val & (uint64_t)0x0000000000ff0000ULL) << 24)
+		| ((val & (uint64_t)0x00000000ff000000ULL) <<  8)
+		| ((val & (uint64_t)0x000000ff00000000ULL) >>  8)
+		| ((val & (uint64_t)0x0000ff0000000000ULL) >> 24)
+		| ((val & (uint64_t)0x00ff000000000000ULL) >> 40)
+		| ((val & (uint64_t)0xff00000000000000ULL) >> 56);
+}
+#endif
+
+/* Sanity check the defines.  We don't handle weird endianness. */
+#if !HAVE_LITTLE_ENDIAN && !HAVE_BIG_ENDIAN
+#error "Unknown endian"
+#elif HAVE_LITTLE_ENDIAN && HAVE_BIG_ENDIAN
+#error "Can't compile for both big and little endian."
+#endif
+
+/**
+ * cpu_to_le64 - convert a native-order uint64_t value to little-endian
+ * @native: value to convert; returned unchanged if HAVE_LITTLE_ENDIAN
+ */
+static inline uint64_t cpu_to_le64(uint64_t native)
+{
+#if HAVE_LITTLE_ENDIAN
+	return native;
+#else	/* HAVE_BIG_ENDIAN, per the sanity check earlier in this header */
+	return bswap_64(native);
+#endif
+}
+
+/**
+ * cpu_to_le32 - convert a native-order uint32_t value to little-endian
+ * @native: value to convert; returned unchanged if HAVE_LITTLE_ENDIAN
+ */
+static inline uint32_t cpu_to_le32(uint32_t native)
+{
+#if HAVE_LITTLE_ENDIAN
+	return native;
+#else	/* HAVE_BIG_ENDIAN, per the sanity check earlier in this header */
+	return bswap_32(native);
+#endif
+}
+
+/**
+ * cpu_to_le16 - convert a native-order uint16_t value to little-endian
+ * @native: value to convert; returned unchanged if HAVE_LITTLE_ENDIAN
+ */
+static inline uint16_t cpu_to_le16(uint16_t native)
+{
+#if HAVE_LITTLE_ENDIAN
+	return native;
+#else	/* HAVE_BIG_ENDIAN, per the sanity check earlier in this header */
+	return bswap_16(native);
+#endif
+}
+
+/**
+ * le64_to_cpu - convert a little-endian uint64_t value to native order
+ * @le_val: little-endian value; returned unchanged if HAVE_LITTLE_ENDIAN
+ */
+static inline uint64_t le64_to_cpu(uint64_t le_val)
+{
+#if HAVE_LITTLE_ENDIAN
+	return le_val;
+#else	/* HAVE_BIG_ENDIAN, per the sanity check earlier in this header */
+	return bswap_64(le_val);
+#endif
+}
+
+/**
+ * le32_to_cpu - convert a little-endian uint32_t value to native order
+ * @le_val: little-endian value; returned unchanged if HAVE_LITTLE_ENDIAN
+ */
+static inline uint32_t le32_to_cpu(uint32_t le_val)
+{
+#if HAVE_LITTLE_ENDIAN
+	return le_val;
+#else	/* HAVE_BIG_ENDIAN, per the sanity check earlier in this header */
+	return bswap_32(le_val);
+#endif
+}
+
+/**
+ * le16_to_cpu - convert a little-endian uint16_t value to native order
+ * @le_val: little-endian value; returned unchanged if HAVE_LITTLE_ENDIAN
+ */
+static inline uint16_t le16_to_cpu(uint16_t le_val)
+{
+#if HAVE_LITTLE_ENDIAN
+	return le_val;
+#else	/* HAVE_BIG_ENDIAN, per the sanity check earlier in this header */
+	return bswap_16(le_val);
+#endif
+}
+
+/**
+ * cpu_to_be64 - convert a native-order uint64_t value to big endian.
+ * @native: value to convert; byte-swapped only if HAVE_LITTLE_ENDIAN
+ */
+static inline uint64_t cpu_to_be64(uint64_t native)
+{
+#if HAVE_LITTLE_ENDIAN
+	return bswap_64(native);
+#else	/* HAVE_BIG_ENDIAN: the value is passed through unchanged */
+	return native;
+#endif
+}
+
+/**
+ * cpu_to_be32 - convert a native-order uint32_t value to big endian.
+ * @native: value to convert; byte-swapped only if HAVE_LITTLE_ENDIAN
+ */
+static inline uint32_t cpu_to_be32(uint32_t native)
+{
+#if HAVE_LITTLE_ENDIAN
+	return bswap_32(native);
+#else	/* HAVE_BIG_ENDIAN: the value is passed through unchanged */
+	return native;
+#endif
+}
+
+/**
+ * cpu_to_be16 - convert a native-order uint16_t value to big endian.
+ * @native: value to convert; byte-swapped only if HAVE_LITTLE_ENDIAN
+ */
+static inline uint16_t cpu_to_be16(uint16_t native)
+{
+#if HAVE_LITTLE_ENDIAN
+	return bswap_16(native);
+#else	/* HAVE_BIG_ENDIAN: the value is passed through unchanged */
+	return native;
+#endif
+}
+
+/**
+ * be64_to_cpu - convert a big-endian uint64_t value to native order
+ * @be_val: big-endian value; byte-swapped only if HAVE_LITTLE_ENDIAN
+ */
+static inline uint64_t be64_to_cpu(uint64_t be_val)
+{
+#if HAVE_LITTLE_ENDIAN
+	return bswap_64(be_val);
+#else	/* HAVE_BIG_ENDIAN: the value is passed through unchanged */
+	return be_val;
+#endif
+}
+
+/**
+ * be32_to_cpu - convert a big-endian uint32_t value to native order
+ * @be_val: big-endian value; byte-swapped only if HAVE_LITTLE_ENDIAN
+ */
+static inline uint32_t be32_to_cpu(uint32_t be_val)
+{
+#if HAVE_LITTLE_ENDIAN
+	return bswap_32(be_val);
+#else	/* HAVE_BIG_ENDIAN: the value is passed through unchanged */
+	return be_val;
+#endif
+}
+
+/**
+ * be16_to_cpu - convert a big-endian uint16_t value to native order
+ * @be_val: big-endian value; byte-swapped only if HAVE_LITTLE_ENDIAN
+ */
+static inline uint16_t be16_to_cpu(uint16_t be_val)
+{
+#if HAVE_LITTLE_ENDIAN
+	return bswap_16(be_val);
+#else	/* HAVE_BIG_ENDIAN: the value is passed through unchanged */
+	return be_val;
+#endif
+}
+
+#endif /* CCAN_ENDIAN_H */
diff --git a/lib/ccan/endian/test/run.c b/lib/ccan/endian/test/run.c
new file mode 100644
index 0000000000..a00fce74e4
--- /dev/null
+++ b/lib/ccan/endian/test/run.c
@@ -0,0 +1,106 @@
+#include <ccan/endian/endian.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <ccan/tap/tap.h>
+
+int main(int argc, char *argv[])
+{
+	union {
+		uint64_t u64;
+		unsigned char u64_bytes[8];
+	} u64;
+	union {
+		uint32_t u32;
+		unsigned char u32_bytes[4];
+	} u32;
+	union {
+		uint16_t u16;
+		unsigned char u16_bytes[2];
+	} u16;
+
+	plan_tests(48);	/* 14 raw byte-swap checks + 34 conversion checks below */
+
+	/* Straight swap tests: fill the bytes in memory order, swap, expect them mirrored. */
+	u64.u64_bytes[0] = 0x00;
+	u64.u64_bytes[1] = 0x11;
+	u64.u64_bytes[2] = 0x22;
+	u64.u64_bytes[3] = 0x33;
+	u64.u64_bytes[4] = 0x44;
+	u64.u64_bytes[5] = 0x55;
+	u64.u64_bytes[6] = 0x66;
+	u64.u64_bytes[7] = 0x77;
+	u64.u64 = bswap_64(u64.u64);
+	ok1(u64.u64_bytes[7] == 0x00);
+	ok1(u64.u64_bytes[6] == 0x11);
+	ok1(u64.u64_bytes[5] == 0x22);
+	ok1(u64.u64_bytes[4] == 0x33);
+	ok1(u64.u64_bytes[3] == 0x44);
+	ok1(u64.u64_bytes[2] == 0x55);
+	ok1(u64.u64_bytes[1] == 0x66);
+	ok1(u64.u64_bytes[0] == 0x77);
+
+	u32.u32_bytes[0] = 0x00;
+	u32.u32_bytes[1] = 0x11;
+	u32.u32_bytes[2] = 0x22;
+	u32.u32_bytes[3] = 0x33;
+	u32.u32 = bswap_32(u32.u32);
+	ok1(u32.u32_bytes[3] == 0x00);
+	ok1(u32.u32_bytes[2] == 0x11);
+	ok1(u32.u32_bytes[1] == 0x22);
+	ok1(u32.u32_bytes[0] == 0x33);
+
+	u16.u16_bytes[0] = 0x00;
+	u16.u16_bytes[1] = 0x11;
+	u16.u16 = bswap_16(u16.u16);
+	ok1(u16.u16_bytes[1] == 0x00);
+	ok1(u16.u16_bytes[0] == 0x11);
+
+	/* Endian tests: check the in-memory byte layout, then the round trip back. */
+	u64.u64 = cpu_to_le64(0x0011223344556677ULL);
+	ok1(u64.u64_bytes[0] == 0x77);
+	ok1(u64.u64_bytes[1] == 0x66);
+	ok1(u64.u64_bytes[2] == 0x55);
+	ok1(u64.u64_bytes[3] == 0x44);
+	ok1(u64.u64_bytes[4] == 0x33);
+	ok1(u64.u64_bytes[5] == 0x22);
+	ok1(u64.u64_bytes[6] == 0x11);
+	ok1(u64.u64_bytes[7] == 0x00);
+	ok1(le64_to_cpu(u64.u64) == 0x0011223344556677ULL);
+
+	u64.u64 = cpu_to_be64(0x0011223344556677ULL);
+	ok1(u64.u64_bytes[7] == 0x77);
+	ok1(u64.u64_bytes[6] == 0x66);
+	ok1(u64.u64_bytes[5] == 0x55);
+	ok1(u64.u64_bytes[4] == 0x44);
+	ok1(u64.u64_bytes[3] == 0x33);
+	ok1(u64.u64_bytes[2] == 0x22);
+	ok1(u64.u64_bytes[1] == 0x11);
+	ok1(u64.u64_bytes[0] == 0x00);
+	ok1(be64_to_cpu(u64.u64) == 0x0011223344556677ULL);
+
+	u32.u32 = cpu_to_le32(0x00112233);
+	ok1(u32.u32_bytes[0] == 0x33);
+	ok1(u32.u32_bytes[1] == 0x22);
+	ok1(u32.u32_bytes[2] == 0x11);
+	ok1(u32.u32_bytes[3] == 0x00);
+	ok1(le32_to_cpu(u32.u32) == 0x00112233);
+
+	u32.u32 = cpu_to_be32(0x00112233);
+	ok1(u32.u32_bytes[3] == 0x33);
+	ok1(u32.u32_bytes[2] == 0x22);
+	ok1(u32.u32_bytes[1] == 0x11);
+	ok1(u32.u32_bytes[0] == 0x00);
+	ok1(be32_to_cpu(u32.u32) == 0x00112233);
+
+	u16.u16 = cpu_to_le16(0x0011);
+	ok1(u16.u16_bytes[0] == 0x11);
+	ok1(u16.u16_bytes[1] == 0x00);
+	ok1(le16_to_cpu(u16.u16) == 0x0011);
+
+	u16.u16 = cpu_to_be16(0x0011);
+	ok1(u16.u16_bytes[1] == 0x11);
+	ok1(u16.u16_bytes[0] == 0x00);
+	ok1(be16_to_cpu(u16.u16) == 0x0011);
+
+	exit(exit_status());
+}
diff --git a/lib/ccan/hash/_info b/lib/ccan/hash/_info
new file mode 100644
index 0000000000..5aeb912136
--- /dev/null
+++ b/lib/ccan/hash/_info
@@ -0,0 +1,31 @@
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * hash - routines for hashing bytes
+ *
+ * When creating a hash table it's important to have a hash function
+ * which mixes well and is fast.  This package supplies such functions.
+ *
+ * The hash functions come in two flavors: the normal ones and the
+ * stable ones.  The normal ones can vary from machine-to-machine and
+ * may change if we find better or faster hash algorithms in future.
+ * The stable ones will always give the same results on any computer,
+ * and on any version of this package.
+ *
+ * License: Public Domain
+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
+ * Author: Bob Jenkins <bob_jenkins@burtleburtle.net>
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)	/* exactly one argument is required */
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/build_assert\n");	/* the only dependency listed */
+		return 0;
+	}
+
+	return 1;	/* any other argument is rejected */
+}
diff --git a/lib/ccan/hash/hash.c b/lib/ccan/hash/hash.c
new file mode 100644
index 0000000000..0fd6109513
--- /dev/null
+++ b/lib/ccan/hash/hash.c
@@ -0,0 +1,925 @@
+/*
+-------------------------------------------------------------------------------
+lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+
+These are functions for producing 32-bit hashes for hash table lookup.
+hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+are externally useful functions.  Routines to test the hash are included
+if SELF_TEST is defined.  You can use this free for any purpose.  It's in
+the public domain.  It has no warranty.
+
+You probably want to use hashlittle().  hashlittle() and hashbig()
+hash byte arrays.  hashlittle() is faster than hashbig() on
+little-endian machines.  Intel and AMD are little-endian machines.
+On second thought, you probably want hashlittle2(), which is identical to
+hashlittle() except it returns two 32-bit hashes for the price of one.
+You could implement hashbig2() if you wanted but I haven't bothered here.
+
+If you want to find a hash of, say, exactly 7 integers, do
+  a = i1;  b = i2;  c = i3;
+  mix(a,b,c);
+  a += i4; b += i5; c += i6;
+  mix(a,b,c);
+  a += i7;
+  final(a,b,c);
+then use c as the hash value.  If you have a variable length array of
+4-byte integers to hash, use hash_u32().  If you have a byte array (like
+a character string), use hashlittle().  If you have several byte arrays, or
+a mix of things, see the comments above hashlittle().
+
+Why is this so big?  I read 12 bytes at a time into 3 4-byte integers,
+then mix those integers.  This is fast (you can do a lot more thorough
+mixing with 12*3 instructions on 3 integers than you can with 3 instructions
+on 1 byte), but shoehorning those bytes into integers efficiently is messy.
+-------------------------------------------------------------------------------
+*/
+//#define SELF_TEST 1
+
+#if 0
+#include <stdio.h>      /* defines printf for tests */
+#include <time.h>       /* defines time_t for timings in the test */
+#include <stdint.h>     /* defines uint32_t etc */
+#include <sys/param.h>  /* attempt to define endianness */
+
+#ifdef linux
+# include <endian.h>    /* attempt to define endianness */
+#endif
+
+/*
+ * My best guess at if you are big-endian or little-endian.  This may
+ * need adjustment.
+ */
+#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
+     __BYTE_ORDER == __LITTLE_ENDIAN) || \
+    (defined(i386) || defined(__i386__) || defined(__i486__) || \
+     defined(__i586__) || defined(__i686__) || defined(__x86_64) || \
+     defined(vax) || defined(MIPSEL))
+# define HASH_LITTLE_ENDIAN 1
+# define HASH_BIG_ENDIAN 0
+#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
+       __BYTE_ORDER == __BIG_ENDIAN) || \
+      (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
+# define HASH_LITTLE_ENDIAN 0
+# define HASH_BIG_ENDIAN 1
+#else
+# error Unknown endian
+#endif
+#endif /* old hash.c headers. */
+
+#include "hash.h"
+
+#if HAVE_LITTLE_ENDIAN
+#define HASH_LITTLE_ENDIAN 1
+#define HASH_BIG_ENDIAN 0
+#elif HAVE_BIG_ENDIAN
+#define HASH_LITTLE_ENDIAN 0
+#define HASH_BIG_ENDIAN 1
+#else
+#error Unknown endian
+#endif
+
+#define hashsize(n) ((uint32_t)1<<(n))	/* 2^n as a uint32_t (n < 32) */
+#define hashmask(n) (hashsize(n)-1)	/* mask selecting the low n bits */
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))	/* rotate left by k; assumes x is a 32-bit unsigned value and 0 < k < 32 */
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+    4  6  8 16 19  4
+    9 15  3 18 27 15
+   14  9  3  7 17  3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta.  I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche.  There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a.  The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism.  Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism.  I did what I could.  Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}	/* expands to a bare { } block: a, b, c are modified in place and evaluated several times */
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different.  This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}	/* expands to a bare { } block: a, b, c are modified in place and evaluated several times */
+
+/*
+--------------------------------------------------------------------
+ This works on all machines.  To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+
+ The function hash_u32() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes.  hashlittle() is more complicated than hash_u32() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+--------------------------------------------------------------------
+*/
+uint32_t hash_u32(
+const uint32_t *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32_t        initval)         /* the previous hash, or an arbitrary value */
+{
+  uint32_t a,b,c;
+
+  /* Set up the internal state: seed all three words from length and initval */
+  a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
+
+  /*------------------------- handle most of the key, three words per round */
+  while (length > 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+    length -= 3;
+    k += 3;
+  }
+
+  /*------------------------------------------- handle the last 3 uint32_t's */
+  switch(length)                     /* all the case statements fall through */
+  {
+  case 3 : c+=k[2];
+  case 2 : b+=k[1];
+  case 1 : a+=k[0];
+    final(a,b,c);
+  case 0:     /* case 0: nothing left to add, so final() is skipped */
+    break;
+  }
+  /*--------------------------------------------- report the result (c only) */
+  return c;
+}
+
+/*
+-------------------------------------------------------------------------------
+hashlittle() -- hash a variable-length key into a 32-bit value
+  k       : the key (the unaligned variable-length array of bytes)
+  length  : the length of the key, counting by bytes
+  val2    : IN: can be any 4-byte value OUT: second 32 bit hash.
+Returns a 32-bit value.  Every bit of the key affects every bit of
+the return value.  Two keys differing by one or two bits will have
+totally different hash values.  Note that the return value is better
+mixed than val2, so use that first.
+
+The best hash table sizes are powers of 2.  There is no need to do
+mod a prime (mod is sooo slow!).  If you need less than 32 bits,
+use a bitmask.  For example, if you need only 10 bits, do
+  h = (h & hashmask(10));
+In which case, the hash table should have hashsize(10) elements.
+
+If you are hashing n strings (uint8_t **)k, do it like this:
+  for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], &h);
+
+By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
+code any way you wish, private, educational, or commercial.  It's free.
+
+Use for hash table lookup, or anything where one collision in 2^^32 is
+acceptable.  Do NOT use for cryptographic purposes.
+-------------------------------------------------------------------------------
+*/
+
+static uint32_t hashlittle( const void *key, size_t length, uint32_t *val2 )
+{
+  uint32_t a,b,c;                                          /* internal state */
+  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
+
+  /* Set up the internal state: seeded by the byte length and the caller's *val2 */
+  a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;
+
+  u.ptr = key;
+  if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
+    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
+    const uint8_t  *k8;
+
+    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
+    while (length > 12)
+    {
+      a += k[0];
+      b += k[1];
+      c += k[2];
+      mix(a,b,c);
+      length -= 12;
+      k += 3;
+    }
+
+    /*----------------------------- handle the last (probably partial) block */
+    /*
+     * "k[2]&0xffffff" actually reads beyond the end of the string, but
+     * then masks off the part it's not allowed to read.  Because the
+     * string is aligned, the masked-off tail is in the same word as the
+     * rest of the string.  Every machine with memory protection I've seen
+     * does it on word boundaries, so is OK with this.  But VALGRIND will
+     * still catch it and complain.  The masking trick does make the hash
+     * noticeably faster for short strings (like English words).
+     *
+     * Not on my testing with gcc 4.5 on an intel i5 CPU, at least --RR.
+     */
+#if 0
+    switch(length)
+    {
+    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+    case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
+    case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
+    case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
+    case 8 : b+=k[1]; a+=k[0]; break;
+    case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
+    case 6 : b+=k[1]&0xffff; a+=k[0]; break;
+    case 5 : b+=k[1]&0xff; a+=k[0]; break;
+    case 4 : a+=k[0]; break;
+    case 3 : a+=k[0]&0xffffff; break;
+    case 2 : a+=k[0]&0xffff; break;
+    case 1 : a+=k[0]&0xff; break;
+    case 0 : return c;              /* zero length strings require no mixing */
+    }
+
+#else /* make valgrind happy */
+
+    /* Whole words are still read as words; the 1..3 trailing bytes are read one by one */
+    k8 = (const uint8_t *)k;
+    switch(length)
+    {
+    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+    case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
+    case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
+    case 9 : c+=k8[8];                   /* fall through */
+    case 8 : b+=k[1]; a+=k[0]; break;
+    case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
+    case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
+    case 5 : b+=k8[4];                   /* fall through */
+    case 4 : a+=k[0]; break;
+    case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
+    case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
+    case 1 : a+=k8[0]; break;
+    case 0 : return c;      /* nothing left: no final mix, *val2 is not written */
+    }
+
+#endif /* !valgrind */
+
+  } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
+    const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
+    const uint8_t  *k8;
+
+    /*--------------- all but last block: aligned reads and different mixing */
+    while (length > 12)
+    {
+      a += k[0] + (((uint32_t)k[1])<<16);
+      b += k[2] + (((uint32_t)k[3])<<16);
+      c += k[4] + (((uint32_t)k[5])<<16);
+      mix(a,b,c);
+      length -= 12;
+      k += 6;
+    }
+
+    /*----------------------------- handle the last (probably partial) block */
+    k8 = (const uint8_t *)k;
+    switch(length)
+    {
+    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
+             b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
+    case 10: c+=k[4];
+             b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 9 : c+=k8[8];                      /* fall through */
+    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
+    case 6 : b+=k[2];
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 5 : b+=k8[4];                      /* fall through */
+    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
+    case 2 : a+=k[0];
+             break;
+    case 1 : a+=k8[0];
+             break;
+    case 0 : return c;      /* nothing left: no final mix, *val2 is not written */
+    }
+
+  } else {                        /* need to read the key one byte at a time */
+    const uint8_t *k = (const uint8_t *)key;
+
+    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
+    while (length > 12)
+    {
+      a += k[0];
+      a += ((uint32_t)k[1])<<8;
+      a += ((uint32_t)k[2])<<16;
+      a += ((uint32_t)k[3])<<24;
+      b += k[4];
+      b += ((uint32_t)k[5])<<8;
+      b += ((uint32_t)k[6])<<16;
+      b += ((uint32_t)k[7])<<24;
+      c += k[8];
+      c += ((uint32_t)k[9])<<8;
+      c += ((uint32_t)k[10])<<16;
+      c += ((uint32_t)k[11])<<24;
+      mix(a,b,c);
+      length -= 12;
+      k += 12;
+    }
+
+    /*-------------------------------- last block: affect all 32 bits of (c) */
+    switch(length)                   /* all the case statements fall through */
+    {
+    case 12: c+=((uint32_t)k[11])<<24;
+    case 11: c+=((uint32_t)k[10])<<16;
+    case 10: c+=((uint32_t)k[9])<<8;
+    case 9 : c+=k[8];
+    case 8 : b+=((uint32_t)k[7])<<24;
+    case 7 : b+=((uint32_t)k[6])<<16;
+    case 6 : b+=((uint32_t)k[5])<<8;
+    case 5 : b+=k[4];
+    case 4 : a+=((uint32_t)k[3])<<24;
+    case 3 : a+=((uint32_t)k[2])<<16;
+    case 2 : a+=((uint32_t)k[1])<<8;
+    case 1 : a+=k[0];
+             break;
+    case 0 : return c;      /* nothing left: no final mix, *val2 is not written */
+    }
+  }
+
+  final(a,b,c);
+  *val2 = b;                      /* second hash word goes back to the caller */
+  return c;
+}
+
+/*
+ * hashbig():
+ * This is the same as hash_u32() on big-endian machines.  It is different
+ * from hashlittle() on all machines.  hashbig() takes advantage of
+ * big-endian byte ordering.
+ */
+static uint32_t hashbig( const void *key, size_t length, uint32_t *val2)
+{
+  uint32_t a,b,c;
+  union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */
+
+  /* Set up the internal state: seeded by the byte length and the caller's *val2 */
+  a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;
+
+  u.ptr = key;
+  if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) {
+    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
+    const uint8_t  *k8;
+
+    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
+    while (length > 12)
+    {
+      a += k[0];
+      b += k[1];
+      c += k[2];
+      mix(a,b,c);
+      length -= 12;
+      k += 3;
+    }
+
+    /*----------------------------- handle the last (probably partial) block */
+    /*
+     * "k[2]<<8" actually reads beyond the end of the string, but
+     * then shifts out the part it's not allowed to read.  Because the
+     * string is aligned, the illegal read is in the same word as the
+     * rest of the string.  Every machine with memory protection I've seen
+     * does it on word boundaries, so is OK with this.  But VALGRIND will
+     * still catch it and complain.  The masking trick does make the hash
+     * noticeably faster for short strings (like English words).
+     *
+     * Not on my testing with gcc 4.5 on an intel i5 CPU, at least --RR.
+     */
+#if 0
+    switch(length)
+    {
+    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+    case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break;
+    case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break;
+    case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break;
+    case 8 : b+=k[1]; a+=k[0]; break;
+    case 7 : b+=k[1]&0xffffff00; a+=k[0]; break;
+    case 6 : b+=k[1]&0xffff0000; a+=k[0]; break;
+    case 5 : b+=k[1]&0xff000000; a+=k[0]; break;
+    case 4 : a+=k[0]; break;
+    case 3 : a+=k[0]&0xffffff00; break;
+    case 2 : a+=k[0]&0xffff0000; break;
+    case 1 : a+=k[0]&0xff000000; break;
+    case 0 : return c;              /* zero length strings require no mixing */
+    }
+
+#else  /* make valgrind happy */
+
+    k8 = (const uint8_t *)k;
+    switch(length)                /* cases fall through until the next break */
+    {
+    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+    case 11: c+=((uint32_t)k8[10])<<8;  /* fall through */
+    case 10: c+=((uint32_t)k8[9])<<16;  /* fall through */
+    case 9 : c+=((uint32_t)k8[8])<<24;  /* fall through */
+    case 8 : b+=k[1]; a+=k[0]; break;
+    case 7 : b+=((uint32_t)k8[6])<<8;   /* fall through */
+    case 6 : b+=((uint32_t)k8[5])<<16;  /* fall through */
+    case 5 : b+=((uint32_t)k8[4])<<24;  /* fall through */
+    case 4 : a+=k[0]; break;
+    case 3 : a+=((uint32_t)k8[2])<<8;   /* fall through */
+    case 2 : a+=((uint32_t)k8[1])<<16;  /* fall through */
+    case 1 : a+=((uint32_t)k8[0])<<24; break;
+    case 0 : return c;      /* nothing left: no final mix, *val2 is not written */
+    }
+
+#endif /* !VALGRIND */
+
+  } else {                        /* need to read the key one byte at a time */
+    const uint8_t *k = (const uint8_t *)key;
+
+    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
+    while (length > 12)
+    {
+      a += ((uint32_t)k[0])<<24;
+      a += ((uint32_t)k[1])<<16;
+      a += ((uint32_t)k[2])<<8;
+      a += ((uint32_t)k[3]);
+      b += ((uint32_t)k[4])<<24;
+      b += ((uint32_t)k[5])<<16;
+      b += ((uint32_t)k[6])<<8;
+      b += ((uint32_t)k[7]);
+      c += ((uint32_t)k[8])<<24;
+      c += ((uint32_t)k[9])<<16;
+      c += ((uint32_t)k[10])<<8;
+      c += ((uint32_t)k[11]);
+      mix(a,b,c);
+      length -= 12;
+      k += 12;
+    }
+
+    /*-------------------------------- last block: affect all 32 bits of (c) */
+    switch(length)                   /* all the case statements fall through */
+    {
+    case 12: c+=k[11];
+    case 11: c+=((uint32_t)k[10])<<8;
+    case 10: c+=((uint32_t)k[9])<<16;
+    case 9 : c+=((uint32_t)k[8])<<24;
+    case 8 : b+=k[7];
+    case 7 : b+=((uint32_t)k[6])<<8;
+    case 6 : b+=((uint32_t)k[5])<<16;
+    case 5 : b+=((uint32_t)k[4])<<24;
+    case 4 : a+=k[3];
+    case 3 : a+=((uint32_t)k[2])<<8;
+    case 2 : a+=((uint32_t)k[1])<<16;
+    case 1 : a+=((uint32_t)k[0])<<24;
+             break;
+    case 0 : return c;      /* nothing left: no final mix, *val2 is not written */
+    }
+  }
+
+  final(a,b,c);
+  *val2 = b;                      /* second hash word goes back to the caller */
+  return c;
+}
+
+/* hash64_stable_64 - endian-stable 64-bit hash of @n uint64_t values.
+ * Like hashlittle, but each element is read in native endian and fed
+ * low half first, so "int arr[] = { 1, 2 }; hash_stable(arr, 2, 0);"
+ * is the same on big and little endian machines.  Returns (b << 32) | c. */
+uint64_t hash64_stable_64(const void *key, size_t n, uint64_t base)
+{
+	const uint64_t *k = key;
+	uint32_t a,b,c;
+
+	/* Set up the internal state */
+	a = b = c = 0xdeadbeef + ((uint32_t)n*8) + (base >> 32) + base;
+
+	while (n > 3) {
+		a += (uint32_t)k[0];
+		b += (uint32_t)(k[0] >> 32);
+		c += (uint32_t)k[1];
+		mix(a,b,c);
+		a += (uint32_t)(k[1] >> 32);
+		b += (uint32_t)k[2];
+		c += (uint32_t)(k[2] >> 32);
+		mix(a,b,c);
+		n -= 3;
+		k += 3;
+	}
+	if (n == 0)
+		return c;
+	/* 1..3 values remain and every one must be mixed in, including a
+	 * full trailing group of 3, or keys of length 3, 6, ... lose their tail. */
+	a += (uint32_t)k[0];
+	b += (uint32_t)(k[0] >> 32);
+	if (n > 1) {
+		c += (uint32_t)k[1];
+		mix(a,b,c);
+		a += (uint32_t)(k[1] >> 32);
+	}
+	if (n > 2) {
+		b += (uint32_t)k[2];
+		c += (uint32_t)(k[2] >> 32);
+	}
+	final(a,b,c);
+	return ((uint64_t)b << 32) | c;
+}
+
+uint64_t hash64_stable_32(const void *key, size_t n, uint64_t base)
+{
+	/* Endian-stable 64-bit hash of n uint32_t values; returns (b << 32) | c. */
+	const uint32_t *k = key;
+	uint32_t a,b,c;
+	/* Set up the internal state */
+	a = b = c = 0xdeadbeef + ((uint32_t)n*4) + (base >> 32) + base;
+	while (n > 3) {
+		a += k[0];
+		b += k[1];
+		c += k[2];
+		mix(a,b,c);
+		n -= 3;
+		k += 3;
+	}
+	switch (n) {	/* 1..3 words left here; 0 only for an empty key */
+	case 3:		/* full trailing group: must not be dropped */
+		c += k[2];	/* fall through */
+	case 2:
+		b += k[1];	/* fall through */
+	case 1:
+		a += k[0];
+		break;
+	case 0:
+		return c;
+	}
+	final(a,b,c);
+	return ((uint64_t)b << 32) | c;
+}
+
+uint64_t hash64_stable_16(const void *key, size_t n, uint64_t base)
+{
+	/* Endian-stable 64-bit hash of n uint16_t values, two per state word. */
+	const uint16_t *k = key;
+	uint32_t a,b,c;
+
+	/* Set up the internal state */
+	a = b = c = 0xdeadbeef + ((uint32_t)n*2) + (base >> 32) + base;
+	while (n > 6) {
+		a += (uint32_t)k[0] + ((uint32_t)k[1] << 16);
+		b += (uint32_t)k[2] + ((uint32_t)k[3] << 16);
+		c += (uint32_t)k[4] + ((uint32_t)k[5] << 16);
+		mix(a,b,c);
+		n -= 6;
+		k += 6;
+	}
+	switch (n) {	/* 1..6 values left here; 0 only for an empty key */
+	case 6:		/* full trailing group: must not be dropped */
+		c += ((uint32_t)k[5] << 16);	/* fall through */
+	case 5:
+		c += (uint32_t)k[4];		/* fall through */
+	case 4:
+		b += ((uint32_t)k[3] << 16);	/* fall through */
+	case 3:
+		b += (uint32_t)k[2];		/* fall through */
+	case 2:
+		a += ((uint32_t)k[1] << 16);	/* fall through */
+	case 1:
+		a += (uint32_t)k[0];
+		break;
+	case 0:
+		return c;
+	}
+	final(a,b,c);
+	return ((uint64_t)b << 32) | c;
+}
+
+uint64_t hash64_stable_8(const void *key, size_t n, uint64_t base)
+{
+	/* Folded base is the seed; hashlittle() stores its 2nd word in it (not if n == 0). */
+	uint32_t upper = (uint32_t)(base + (base >> 32));
+	const uint32_t low = hashlittle(key, n, &upper);
+	return ((uint64_t)upper << 32) | low;
+}
+
+uint32_t hash_any(const void *key, size_t length, uint32_t base)
+{
+	/* Dispatch on the HASH_BIG_ENDIAN build flag; base seeds the hash. */
+	return HASH_BIG_ENDIAN
+		? hashbig(key, length, &base)
+		: hashlittle(key, length, &base);
+}
+
+uint32_t hash_stable_64(const void *key, size_t n, uint32_t base)
+{
+	return (uint32_t)hash64_stable_64(key, n, (uint64_t)base); /* keep low 32 bits */
+}
+
+uint32_t hash_stable_32(const void *key, size_t n, uint32_t base)
+{
+	return (uint32_t)hash64_stable_32(key, n, (uint64_t)base); /* keep low 32 bits */
+}
+
+uint32_t hash_stable_16(const void *key, size_t n, uint32_t base)
+{
+	return (uint32_t)hash64_stable_16(key, n, (uint64_t)base); /* keep low 32 bits */
+}
+
+uint32_t hash_stable_8(const void *key, size_t n, uint32_t base)
+{
+	return hashlittle(key, n, &base);	/* the write-back into the local copy of base is discarded */
+}
+
+/* Jenkins' lookup8 is a 64 bit hash, but he says it's obsolete.  Use
+ * the plain one and recombine into 64 bits. */
+uint64_t hash64_any(const void *key, size_t length, uint64_t base)
+{
+	/* Fold the 64-bit base into the 32-bit seed the hashers accept. */
+	uint32_t seed = (uint32_t)(base + (base >> 32));
+	uint32_t low;
+
+	/* The hasher returns one word and stores a second one in seed
+	 * (seed is left as-is for a zero-length key). */
+	low = HASH_BIG_ENDIAN ? hashbig(key, length, &seed)
+			      : hashlittle(key, length, &seed);
+	return ((uint64_t)seed << 32) | low;
+}
+
+#ifdef SELF_TEST
+
+/* used for timings: hashes one byte and prints the elapsed whole seconds, if any */
+void driver1(void)
+{
+  uint8_t buf[256];
+  uint32_t i;
+  uint32_t h=0;
+  time_t a,z;
+
+  time(&a);
+  for (i=0; i<256; ++i) buf[i] = 'x';
+  for (i=0; i<1; ++i)
+  {
+    h = hashlittle(&buf[0],1,&h);  /* seed goes in by pointer, hash comes back */
+  }
+  time(&z);
+  if (z-a > 0) printf("time %ld %.8x\n", (long)(z-a), h);
+}
+
+/* check that every input bit changes every output bit half the time (prints its findings) */
+#define HASHSTATE 1
+#define HASHLEN   1
+#define MAXPAIR 60
+#define MAXLEN  70
+void driver2(void)
+{
+  uint8_t qa[MAXLEN+1], qb[MAXLEN+2], *a = &qa[0], *b = &qb[1];
+  uint32_t c[HASHSTATE], d[HASHSTATE], i=0, j=0, k, l, m=0, z;
+  uint32_t e[HASHSTATE],f[HASHSTATE],g[HASHSTATE],h[HASHSTATE];
+  uint32_t x[HASHSTATE],y[HASHSTATE];
+  uint32_t hlen;
+
+  printf("No more than %d trials should ever be needed \n",MAXPAIR/2);
+  for (hlen=0; hlen < MAXLEN; ++hlen)
+  {
+    z=0;
+    for (i=0; i<hlen; ++i)  /*----------------------- for each input byte, */
+    {
+      for (j=0; j<8; ++j)   /*------------------------ for each input bit, */
+      {
+	for (m=1; m<8; ++m) /*------------ for several possible initvals, */
+	{
+	  for (l=0; l<HASHSTATE; ++l)
+	    e[l]=f[l]=g[l]=h[l]=x[l]=y[l]=~((uint32_t)0);
+
+	  /*---- check that every output bit is affected by that input bit */
+	  for (k=0; k<MAXPAIR; k+=2)
+	  {
+	    uint32_t finished=1, seed; /* seed: hashlittle() takes initval by pointer */
+	    /* keys have one bit different */
+	    for (l=0; l<hlen+1; ++l) {a[l] = b[l] = (uint8_t)0;}
+	    /* have a and b be two keys differing in only one bit */
+	    a[i] ^= (k<<j);
+	    a[i] ^= (k>>(8-j));
+	     seed = m; c[0] = hashlittle(a, hlen, &seed);
+	    b[i] ^= ((k+1)<<j);
+	    b[i] ^= ((k+1)>>(8-j));
+	     seed = m; d[0] = hashlittle(b, hlen, &seed);
+	    /* check every bit is 1, 0, set, and not set at least once */
+	    for (l=0; l<HASHSTATE; ++l)
+	    {
+	      e[l] &= (c[l]^d[l]);
+	      f[l] &= ~(c[l]^d[l]);
+	      g[l] &= c[l];
+	      h[l] &= ~c[l];
+	      x[l] &= d[l];
+	      y[l] &= ~d[l];
+	      if (e[l]|f[l]|g[l]|h[l]|x[l]|y[l]) finished=0;
+	    }
+	    if (finished) break;
+	  }
+	  if (k>z) z=k;
+	  if (k==MAXPAIR)
+	  {
+	     printf("Some bit didn't change: ");
+	     printf("%.8x %.8x %.8x %.8x %.8x %.8x  ",
+	            e[0],f[0],g[0],h[0],x[0],y[0]);
+	     printf("i %u j %u m %u len %u\n", i, j, m, hlen);
+	  }
+	  if (z==MAXPAIR) goto done;
+	}
+      }
+    }
+   done:
+    if (z < MAXPAIR)
+    {
+      printf("Mix success  %2u bytes  %2u initvals  ",i,m);
+      printf("required  %u  trials\n", z/2);
+    }
+  }
+  printf("\n");
+}
+
+/* Check for reading beyond the end of the buffer and alignment problems.  NOTE(review): this driver passes integer seeds to hashlittle(), whose third parameter is a uint32_t * in this file -- looks stale, confirm before enabling SELF_TEST. */
+void driver3()
+{
+  uint8_t buf[MAXLEN+20], *b;
+  uint32_t len;
+  uint8_t q[] = "This is the time for all good men to come to the aid of their country...";
+  uint32_t h;
+  uint8_t qq[] = "xThis is the time for all good men to come to the aid of their country...";
+  uint32_t i;
+  uint8_t qqq[] = "xxThis is the time for all good men to come to the aid of their country...";
+  uint32_t j;
+  uint8_t qqqq[] = "xxxThis is the time for all good men to come to the aid of their country...";
+  uint32_t ref,x,y;
+  uint8_t *p;
+
+  printf("Endianness.  These lines should all be the same (for values filled in):\n");
+  printf("%.8x                            %.8x                            %.8x\n",
+         hash_word((const uint32_t *)q, (sizeof(q)-1)/4, 13),
+         hash_word((const uint32_t *)q, (sizeof(q)-5)/4, 13),
+         hash_word((const uint32_t *)q, (sizeof(q)-9)/4, 13));
+  p = q;                                   /* same text at offset 0 of its array */
+  printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+         hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+         hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+         hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+         hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+         hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+         hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+  p = &qq[1];                              /* same text at offset 1 of its array */
+  printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+         hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+         hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+         hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+         hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+         hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+         hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+  p = &qqq[2];                             /* same text at offset 2 of its array */
+  printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+         hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+         hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+         hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+         hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+         hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+         hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+  p = &qqqq[3];                            /* same text at offset 3 of its array */
+  printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+         hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+         hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+         hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+         hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+         hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+         hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+  printf("\n");
+
+  /* check that hashlittle2 and hashlittle produce the same results (NOTE(review): no hashlittle2 is visible in this file) */
+  i=47; j=0;
+  hashlittle2(q, sizeof(q), &i, &j);
+  if (hashlittle(q, sizeof(q), 47) != i)
+    printf("hashlittle2 and hashlittle mismatch\n");
+
+  /* check that hash_word2 and hash_word produce the same results (NOTE(review): neither is visible in this file; the word hash here is hash_u32) */
+  len = 0xdeadbeef;
+  i=47, j=0;
+  hash_word2(&len, 1, &i, &j);
+  if (hash_word(&len, 1, 47) != i)
+    printf("hash_word2 and hash_word mismatch %x %x\n",
+	   i, hash_word(&len, 1, 47));
+
+  /* check hashlittle doesn't read before or after the ends of the string */
+  for (h=0, b=buf+1; h<8; ++h, ++b)        /* try 8 consecutive start offsets */
+  {
+    for (i=0; i<MAXLEN; ++i)
+    {
+      len = i;
+      for (j=0; j<i; ++j) *(b+j)=0;
+
+      /* these should all be equal */
+      ref = hashlittle(b, len, (uint32_t)1);
+      *(b+i)=(uint8_t)~0;                  /* poison the byte just past the key */
+      *(b-1)=(uint8_t)~0;                  /* poison the byte just before the key */
+      x = hashlittle(b, len, (uint32_t)1);
+      y = hashlittle(b, len, (uint32_t)1);
+      if ((ref != x) || (ref != y))
+      {
+	printf("alignment error: %.8x %.8x %.8x %d %d\n",ref,x,y,
+               h, i);
+      }
+    }
+  }
+}
+
+/* check for problems with nulls: chain 8 hashes of a zero-length key,
+ * feeding each result back in as the next seed */
+void driver4(void)
+{
+  uint8_t buf[1];
+  uint32_t h,i;
+
+  buf[0] = ~0;
+  printf("These should all be different\n");
+  for (i=0, h=0; i<8; ++i)
+  {
+    /* hashlittle() takes the seed by pointer and returns the new hash */
+    h = hashlittle(buf, 0, &h);
+    printf("%2u  0-byte strings, hash is  %.8x\n", i, h);
+  }
+}
+
+
+int main(void)
+{
+  driver1();   /* test that the key is hashed: used for timings */
+  driver2();   /* test that whole key is hashed thoroughly */
+  driver3();   /* test that nothing but the key is hashed */
+  driver4();   /* test hashing multiple buffers (all buffers are null) */
+  return 0;    /* the drivers only print; report a normal exit status */
+}
+
+#endif  /* SELF_TEST */
diff --git a/lib/ccan/hash/hash.h b/lib/ccan/hash/hash.h
new file mode 100644
index 0000000000..5025c0d748
--- /dev/null
+++ b/lib/ccan/hash/hash.h
@@ -0,0 +1,312 @@
+#ifndef CCAN_HASH_H
+#define CCAN_HASH_H
+#include "config.h"
+#include <stdint.h>
+#include <stdlib.h>
+#include <ccan/build_assert/build_assert.h>
+
+/* Stolen mostly from: lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * http://burtleburtle.net/bob/c/lookup3.c
+ */
+
+/**
+ * hash - fast hash of an array for internal use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The memory region pointed to by p (num elements of sizeof(*p) bytes
+ * each) is combined with the base to form a 32-bit hash.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ *
+ * It may also change with future versions: it could even detect at runtime
+ * what the fastest hash to use is.
+ *
+ * See also: hash64, hash_stable.
+ *
+ * Example:
+ *	#include <ccan/hash/hash.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *
+ *	// Simple demonstration: identical strings will have the same hash, but
+ *	// two different strings will probably not.
+ *	int main(int argc, char *argv[])
+ *	{
+ *		uint32_t hash1, hash2;
+ *
+ *		if (argc != 3)
+ *			err(1, "Usage: %s <string1> <string2>", argv[0]);
+ *
+ *		hash1 = hash(argv[1], strlen(argv[1]), 0);
+ *		hash2 = hash(argv[2], strlen(argv[2]), 0);
+ *		printf("Hash is %s\n", hash1 == hash2 ? "same" : "different");
+ *		return 0;
+ *	}
+ */
+#define hash(p, num, base) hash_any((p), (num)*sizeof(*(p)), (base))
+
+/**
+ * hash_stable - hash of an array for external use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The array of simple integer types pointed to by p is combined with
+ * the base to form a 32-bit hash.
+ *
+ * This hash will have the same results on different machines, so can
+ * be used for external hashes (ie. hashes sent across the network or
+ * saved to disk).  The results will not change in future versions of
+ * this module.
+ *
+ * Note that it is only legal to hand an array of simple integer types
+ * to this hash (ie. char, uint16_t, int64_t, etc).  In these cases,
+ * the same values will have the same hash result, even though the
+ * memory representations of integers depend on the machine
+ * endianness.
+ *
+ * See also: hash64_stable
+ * NOTE(review): '+' binds before '==' in the macro; this presumes the BUILD_ASSERT term is 0.
+ *
+ * Example:
+ *	#include <ccan/hash/hash.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		if (argc != 2)
+ *			err(1, "Usage: %s <string-to-hash>", argv[0]);
+ *
+ *		printf("Hash stable result is %u\n",
+ *		       hash_stable(argv[1], strlen(argv[1]), 0));
+ *		return 0;
+ *	}
+ */
+#define hash_stable(p, num, base)					\
+	(BUILD_ASSERT_OR_ZERO(sizeof(*(p)) == 8 || sizeof(*(p)) == 4	\
+			      || sizeof(*(p)) == 2 || sizeof(*(p)) == 1) + \
+	 sizeof(*(p)) == 8 ? hash_stable_64((p), (num), (base))		\
+	 : sizeof(*(p)) == 4 ? hash_stable_32((p), (num), (base))	\
+	 : sizeof(*(p)) == 2 ? hash_stable_16((p), (num), (base))	\
+	 : hash_stable_8((p), (num), (base)))
+
+/**
+ * hash_u32 - fast hash an array of 32-bit values for internal use
+ * @key: the array of uint32_t
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The array of uint32_t pointed to by @key is combined with the base
+ * to form a 32-bit hash.  This is 2-3 times faster than hash() on small
+ * arrays, but the advantage vanishes over large hashes.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ */
+uint32_t hash_u32(const uint32_t *key, size_t num, uint32_t base);
+
+/**
+ * hash_string - very fast hash of an ascii string
+ * @string: the nul-terminated string
+ *
+ * Hashes every character up to (not including) the terminating nul with
+ * a multiply-by-31 hash tuned for ASCII-like text.  It is weaker than
+ * the other hash functions in this module.
+ *
+ * This hash may have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).  Its results differ from the other hashes.
+ */
+static inline uint32_t hash_string(const char *string)
+{
+	/* Karl Nelson <kenelson@ece.ucdavis.edu>'s X31 hash: h = 31*h + ch.
+	 * A little faster than the (much better) lookup3 hash(). */
+	uint32_t acc = 0;
+	const char *s;
+
+	for (s = string; *s != '\0'; s++)
+		acc = acc * 31 + *s;
+
+	return acc;
+}
+
+/**
+ * hash64 - fast 64-bit hash of an array for internal use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the 64-bit base number to roll into the hash (usually 0)
+ *
+ * The memory region pointed to by p (num elements of sizeof(*p) bytes
+ * each) is combined with the base to form a 64-bit hash.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ *
+ * It may also change with future versions: it could even detect at runtime
+ * what the fastest hash to use is.
+ *
+ * See also: hash.
+ *
+ * Example:
+ *	#include <ccan/hash/hash.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *
+ *	// Simple demonstration: identical strings will have the same hash, but
+ *	// two different strings will probably not.
+ *	int main(int argc, char *argv[])
+ *	{
+ *		uint64_t hash1, hash2;
+ *
+ *		if (argc != 3)
+ *			err(1, "Usage: %s <string1> <string2>", argv[0]);
+ *
+ *		hash1 = hash64(argv[1], strlen(argv[1]), 0);
+ *		hash2 = hash64(argv[2], strlen(argv[2]), 0);
+ *		printf("Hash is %s\n", hash1 == hash2 ? "same" : "different");
+ *		return 0;
+ *	}
+ */
+#define hash64(p, num, base) hash64_any((p), (num)*sizeof(*(p)), (base))
+
+/**
+ * hash64_stable - 64 bit hash of an array for external use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The array of simple integer types pointed to by p is combined with
+ * the base to form a 64-bit hash.
+ *
+ * This hash will have the same results on different machines, so can
+ * be used for external hashes (ie. hashes sent across the network or
+ * saved to disk).  The results will not change in future versions of
+ * this module.
+ *
+ * Note that it is only legal to hand an array of simple integer types
+ * to this hash (ie. char, uint16_t, int64_t, etc).  In these cases,
+ * the same values will have the same hash result, even though the
+ * memory representations of integers depend on the machine
+ * endianness.
+ *
+ * See also: hash_stable
+ * NOTE(review): '+' binds before '==' in the macro; this presumes the BUILD_ASSERT term is 0.
+ *
+ * Example:
+ *	#include <ccan/hash/hash.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		if (argc != 2)
+ *			err(1, "Usage: %s <string-to-hash>", argv[0]);
+ *
+ *		printf("Hash stable result is %llu\n",
+ *		       (unsigned long long)hash64_stable(argv[1], strlen(argv[1]), 0));
+ *		return 0;
+ *	}
+ */
+#define hash64_stable(p, num, base)					\
+	(BUILD_ASSERT_OR_ZERO(sizeof(*(p)) == 8 || sizeof(*(p)) == 4	\
+			      || sizeof(*(p)) == 2 || sizeof(*(p)) == 1) + \
+	 sizeof(*(p)) == 8 ? hash64_stable_64((p), (num), (base))	\
+	 : sizeof(*(p)) == 4 ? hash64_stable_32((p), (num), (base))	\
+	 : sizeof(*(p)) == 2 ? hash64_stable_16((p), (num), (base))	\
+	 : hash64_stable_8((p), (num), (base)))
+
+
+/**
+ * hashl - fast 32/64-bit hash of an array for internal use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * This is hash64() where long is as wide as uint64_t, otherwise hash().
+ */
+#define hashl(p, num, base)						\
+	(BUILD_ASSERT_OR_ZERO(sizeof(long) == sizeof(uint32_t)		\
+			      || sizeof(long) == sizeof(uint64_t)) +	\
+	(sizeof(long) == sizeof(uint64_t)				\
+	 ? hash64((p), (num), (base)) : hash((p), (num), (base))))
+
+/* Our underlying operations. */
+uint32_t hash_any(const void *key, size_t length, uint32_t base);
+uint32_t hash_stable_64(const void *key, size_t n, uint32_t base);
+uint32_t hash_stable_32(const void *key, size_t n, uint32_t base);
+uint32_t hash_stable_16(const void *key, size_t n, uint32_t base);
+uint32_t hash_stable_8(const void *key, size_t n, uint32_t base);
+uint64_t hash64_any(const void *key, size_t length, uint64_t base);
+uint64_t hash64_stable_64(const void *key, size_t n, uint64_t base);
+uint64_t hash64_stable_32(const void *key, size_t n, uint64_t base);
+uint64_t hash64_stable_16(const void *key, size_t n, uint64_t base);
+uint64_t hash64_stable_8(const void *key, size_t n, uint64_t base);
+
+/**
+ * hash_pointer - hash a pointer for internal use
+ * @p: the pointer value to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The pointer p (not what p points to!) is combined with the base to form
+ * a 32-bit hash.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ *
+ * Example:
+ *	#include <ccan/hash/hash.h>
+ *
+ *	// Code to keep track of memory regions.
+ *	struct region {
+ *		struct region *chain;
+ *		void *start;
+ *		unsigned int size;
+ *	};
+ *	// We keep a simple hash table.
+ *	static struct region *region_hash[128];
+ *
+ *	static void add_region(struct region *r)
+ *	{
+ *		unsigned int h = hash_pointer(r->start, 0) % 128;
+ *
+ *		r->chain = region_hash[h];
+ *		region_hash[h] = r;
+ *	}
+ *
+ *	static struct region *find_region(const void *start)
+ *	{
+ *		struct region *r;
+ *
+ *		for (r = region_hash[hash_pointer(start, 0) % 128]; r; r = r->chain)
+ *			if (r->start == start)
+ *				return r;
+ *		return NULL;
+ *	}
+ */
+static inline uint32_t hash_pointer(const void *p, uint32_t base)
+{
+	/* Union view of the pointer as 32-bit words: the right way of aliasing. */
+	union {
+		uint32_t u32[sizeof(p) / sizeof(uint32_t)];
+		const void *p;
+	} u;
+
+	if (sizeof(p) % sizeof(uint32_t) != 0)
+		return hash(&p, 1, base);	/* odd pointer size: hash its bytes */
+	u.p = p;
+	return hash_u32(u.u32, sizeof(p) / sizeof(uint32_t), base);
+}
+#endif /* CCAN_HASH_H */
diff --git a/lib/ccan/hash/test/api-hash_stable.c b/lib/ccan/hash/test/api-hash_stable.c
new file mode 100644
index 0000000000..bb58d16b18
--- /dev/null
+++ b/lib/ccan/hash/test/api-hash_stable.c
@@ -0,0 +1,300 @@
+#include <ccan/hash/hash.h>
+#include <ccan/tap/tap.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define ARRAY_WORDS 5
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	uint8_t u8array[ARRAY_WORDS];
+	uint16_t u16array[ARRAY_WORDS];
+	uint32_t u32array[ARRAY_WORDS];
+	uint64_t u64array[ARRAY_WORDS];
+	/* Regression test: pin hash_stable()/hash64_stable() to known values.
+	 * Every array holds 0..ARRAY_WORDS-1; only the element width differs. */
+	for (i = 0; i < ARRAY_WORDS; i++) {
+		u8array[i] = i;
+		u16array[i] = i;
+		u32array[i] = i;
+		u64array[i] = i;
+	}
+
+	plan_tests(264);	/* 33 bases x 4 arrays x 2 hash widths */
+
+	/* hash_stable is API-guaranteed.  Bases: 0, then 2^0 through 2^31. */
+	ok1(hash_stable(u8array, ARRAY_WORDS, 0) == 0x1d4833cc);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 1) == 0x37125e2 );
+	ok1(hash_stable(u8array, ARRAY_WORDS, 2) == 0x330a007a);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 4) == 0x7b0df29b);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 8) == 0xe7e5d741);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 16) == 0xaae57471);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 32) == 0xc55399e5);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 64) == 0x67f21f7 );
+	ok1(hash_stable(u8array, ARRAY_WORDS, 128) == 0x1d795b71);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 256) == 0xeb961671);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 512) == 0xc2597247);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 1024) == 0x3f5c4d75);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 2048) == 0xe65cf4f9);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 4096) == 0xf2cd06cb);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 8192) == 0x443041e1);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 16384) == 0xdfc618f5);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 32768) == 0x5e3d5b97);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 65536) == 0xd5f64730);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 131072) == 0x372bbecc);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 262144) == 0x7c194c8d);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 524288) == 0x16cbb416);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 1048576) == 0x53e99222);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 2097152) == 0x6394554a);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 4194304) == 0xd83a506d);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 8388608) == 0x7619d9a4);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 16777216) == 0xfe98e5f6);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 33554432) == 0x6c262927);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 67108864) == 0x3f0106fd);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 134217728) == 0xc91e3a28);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 268435456) == 0x14229579);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 536870912) == 0x9dbefa76);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 1073741824) == 0xb05c0c78);
+	ok1(hash_stable(u8array, ARRAY_WORDS, 2147483648U) == 0x88f24d81);
+	/* Same bases over the uint16_t array. */
+	ok1(hash_stable(u16array, ARRAY_WORDS, 0) == 0xecb5f507);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 1) == 0xadd666e6);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 2) == 0xea0f214c);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 4) == 0xae4051ba);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 8) == 0x6ed28026);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 16) == 0xa3917a19);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 32) == 0xf370f32b);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 64) == 0x807af460);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 128) == 0xb4c8cd83);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 256) == 0xa10cb5b0);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 512) == 0x8b7d7387);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 1024) == 0x9e49d1c );
+	ok1(hash_stable(u16array, ARRAY_WORDS, 2048) == 0x288830d1);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 4096) == 0xbe078a43);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 8192) == 0xa16d5d88);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 16384) == 0x46839fcd);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 32768) == 0x9db9bd4f);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 65536) == 0xedff58f8);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 131072) == 0x95ecef18);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 262144) == 0x23c31b7d);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 524288) == 0x1d85c7d0);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 1048576) == 0x25218842);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 2097152) == 0x711d985c);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 4194304) == 0x85470eca);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 8388608) == 0x99ed4ceb);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 16777216) == 0x67b3710c);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 33554432) == 0x77f1ab35);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 67108864) == 0x81f688aa);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 134217728) == 0x27b56ca5);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 268435456) == 0xf21ba203);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 536870912) == 0xd48d1d1 );
+	ok1(hash_stable(u16array, ARRAY_WORDS, 1073741824) == 0xa542b62d);
+	ok1(hash_stable(u16array, ARRAY_WORDS, 2147483648U) == 0xa04c7058);
+	/* Same bases over the uint32_t array. */
+	ok1(hash_stable(u32array, ARRAY_WORDS, 0) == 0x13305f8c);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 1) == 0x171abf74);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 2) == 0x7646fcc7);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 4) == 0xa758ed5);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 8) == 0x2dedc2e4);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 16) == 0x28e2076b);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 32) == 0xb73091c5);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 64) == 0x87daf5db);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 128) == 0xa16dfe20);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 256) == 0x300c63c3);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 512) == 0x255c91fc);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 1024) == 0x6357b26);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 2048) == 0x4bc5f339);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 4096) == 0x1301617c);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 8192) == 0x506792c9);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 16384) == 0xcd596705);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 32768) == 0xa8713cac);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 65536) == 0x94d9794);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 131072) == 0xac753e8);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 262144) == 0xcd8bdd20);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 524288) == 0xd44faf80);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 1048576) == 0x2547ccbe);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 2097152) == 0xbab06dbc);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 4194304) == 0xaac0e882);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 8388608) == 0x443f48d0);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 16777216) == 0xdff49fcc);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 33554432) == 0x9ce0fd65);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 67108864) == 0x9ddb1def);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 134217728) == 0x86096f25);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 268435456) == 0xe713b7b5);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 536870912) == 0x5baeffc5);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 1073741824) == 0xde874f52);
+	ok1(hash_stable(u32array, ARRAY_WORDS, 2147483648U) == 0xeca13b4e);
+	/* Same bases over the uint64_t array. */
+	ok1(hash_stable(u64array, ARRAY_WORDS, 0) == 0x12ef6302);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 1) == 0xe9aeb406);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 2) == 0xc4218ceb);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 4) == 0xb3d11412);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 8) == 0xdafbd654);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 16) == 0x9c336cba);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 32) == 0x65059721);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 64) == 0x95b5bbe6);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 128) == 0xe7596b84);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 256) == 0x503622a2);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 512) == 0xecdcc5ca);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 1024) == 0xc40d0513);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 2048) == 0xaab25e4d);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 4096) == 0xcc353fb9);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 8192) == 0x18e2319f);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 16384) == 0xfddaae8d);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 32768) == 0xef7976f2);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 65536) == 0x86359fc9);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 131072) == 0x8b5af385);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 262144) == 0x80d4ee31);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 524288) == 0x42f5f85b);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 1048576) == 0x9a6920e1);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 2097152) == 0x7b7c9850);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 4194304) == 0x69573e09);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 8388608) == 0xc942bc0e);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 16777216) == 0x7a89f0f1);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 33554432) == 0x2dd641ca);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 67108864) == 0x89bbd391);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 134217728) == 0xbcf88e31);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 268435456) == 0xfa7a3460);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 536870912) == 0x49a37be0);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 1073741824) == 0x1b346394);
+	ok1(hash_stable(u64array, ARRAY_WORDS, 2147483648U) == 0x6c3a1592);
+	/* hash64_stable: 64-bit results, same arrays and bases. */
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 0) == 16887282882572727244ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 1) == 12032777473133454818ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 2) == 18183407363221487738ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 4) == 17860764172704150171ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 8) == 18076051600675559233ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 16) == 9909361918431556721ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 32) == 12937969888744675813ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 64) == 5245669057381736951ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 128) == 4376874646406519665ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 256) == 14219974419871569521ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 512) == 2263415354134458951ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 1024) == 4953859694526221685ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 2048) == 3432228642067641593ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 4096) == 1219647244417697483ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 8192) == 7629939424585859553ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 16384) == 10041660531376789749ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 32768) == 13859885793922603927ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 65536) == 15069060338344675120ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 131072) == 818163430835601100ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 262144) == 14914314323019517069ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 524288) == 17518437749769352214ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 1048576) == 14920048004901212706ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 2097152) == 8758567366332536138ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 4194304) == 6226655736088907885ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 8388608) == 13716650013685832100ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 16777216) == 305325651636315638ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 33554432) == 16784147606583781671ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 67108864) == 16509467555140798205ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 134217728) == 8717281234694060584ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 268435456) == 8098476701725660537ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 536870912) == 16345871539461094006ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 1073741824) == 3755557000429964408ULL);
+	ok1(hash64_stable(u8array, ARRAY_WORDS, 2147483648U) == 15017348801959710081ULL);
+	/* Same bases over the uint16_t array. */
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 0) == 1038028831307724039ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 1) == 10155473272642627302ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 2) == 5714751190106841420ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 4) == 3923885607767527866ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 8) == 3931017318293995558ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 16) == 1469696588339313177ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 32) == 11522218526952715051ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 64) == 6953517591561958496ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 128) == 7406689491740052867ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 256) == 10101844489704093104ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 512) == 12511348870707245959ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 1024) == 1614019938016861468ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 2048) == 5294796182374592721ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 4096) == 16089570706643716675ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 8192) == 1689302638424579464ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 16384) == 1446340172370386893ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 32768) == 16535503506744393039ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 65536) == 3496794142527150328ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 131072) == 6568245367474548504ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 262144) == 9487676460765485949ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 524288) == 4519762130966530000ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 1048576) == 15623412069215340610ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 2097152) == 544013388676438108ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 4194304) == 5594904760290840266ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 8388608) == 18098755780041592043ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 16777216) == 6389168672387330316ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 33554432) == 896986127732419381ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 67108864) == 13232626471143901354ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 134217728) == 53378562890493093ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 268435456) == 10072361400297824771ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 536870912) == 14511948118285144529ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 1073741824) == 6981033484844447277ULL);
+	ok1(hash64_stable(u16array, ARRAY_WORDS, 2147483648U) == 5619339091684126808ULL);
+	/* Same bases over the uint32_t array. */
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 0) == 3037571077312110476ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 1) == 14732398743825071988ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 2) == 14949132158206672071ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 4) == 1291370080511561429ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 8) == 10792665964172133092ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 16) == 14250138032054339435ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 32) == 17136741522078732741ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 64) == 3260193403318236635ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 128) == 10526616652205653536ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 256) == 9019690373358576579ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 512) == 6997491436599677436ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 1024) == 18302783371416533798ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 2048) == 10149320644446516025ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 4096) == 7073759949410623868ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 8192) == 17442399482223760073ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 16384) == 2983906194216281861ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 32768) == 4975845419129060524ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 65536) == 594019910205413268ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 131072) == 11903010186073691112ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 262144) == 7339636527154847008ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 524288) == 15243305400579108736ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 1048576) == 16737926245392043198ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 2097152) == 15725083267699862972ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 4194304) == 12527834265678833794ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 8388608) == 13908436455987824848ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 16777216) == 9672773345173872588ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 33554432) == 2305314279896710501ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 67108864) == 1866733780381408751ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 134217728) == 11906263969465724709ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 268435456) == 5501594918093830069ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 536870912) == 15823785789276225477ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 1073741824) == 17353000723889475410ULL);
+	ok1(hash64_stable(u32array, ARRAY_WORDS, 2147483648U) == 7494736910655503182ULL);
+	/* Same bases over the uint64_t array. */
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 0) == 9765419389786481410ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 1) == 11182806172127114246ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 2) == 2559155171395472619ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 4) == 3311692033324815378ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 8) == 1297175419505333844ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 16) == 617896928653569210ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 32) == 1517398559958603553ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 64) == 4504821917445110758ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 128) == 1971743331114904452ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 256) == 6177667912354374306ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 512) == 15570521289777792458ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 1024) == 9204559632415917331ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 2048) == 9008982669760028237ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 4096) == 14803537660281700281ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 8192) == 2873966517448487327ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 16384) == 5859277625928363661ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 32768) == 15520461285618185970ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 65536) == 16746489793331175369ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 131072) == 514952025484227461ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 262144) == 10867212269810675249ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 524288) == 9822204377278314587ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 1048576) == 3295088921987850465ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 2097152) == 7559197431498053712ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 4194304) == 1667267269116771849ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 8388608) == 2916804068951374862ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 16777216) == 14422558383125688561ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 33554432) == 10083112683694342602ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 67108864) == 7222777647078298513ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 134217728) == 18424513674048212529ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 268435456) == 14913668581101810784ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 536870912) == 14377721174297902048ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 1073741824) == 6031715005667500948ULL);
+	ok1(hash64_stable(u64array, ARRAY_WORDS, 2147483648U) == 4827100319722378642ULL);
+
+	return exit_status();
+}
diff --git a/lib/ccan/hash/test/run.c b/lib/ccan/hash/test/run.c
new file mode 100644
index 0000000000..dad8e86b9e
--- /dev/null
+++ b/lib/ccan/hash/test/run.c
@@ -0,0 +1,149 @@
+#include <ccan/hash/hash.h>
+#include <ccan/tap/tap.h>
+#include <ccan/hash/hash.c>
+#include <stdbool.h>
+#include <string.h>
+
+#define ARRAY_WORDS 5
+
+/* Self-test for ccan/hash: alignment independence and output distribution. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, k;
+	uint32_t array[ARRAY_WORDS], val;
+	char array2[sizeof(array) + sizeof(uint32_t)];
+	uint32_t results[256];
+
+	/* Initialize array. */
+	for (i = 0; i < ARRAY_WORDS; i++)
+		array[i] = i;
+
+	plan_tests(39);
+	/* Hash should be the same, indep of memory alignment.  array2 is a
+	 * char buffer, so hash it by byte length: hash() counts elements. */
+	val = hash(array, ARRAY_WORDS, 0);
+	for (i = 0; i < sizeof(uint32_t); i++) {
+		memcpy(array2 + i, array, sizeof(array));
+		ok(hash_any(array2 + i, sizeof(array), 0) == val,
+		   "hash matched at offset %u", i);
+	}
+
+	/* Hash of random values should have random distribution:
+	 * check one byte at a time. */
+	for (i = 0; i < sizeof(uint32_t); i++) {
+		unsigned int lowest = -1U, highest = 0;
+
+		memset(results, 0, sizeof(results));
+
+		for (j = 0; j < 256000; j++) {
+			for (k = 0; k < ARRAY_WORDS; k++)
+				array[k] = random();
+			results[(hash(array, ARRAY_WORDS, 0) >> i*8)&0xFF]++;
+		}
+
+		for (j = 0; j < 256; j++) {
+			if (results[j] < lowest)
+				lowest = results[j];
+			if (results[j] > highest)
+				highest = results[j];
+		}
+		/* Expect within 20% */
+		ok(lowest > 800, "Byte %u lowest %u", i, lowest);
+		ok(highest < 1200, "Byte %u highest %u", i, highest);
+		diag("Byte %u, range %u-%u", i, lowest, highest);
+	}
+
+	/* The 64-bit hash should be just as evenly distributed:
+	 * check one byte at a time. */
+	for (i = 0; i < sizeof(uint64_t); i++) {
+		unsigned int lowest = -1U, highest = 0;
+
+		memset(results, 0, sizeof(results));
+
+		for (j = 0; j < 256000; j++) {
+			for (k = 0; k < ARRAY_WORDS; k++)
+				array[k] = random();
+			results[(hash64(array, sizeof(array)/sizeof(uint64_t),
+					0) >> i*8)&0xFF]++;
+		}
+
+		for (j = 0; j < 256; j++) {
+			if (results[j] < lowest)
+				lowest = results[j];
+			if (results[j] > highest)
+				highest = results[j];
+		}
+		/* Expect within 20% */
+		ok(lowest > 800, "Byte %u lowest %u", i, lowest);
+		ok(highest < 1200, "Byte %u highest %u", i, highest);
+		diag("Byte %u, range %u-%u", i, lowest, highest);
+	}
+
+	/* Hash of pointer values should also have random distribution. */
+	for (i = 0; i < sizeof(uint32_t); i++) {
+		unsigned int lowest = -1U, highest = 0;
+		char *p = malloc(256000);
+
+		memset(results, 0, sizeof(results));
+
+		for (j = 0; j < 256000; j++)
+			results[(hash_pointer(p + j, 0) >> i*8)&0xFF]++;
+		free(p);
+
+		for (j = 0; j < 256; j++) {
+			if (results[j] < lowest)
+				lowest = results[j];
+			if (results[j] > highest)
+				highest = results[j];
+		}
+		/* Expect within 20% */
+		ok(lowest > 800, "hash_pointer byte %u lowest %u", i, lowest);
+		ok(highest < 1200, "hash_pointer byte %u highest %u",
+		   i, highest);
+		diag("hash_pointer byte %u, range %u-%u", i, lowest, highest);
+	}
+
+	if (sizeof(long) == sizeof(uint32_t))
+		ok1(hashl(array, ARRAY_WORDS, 0)
+		    == hash(array, ARRAY_WORDS, 0));
+	else
+		ok1(hashl(array, ARRAY_WORDS, 0)
+		    == hash64(array, ARRAY_WORDS, 0));
+
+	/* String hash: weak, so only test bottom byte */
+	for (i = 0; i < 1; i++) {
+		unsigned int cursor, lowest = -1U, highest = 0;
+		char p[5];
+
+		memset(results, 0, sizeof(results));
+		memset(p, 'A', sizeof(p));
+		p[sizeof(p)-1] = '\0';
+
+		/* Odometer over all 4-char strings of 'A'..'z' after "AAAA". */
+		for (;;) {
+			for (cursor = 0; cursor < sizeof(p)-1; cursor++) {
+				p[cursor]++;
+				if (p[cursor] <= 'z')
+					break;
+				p[cursor] = 'A';
+			}
+			if (cursor == sizeof(p)-1)
+				break;
+			results[(hash_string(p) >> i*8)&0xFF]++;
+		}
+
+		for (j = 0; j < 256; j++) {
+			if (results[j] < lowest)
+				lowest = results[j];
+			if (results[j] > highest)
+				highest = results[j];
+		}
+		/* Expect within 20% */
+		ok(lowest > 35000, "hash_string byte %u lowest %u", i, lowest);
+		ok(highest < 53000, "hash_string byte %u highest %u",
+		   i, highest);
+		diag("hash_string byte %u, range %u-%u", i, lowest, highest);
+	}
+
+	return exit_status();
+}
diff --git a/lib/ccan/htable/LICENSE b/lib/ccan/htable/LICENSE
new file mode 100644
index 0000000000..d511905c16
--- /dev/null
+++ b/lib/ccan/htable/LICENSE
@@ -0,0 +1,339 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/lib/ccan/htable/_info b/lib/ccan/htable/_info
new file mode 100644
index 0000000000..8dabe46a50
--- /dev/null
+++ b/lib/ccan/htable/_info
@@ -0,0 +1,115 @@
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * htable - hash table routines
+ *
+ * A hash table is an efficient structure for looking up keys.  This version
+ * grows with usage and allows efficient deletion.
+ *
+ * Example:
+ *	#include <ccan/htable/htable.h>
+ *	#include <ccan/hash/hash.h>
+ *	#include <stdio.h>
+ *	#include <err.h>
+ *	#include <string.h>
+ *
+ *	struct name_to_digit {
+ *		const char *name;
+ *		unsigned int val;
+ *	};
+ *
+ *	static struct name_to_digit map[] = {
+ *		{ "zero", 0},
+ *		{ "one", 1 },
+ *		{ "two", 2 },
+ *		{ "three", 3 },
+ *		{ "four", 4 },
+ *		{ "five", 5 },
+ *		{ "six", 6 },
+ *		{ "seven", 7 },
+ *		{ "eight", 8 },
+ *		{ "nine", 9 }
+ *	};
+ *
+ *	// Wrapper for rehash function pointer.
+ *	static size_t rehash(const void *e, void *unused)
+ *	{
+ *		return hash_string(((struct name_to_digit *)e)->name);
+ *	}
+ *
+ *	// Comparison function.
+ *	static bool streq(const void *e, void *string)
+ *	{
+ *		return strcmp(((struct name_to_digit *)e)->name, string) == 0;
+ *	}
+ *
+ *	// We let them add their own aliases, eg. --alias=v=5
+ *	static void add_alias(struct htable *ht, const char *alias)
+ *	{
+ *		char *eq;
+ *		struct name_to_digit *n;
+ *
+ *		n = malloc(sizeof(*n));
+ *		n->name = strdup(alias);
+ *
+ *		eq = strchr(n->name, '=');
+ *		if (!eq || ((n->val = atoi(eq+1)) == 0 && strcmp(eq+1, "0") != 0))
+ *			errx(1, "Usage: --alias=<name>=<value>");
+ *		*eq = '\0';
+ *		htable_add(ht, hash_string(n->name), n);
+ *	}
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		struct htable *ht;
+ *		unsigned int i;
+ *		unsigned long val;
+ *
+ *		if (argc < 2)
+ *			errx(1, "Usage: %s [--alias=<name>=<val>]... <str>...",
+ *			     argv[0]);
+ *
+ *		// Create and populate hash table.
+ *		ht = htable_new(rehash, NULL);
+ *		for (i = 0; i < sizeof(map)/sizeof(map[0]); i++)
+ *			htable_add(ht, hash_string(map[i].name), &map[i]);
+ *
+ *		// Add any aliases to the hash table.
+ *		for (i = 1; i < argc; i++) {
+ *			if (!strncmp(argv[i], "--alias=", strlen("--alias=")))
+ *				add_alias(ht, argv[i] + strlen("--alias="));
+ *			else
+ *				break;
+ *		}
+ *
+ *		// Find the other args in the hash table.
+ *		for (val = 0; i < argc; i++) {
+ *			struct name_to_digit *n;
+ *			n = htable_get(ht, hash_string(argv[i]),
+ *				       streq, argv[i]);
+ *			if (!n)
+ *				errx(1, "Invalid digit name %s", argv[i]);
+ *			// Append it to the value we are building up.
+ *			val *= 10;
+ *			val += n->val;
+ *		}
+ *		printf("%lu\n", val);
+ *		return 0;
+ *	}
+ *
+ * License: GPLv2 (or later)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/compiler\n");
+		return 0;
+	}
+
+	return 1;
+}
diff --git a/lib/ccan/htable/htable.c b/lib/ccan/htable/htable.c
new file mode 100644
index 0000000000..a15c54d795
--- /dev/null
+++ b/lib/ccan/htable/htable.c
@@ -0,0 +1,290 @@
+#include <ccan/htable/htable.h>
+#include <ccan/compiler/compiler.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <assert.h>
+
+/* This means a struct htable takes at least 512 bytes / 1k (32/64 bits). */
+#define HTABLE_BASE_BITS 7
+
+/* We use 0x1 as deleted marker. */
+#define HTABLE_DELETED (0x1)
+
+struct htable {
+	size_t (*rehash)(const void *elem, void *priv);
+	void *priv;
+	unsigned int bits;
+	size_t elems, deleted, max, max_with_deleted;
+	/* These are the bits which are the same in all pointers. */
+	uintptr_t common_mask, common_bits;
+	uintptr_t perfect_bit;
+	uintptr_t *table;
+};
+
+/* We clear out the bits which are always the same, and put metadata there. */
+static inline uintptr_t get_extra_ptr_bits(const struct htable *ht,
+					   uintptr_t e)
+{
+	return e & ht->common_mask;
+}
+
+static inline void *get_raw_ptr(const struct htable *ht, uintptr_t e)
+{
+	return (void *)((e & ~ht->common_mask) | ht->common_bits);
+}
+
+static inline uintptr_t make_hval(const struct htable *ht,
+				  const void *p, uintptr_t bits)
+{
+	return ((uintptr_t)p & ~ht->common_mask) | bits;
+}
+
+static inline bool entry_is_valid(uintptr_t e)
+{
+	return e > HTABLE_DELETED;
+}
+
+static inline uintptr_t get_hash_ptr_bits(const struct htable *ht,
+					  size_t hash)
+{
+	/* Shuffling the extra bits (as specified in mask) down the
+	 * end is quite expensive.  But the lower bits are redundant, so
+	 * we fold the value first. */
+	return (hash ^ (hash >> ht->bits))
+		& ht->common_mask & ~ht->perfect_bit;
+}
+
+struct htable *htable_new(size_t (*rehash)(const void *elem, void *priv),
+			  void *priv)
+{
+	struct htable *ht = malloc(sizeof(struct htable));
+	if (ht) {
+		ht->bits = HTABLE_BASE_BITS;
+		ht->rehash = rehash;
+		ht->priv = priv;
+		ht->elems = 0;
+		ht->deleted = 0;
+		ht->max = ((size_t)1 << ht->bits) * 3 / 4;
+		ht->max_with_deleted = ((size_t)1 << ht->bits) * 9 / 10;
+		/* This guarantees we enter update_common first add. */
+		ht->common_mask = -1;
+		ht->common_bits = 0;
+		ht->perfect_bit = 0;
+		ht->table = calloc(1 << ht->bits, sizeof(uintptr_t));
+		if (!ht->table) {
+			free(ht);
+			ht = NULL;
+		}
+	}
+	return ht;
+}
+
+void htable_free(const struct htable *ht)
+{
+	free((void *)ht->table);
+	free((void *)ht);
+}
+
+static size_t hash_bucket(const struct htable *ht, size_t h)
+{
+	return h & ((1 << ht->bits)-1);
+}
+
+static void *htable_val(const struct htable *ht,
+			struct htable_iter *i, size_t hash, uintptr_t perfect)
+{
+	uintptr_t h2 = get_hash_ptr_bits(ht, hash) | perfect;
+
+	while (ht->table[i->off]) {
+		if (ht->table[i->off] != HTABLE_DELETED) {
+			if (get_extra_ptr_bits(ht, ht->table[i->off]) == h2)
+				return get_raw_ptr(ht, ht->table[i->off]);
+		}
+		i->off = (i->off + 1) & ((1 << ht->bits)-1);
+		h2 &= ~perfect;
+	}
+	return NULL;
+}
+
+void *htable_firstval(const struct htable *ht,
+		      struct htable_iter *i, size_t hash)
+{
+	i->off = hash_bucket(ht, hash);
+	return htable_val(ht, i, hash, ht->perfect_bit);
+}
+
+void *htable_nextval(const struct htable *ht,
+		     struct htable_iter *i, size_t hash)
+{
+	i->off = (i->off + 1) & ((1 << ht->bits)-1);
+	return htable_val(ht, i, hash, 0);
+}
+
+void *htable_first(const struct htable *ht, struct htable_iter *i)
+{
+	for (i->off = 0; i->off < (size_t)1 << ht->bits; i->off++) {
+		if (entry_is_valid(ht->table[i->off]))
+			return get_raw_ptr(ht, ht->table[i->off]);
+	}
+	return NULL;
+}
+
+void *htable_next(const struct htable *ht, struct htable_iter *i)
+{
+	for (i->off++; i->off < (size_t)1 << ht->bits; i->off++) {
+		if (entry_is_valid(ht->table[i->off]))
+			return get_raw_ptr(ht, ht->table[i->off]);
+	}
+	return NULL;
+}
+
+/* This does not expand the hash table, that's up to caller. */
+static void ht_add(struct htable *ht, const void *new, size_t h)
+{
+	size_t i;
+	uintptr_t perfect = ht->perfect_bit;
+
+	i = hash_bucket(ht, h);
+
+	while (entry_is_valid(ht->table[i])) {
+		perfect = 0;
+		i = (i + 1) & ((1 << ht->bits)-1);
+	}
+	ht->table[i] = make_hval(ht, new, get_hash_ptr_bits(ht, h)|perfect);
+}
+
+/* Double the bucket array and re-add every live entry.  Returns false, with
+ * the table left as it was, if the larger array cannot be allocated. */
+static COLD bool double_table(struct htable *ht)
+{
+	size_t i, oldnum = (size_t)1 << ht->bits;
+	uintptr_t *oldtable = ht->table, e;
+
+	ht->table = calloc((size_t)1 << (ht->bits+1), sizeof(*ht->table));
+	if (!ht->table) {
+		ht->table = oldtable;
+		return false;
+	}
+	ht->bits++;
+	ht->max *= 2;
+	ht->max_with_deleted *= 2;
+
+	/* If we lost our "perfect bit", get it back now. */
+	if (!ht->perfect_bit && ht->common_mask) {
+		for (i = 0; i < sizeof(ht->common_mask) * CHAR_BIT; i++) {
+			if (ht->common_mask & ((size_t)1 << i)) {
+				ht->perfect_bit = (size_t)1 << i;
+				break;
+			}
+		}
+	}
+
+	for (i = 0; i < oldnum; i++) {
+		if (entry_is_valid(e = oldtable[i])) {
+			void *p = get_raw_ptr(ht, e);
+			ht_add(ht, p, ht->rehash(p, ht->priv));
+		}
+	}
+	ht->deleted = 0;
+	free(oldtable);
+	return true;
+}
+
+static COLD void rehash_table(struct htable *ht)
+{
+	size_t start, i;
+	uintptr_t e;
+
+	/* Beware wrap cases: we need to start from first empty bucket. */
+	for (start = 0; ht->table[start]; start++);
+
+	for (i = 0; i < (size_t)1 << ht->bits; i++) {
+		size_t h = (i + start) & ((1 << ht->bits)-1);
+		e = ht->table[h];
+		if (!e)
+			continue;
+		if (e == HTABLE_DELETED)
+			ht->table[h] = 0;
+		else if (!(e & ht->perfect_bit)) {
+			void *p = get_raw_ptr(ht, e);
+			ht->table[h] = 0;
+			ht_add(ht, p, ht->rehash(p, ht->priv));
+		}
+	}
+	ht->deleted = 0;
+}
+
+/* p has bits outside the common set: shrink the mask and patch the entries. */
+static COLD void update_common(struct htable *ht, const void *p)
+{
+	size_t i;
+	uintptr_t maskdiff, bitsdiff;
+
+	if (ht->elems == 0) {
+		/* Keep p's top set bit out of the mask so its entry is never 0
+		 * (empty) or 1 (deleted).  Assumes (void *)1 is never added. */
+		for (i = sizeof(uintptr_t) * CHAR_BIT - 1; i > 0; i--)
+			if ((uintptr_t)p & ((uintptr_t)1 << i))
+				break;
+		ht->common_mask = ~((uintptr_t)1 << i);
+		ht->common_bits = (uintptr_t)p & ht->common_mask;
+		ht->perfect_bit = 1;
+		return;
+	}
+
+	/* Bits unequal to the old common set, and their value in old entries. */
+	maskdiff = ht->common_bits ^ ((uintptr_t)p & ht->common_mask);
+	bitsdiff = ht->common_bits & maskdiff;
+	for (i = 0; i < (size_t)1 << ht->bits; i++) {
+		if (!entry_is_valid(ht->table[i]))
+			continue;
+		/* Clear the bits no longer in the mask, set them as expected. */
+		ht->table[i] = (ht->table[i] & ~maskdiff) | bitsdiff;
+	}
+
+	/* Take away those bits from our mask, bits and perfect bit. */
+	ht->common_mask &= ~maskdiff;
+	ht->common_bits &= ~maskdiff;
+	ht->perfect_bit &= ~maskdiff;
+}
+
+bool htable_add(struct htable *ht, size_t hash, const void *p)
+{
+	if (ht->elems+1 > ht->max && !double_table(ht))
+		return false;
+	if (ht->elems+1 + ht->deleted > ht->max_with_deleted)
+		rehash_table(ht);
+	assert(p);
+	if (((uintptr_t)p & ht->common_mask) != ht->common_bits)
+		update_common(ht, p);
+
+	ht_add(ht, p, hash);
+	ht->elems++;
+	return true;
+}
+
+bool htable_del(struct htable *ht, size_t h, const void *p)
+{
+	struct htable_iter i;
+	void *c;
+
+	for (c = htable_firstval(ht,&i,h); c; c = htable_nextval(ht,&i,h)) {
+		if (c == p) {
+			htable_delval(ht, &i);
+			return true;
+		}
+	}
+	return false;
+}
+
+void htable_delval(struct htable *ht, struct htable_iter *i)
+{
+	assert(i->off < (size_t)1 << ht->bits);
+	assert(entry_is_valid(ht->table[i->off]));
+
+	ht->elems--;
+	ht->table[i->off] = HTABLE_DELETED;
+	ht->deleted++;
+}
diff --git a/lib/ccan/htable/htable.h b/lib/ccan/htable/htable.h
new file mode 100644
index 0000000000..b68442972c
--- /dev/null
+++ b/lib/ccan/htable/htable.h
@@ -0,0 +1,138 @@
+#ifndef CCAN_HTABLE_H
+#define CCAN_HTABLE_H
+#include "config.h"
+#include <stdbool.h>
+#include <stdlib.h>
+
+struct htable;
+
+/**
+ * htable_new - allocate a hash tree.
+ * @rehash: hash function to use for rehashing.
+ * @priv: private argument to @rehash function.
+ */
+struct htable *htable_new(size_t (*hash)(const void *elem, void *priv),
+			  void *priv);
+
+/**
+ * htable_free - deallocate a hash tree.
+ *
+ * This doesn't do anything to any pointers left in it.
+ */
+void htable_free(const struct htable *);
+
+/**
+ * htable_rehash - use a hashtree's rehash function
+ * @elem: the argument to rehash()
+ * NOTE: htable.c contains no definition of this function.
+ */
+size_t htable_rehash(const void *elem);
+
+/**
+ * htable_add - add a pointer into a hash tree.
+ * @ht: the htable
+ * @hash: the hash value of the object
+ * @p: the non-NULL pointer
+ *
+ * Also note that this can only fail due to allocation failure.  Otherwise, it
+ * returns true.
+ */
+bool htable_add(struct htable *ht, size_t hash, const void *p);
+
+/**
+ * htable_del - remove a pointer from a hash tree
+ * @ht: the htable
+ * @hash: the hash value of the object
+ * @p: the pointer
+ *
+ * Returns true if the pointer was found (and deleted).
+ */
+bool htable_del(struct htable *ht, size_t hash, const void *p);
+
+/**
+ * struct htable_iter - iterator for htable_first or htable_firstval etc.
+ *
+ * This refers to a location inside the hashtable.
+ */
+struct htable_iter {
+	size_t off;
+};
+
+/**
+ * htable_firstval - find a candidate for a given hash value
+ * @htable: the hashtable
+ * @i: the struct htable_iter to initialize
+ * @hash: the hash value
+ *
+ * You'll need to check the value is what you want; returns NULL if none.
+ * See Also:
+ *	htable_delval()
+ */
+void *htable_firstval(const struct htable *htable,
+		      struct htable_iter *i, size_t hash);
+
+/**
+ * htable_nextval - find another candidate for a given hash value
+ * @htable: the hashtable
+ * @i: the struct htable_iter to use
+ * @hash: the hash value
+ *
+ * You'll need to check the value is what you want; returns NULL if no more.
+ */
+void *htable_nextval(const struct htable *htable,
+		     struct htable_iter *i, size_t hash);
+
+/**
+ * htable_get - find an entry in the hash table
+ * @ht: the hashtable
+ * @h: the hash value of the entry
+ * @cmp: the comparison function
+ * @ptr: the pointer to hand to the comparison function.
+ *
+ * Convenient inline wrapper for htable_firstval/htable_nextval loop.
+ */
+static inline void *htable_get(const struct htable *ht,
+			       size_t h,
+			       bool (*cmp)(const void *candidate, void *ptr),
+			       const void *ptr)
+{
+	struct htable_iter i;
+	void *c;
+
+	for (c = htable_firstval(ht,&i,h); c; c = htable_nextval(ht,&i,h)) {
+		if (cmp(c, (void *)ptr))
+			return c;
+	}
+	return NULL;
+}
+
+/**
+ * htable_first - find an entry in the hash table
+ * @ht: the hashtable
+ * @i: the struct htable_iter to initialize
+ *
+ * Get an entry in the hashtable; NULL if empty.
+ */
+void *htable_first(const struct htable *htable, struct htable_iter *i);
+
+/**
+ * htable_next - find another entry in the hash table
+ * @ht: the hashtable
+ * @i: the struct htable_iter to use
+ *
+ * Get another entry in the hashtable; NULL if all done.
+ * This is usually used after htable_first or prior non-NULL htable_next.
+ */
+void *htable_next(const struct htable *htable, struct htable_iter *i);
+
+/**
+ * htable_delval - remove an iterated pointer from a hash tree
+ * @ht: the htable
+ * @i: the htable_iter
+ *
+ * Usually used to delete a hash entry after it has been found with
+ * htable_firstval etc.
+ */
+void htable_delval(struct htable *ht, struct htable_iter *i);
+
+#endif /* CCAN_HTABLE_H */
diff --git a/lib/ccan/htable/htable_type.h b/lib/ccan/htable/htable_type.h
new file mode 100644
index 0000000000..0d9e3fbb2d
--- /dev/null
+++ b/lib/ccan/htable/htable_type.h
@@ -0,0 +1,97 @@
+#ifndef CCAN_HTABLE_TYPE_H
+#define CCAN_HTABLE_TYPE_H
+#include <ccan/htable/htable.h>
+#include "config.h"
+
+/**
+ * HTABLE_DEFINE_TYPE - create a set of htable ops for a type
+ * @type: a type whose pointers will be values in the hash.
+ * @keyof: a function/macro to extract a key from a @type element.
+ * @hashfn: a hash function for a @key
+ * @cmpfn: a comparison function for two keyof()s.
+ * @name: a name for all the functions to define (of form htable_<name>_*)
+ *
+ * NULL values may not be placed into the hash table.
+ *
+ * The following wrapper functions are defined; each one is a
+ * simplified version of the htable.h equivalent:
+ *
+ *	// Creating and freeing.
+ *	struct htable_@name *htable_@name_new(void);
+ *	void htable_@name_free(const struct htable_@name *ht);
+ *
+ *	// Add, delete and find.
+ *	bool htable_@name_add(struct htable_@name *ht, const type *e);
+ *	bool htable_@name_del(struct htable_@name *ht, const type *e);
+ *	bool htable_@name_delkey(struct htable_@name *ht, const ktype *k);
+ *	type *htable_@name_get(const struct htable_@name *ht, const ktype *k);
+ *
+ *	// Iteration.
+ *	struct htable_@name_iter;
+ *	type *htable_@name_first(const struct htable_@name *ht,
+ *				 struct htable_@name_iter *i);
+ *	type *htable_@name_next(const struct htable_@name *ht,
+ *				struct htable_@name_iter *i);
+ */
+#define HTABLE_DEFINE_TYPE(type, keyof, hashfn, cmpfn, name)		\
+struct htable_##name;							\
+struct htable_##name##_iter { struct htable_iter i; };			\
+static inline size_t htable_##name##_hash(const void *elem, void *priv)	\
+{									\
+	return hashfn(keyof((const type *)elem));			\
+}									\
+static inline struct htable_##name *htable_##name##_new(void)		\
+{									\
+	return (struct htable_##name *)htable_new(htable_##name##_hash,	\
+						  NULL);		\
+}									\
+static inline void htable_##name##_free(const struct htable_##name *ht)	\
+{									\
+	htable_free((const struct htable *)ht);				\
+}									\
+static inline bool htable_##name##_add(struct htable_##name *ht,	\
+				       const type *elem)		\
+{									\
+	return htable_add((struct htable *)ht, hashfn(keyof(elem)), elem); \
+}									\
+static inline bool htable_##name##_del(struct htable_##name *ht,	\
+				       const type *elem)		\
+{									\
+	return htable_del((struct htable *)ht, hashfn(keyof(elem)), elem); \
+}									\
+static inline type *htable_##name##_get(const struct htable_##name *ht,	\
+					const HTABLE_KTYPE(keyof) k)	\
+{									\
+	/* Typecheck for cmpfn */					\
+	(void)sizeof(cmpfn((const type *)NULL,				\
+			   keyof((const type *)NULL)));			\
+	return (type *)htable_get((const struct htable *)ht,		\
+				  hashfn(k),				\
+				  (bool (*)(const void *, void *))(cmpfn), \
+				  k);					\
+}									\
+static inline bool htable_##name##_delkey(struct htable_##name *ht,	\
+					  const HTABLE_KTYPE(keyof) k) \
+{									\
+	type *elem = htable_##name##_get(ht, k);			\
+	if (elem)							\
+		return htable_##name##_del(ht, elem);			\
+	return false;							\
+}									\
+static inline type *htable_##name##_first(const struct htable_##name *ht, \
+					  struct htable_##name##_iter *iter) \
+{									\
+	return htable_first((const struct htable *)ht, &iter->i);	\
+}									\
+static inline type *htable_##name##_next(const struct htable_##name *ht, \
+					 struct htable_##name##_iter *iter) \
+{									\
+	return htable_next((const struct htable *)ht, &iter->i);	\
+}
+
+#if HAVE_TYPEOF
+#define HTABLE_KTYPE(keyof) typeof(keyof(NULL))
+#else
+#define HTABLE_KTYPE(keyof) void *
+#endif
+#endif /* CCAN_HTABLE_TYPE_H */
diff --git a/lib/ccan/htable/test/run-type.c b/lib/ccan/htable/test/run-type.c
new file mode 100644
index 0000000000..02dac29e10
--- /dev/null
+++ b/lib/ccan/htable/test/run-type.c
@@ -0,0 +1,176 @@
+#include <ccan/htable/htable_type.h>
+#include <ccan/htable/htable.c>
+#include <ccan/tap/tap.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define NUM_VALS (1 << HTABLE_BASE_BITS)
+
+struct obj {
+	/* Makes sure we don't try to treat an obj as a key or vice versa */
+	unsigned char unused;
+	unsigned int key;
+};
+
+static const unsigned int *objkey(const struct obj *obj)
+{
+	return &obj->key;
+}
+
+/* We use the number divided by two as the hash (for lots of
+   collisions), plus set all the higher bits so we can detect if they
+   don't get masked out. */
+static size_t objhash(const unsigned int *key)
+{
+	size_t h = *key / 2;
+	h |= -1UL << HTABLE_BASE_BITS;
+	return h;
+}
+
+static bool cmp(const struct obj *obj, const unsigned int *key)
+{
+	return obj->key == *key;
+}
+
+HTABLE_DEFINE_TYPE(struct obj, objkey, objhash, cmp, obj);
+
+static void add_vals(struct htable_obj *ht,
+		     struct obj val[], unsigned int num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		if (htable_obj_get(ht, &i)) {
+			fail("%u already in hash", i);
+			return;
+		}
+		htable_obj_add(ht, &val[i]);
+		if (htable_obj_get(ht, &i) != &val[i]) {
+			fail("%u not added to hash", i);
+			return;
+		}
+	}
+	pass("Added %u numbers to hash", i);
+}
+
+static void find_vals(const struct htable_obj *ht,
+		      const struct obj val[], unsigned int num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		if (htable_obj_get(ht, &i) != &val[i]) {
+			fail("%u not found in hash", i);
+			return;
+		}
+	}
+	pass("Found %u numbers in hash", i);
+}
+
+static void del_vals(struct htable_obj *ht,
+		     const struct obj val[], unsigned int num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		if (!htable_obj_delkey(ht, &val[i].key)) {
+			fail("%u not deleted from hash", i);
+			return;
+		}
+	}
+	pass("Deleted %u numbers in hash", i);
+}
+
+static void del_vals_bykey(struct htable_obj *ht,
+			   const struct obj val[], unsigned int num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		if (!htable_obj_delkey(ht, &i)) {
+			fail("%u not deleted by key from hash", i);
+			return;
+		}
+	}
+	pass("Deleted %u numbers by key from hash", i);
+}
+
+static bool check_mask(struct htable *ht, const struct obj val[], unsigned num)
+{
+	uint64_t i;
+
+	for (i = 0; i < num; i++) {
+		if (((uintptr_t)&val[i] & ht->common_mask) != ht->common_bits)
+			return false;
+	}
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct htable_obj *ht;
+	struct obj val[NUM_VALS];
+	unsigned int dne;
+	void *p;
+	struct htable_obj_iter iter;
+
+	plan_tests(20);
+	for (i = 0; i < NUM_VALS; i++)
+		val[i].key = i;
+	dne = i;
+
+	ht = htable_obj_new();
+	ok1(((struct htable *)ht)->max < (1 << ((struct htable *)ht)->bits));
+	ok1(((struct htable *)ht)->bits == HTABLE_BASE_BITS);
+
+	/* We cannot find an entry which doesn't exist. */
+	ok1(!htable_obj_get(ht, &dne));
+
+	/* Fill it, it should increase in size (once). */
+	add_vals(ht, val, NUM_VALS);
+	ok1(((struct htable *)ht)->bits == HTABLE_BASE_BITS + 1);
+	ok1(((struct htable *)ht)->max < (1 << ((struct htable *)ht)->bits));
+
+	/* Mask should be set. */
+	ok1(((struct htable *)ht)->common_mask != 0);
+	ok1(((struct htable *)ht)->common_mask != -1);
+	ok1(check_mask((struct htable *)ht, val, NUM_VALS));
+
+	/* Find all. */
+	find_vals(ht, val, NUM_VALS);
+	ok1(!htable_obj_get(ht, &dne));
+
+	/* Walk once, should get them all. */
+	i = 0;
+	for (p = htable_obj_first(ht,&iter); p; p = htable_obj_next(ht, &iter))
+		i++;
+	ok1(i == NUM_VALS);
+
+	/* Delete all. */
+	del_vals(ht, val, NUM_VALS);
+	ok1(!htable_obj_get(ht, &val[0].key));
+
+	/* Worst case, a "pointer" which doesn't have any matching bits. */
+	htable_add((struct htable *)ht, 0,
+		   (void *)~(uintptr_t)&val[NUM_VALS-1]);
+	htable_obj_add(ht, &val[NUM_VALS-1]);
+	ok1(((struct htable *)ht)->common_mask == 0);
+	ok1(((struct htable *)ht)->common_bits == 0);
+	/* Delete the bogus one before we trip over it. */
+	htable_del((struct htable *)ht, 0,
+		   (void *)~(uintptr_t)&val[NUM_VALS-1]);
+
+	/* Add the rest. */
+	add_vals(ht, val, NUM_VALS-1);
+
+	/* Check we can find them all. */
+	find_vals(ht, val, NUM_VALS);
+	ok1(!htable_obj_get(ht, &dne));
+
+	/* Delete them all by key. */
+	del_vals_bykey(ht, val, NUM_VALS);
+	htable_obj_free(ht);
+
+	return exit_status();
+}
diff --git a/lib/ccan/htable/test/run.c b/lib/ccan/htable/test/run.c
new file mode 100644
index 0000000000..ece46a0fd7
--- /dev/null
+++ b/lib/ccan/htable/test/run.c
@@ -0,0 +1,176 @@
+#include <ccan/htable/htable.h>
+#include <ccan/htable/htable.c>
+#include <ccan/tap/tap.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define NUM_VALS (1 << HTABLE_BASE_BITS)
+
+/* We use the number divided by two as the hash (for lots of
+   collisions), plus set all the higher bits so we can detect if they
+   don't get masked out. */
+static size_t hash(const void *elem, void *unused)
+{
+	size_t h = *(uint64_t *)elem / 2;
+	h |= -1UL << HTABLE_BASE_BITS;
+	return h;
+}
+
+static bool objcmp(const void *htelem, void *cmpdata)
+{
+	return *(uint64_t *)htelem == *(uint64_t *)cmpdata;
+}
+
+static void add_vals(struct htable *ht,
+		     const uint64_t val[], unsigned int num)
+{
+	uint64_t i;
+
+	for (i = 0; i < num; i++) {
+		if (htable_get(ht, hash(&i, NULL), objcmp, &i)) {
+			fail("%llu already in hash", (long long)i);
+			return;
+		}
+		htable_add(ht, hash(&val[i], NULL), &val[i]);
+		if (htable_get(ht, hash(&i, NULL), objcmp, &i) != &val[i]) {
+			fail("%llu not added to hash", (long long)i);
+			return;
+		}
+	}
+	pass("Added %llu numbers to hash", (long long)i);
+}
+
+#if 0
+static void refill_vals(struct htable *ht,
+			const uint64_t val[], unsigned int num)
+{
+	uint64_t i;
+
+	for (i = 0; i < num; i++) {
+		if (htable_get(ht, hash(&i, NULL), objcmp, &i))
+			continue;
+		htable_add(ht, hash(&val[i], NULL), &val[i]);
+	}
+}
+#endif
+
+static void find_vals(struct htable *ht,
+		      const uint64_t val[], unsigned int num)
+{
+	uint64_t i;
+
+	for (i = 0; i < num; i++) {
+		if (htable_get(ht, hash(&i, NULL), objcmp, &i) != &val[i]) {
+			fail("%llu not found in hash", (long long)i);
+			return;
+		}
+	}
+	pass("Found %llu numbers in hash", (long long)i);
+}
+
+static void del_vals(struct htable *ht,
+		     const uint64_t val[], unsigned int num)
+{
+	uint64_t i;
+
+	for (i = 0; i < num; i++) {
+		if (!htable_del(ht, hash(&val[i], NULL), &val[i])) {
+			fail("%llu not deleted from hash", (long long)i);
+			return;
+		}
+	}
+	pass("Deleted %llu numbers in hash", (long long)i);
+}
+
+static bool check_mask(struct htable *ht, uint64_t val[], unsigned num)
+{
+	uint64_t i;
+
+	for (i = 0; i < num; i++) {
+		if (((uintptr_t)&val[i] & ht->common_mask) != ht->common_bits)
+			return false;
+	}
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	uintptr_t perfect_bit;
+	struct htable *ht;
+	uint64_t val[NUM_VALS];
+	uint64_t dne;
+	void *p;
+	struct htable_iter iter;
+
+	plan_tests(23);
+	for (i = 0; i < NUM_VALS; i++)
+		val[i] = i;
+	dne = i;
+
+	ht = htable_new(hash, NULL);
+	ok1(ht->max < (1 << ht->bits));
+	ok1(ht->bits == HTABLE_BASE_BITS);
+
+	/* We cannot find an entry which doesn't exist. */
+	ok1(!htable_get(ht, hash(&dne, NULL), objcmp, &dne));
+
+	/* Fill it, it should increase in size (once). */
+	add_vals(ht, val, NUM_VALS);
+	ok1(ht->bits == HTABLE_BASE_BITS + 1);
+	ok1(ht->max < (1 << ht->bits));
+
+	/* Mask should be set. */
+	ok1(ht->common_mask != 0);
+	ok1(ht->common_mask != -1);
+	ok1(check_mask(ht, val, NUM_VALS));
+
+	/* Find all. */
+	find_vals(ht, val, NUM_VALS);
+	ok1(!htable_get(ht, hash(&dne, NULL), objcmp, &dne));
+
+	/* Walk once, should get them all. */
+	i = 0;
+	for (p = htable_first(ht,&iter); p; p = htable_next(ht, &iter))
+		i++;
+	ok1(i == NUM_VALS);
+
+	/* Delete all. */
+	del_vals(ht, val, NUM_VALS);
+	ok1(!htable_get(ht, hash(&val[0], NULL), objcmp, &val[0]));
+
+	/* Worst case, a "pointer" which doesn't have any matching bits. */
+	htable_add(ht, 0, (void *)~(uintptr_t)&val[NUM_VALS-1]);
+	htable_add(ht, hash(&val[NUM_VALS-1], NULL), &val[NUM_VALS-1]);
+	ok1(ht->common_mask == 0);
+	ok1(ht->common_bits == 0);
+	/* Get rid of bogus pointer before we trip over it! */
+	htable_del(ht, 0, (void *)~(uintptr_t)&val[NUM_VALS-1]);
+
+	/* Add the rest. */
+	add_vals(ht, val, NUM_VALS-1);
+
+	/* Check we can find them all. */
+	find_vals(ht, val, NUM_VALS);
+	ok1(!htable_get(ht, hash(&dne, NULL), objcmp, &dne));
+
+	htable_free(ht);
+
+	/* Corner cases: wipe out the perfect bit using bogus pointer. */
+	ht = htable_new(hash, NULL);
+	htable_add(ht, 0, (void *)((uintptr_t)&val[NUM_VALS-1]));
+	ok1(ht->perfect_bit);
+	perfect_bit = ht->perfect_bit;
+	htable_add(ht, 0, (void *)((uintptr_t)&val[NUM_VALS-1]
+				   | perfect_bit));
+	ok1(ht->perfect_bit == 0);
+	htable_del(ht, 0, (void *)((uintptr_t)&val[NUM_VALS-1] | perfect_bit));
+
+	/* Enlarging should restore it... */
+	add_vals(ht, val, NUM_VALS-1);
+
+	ok1(ht->perfect_bit != 0);
+	htable_free(ht);
+
+	return exit_status();
+}
diff --git a/lib/ccan/htable/tools/Makefile b/lib/ccan/htable/tools/Makefile
new file mode 100644
index 0000000000..001e160b78
--- /dev/null
+++ b/lib/ccan/htable/tools/Makefile
@@ -0,0 +1,5 @@
+CFLAGS=-Wall -Werror -O3 -I../../..
+
+speed: speed.o ../../hash.o
+# speed.c #includes htable.c and htable_type.h (which pulls in htable.h).
+speed.o: speed.c ../htable.h ../htable_type.h ../htable.c
diff --git a/lib/ccan/htable/tools/speed.c b/lib/ccan/htable/tools/speed.c
new file mode 100644
index 0000000000..26231924a1
--- /dev/null
+++ b/lib/ccan/htable/tools/speed.c
@@ -0,0 +1,377 @@
+/* Simple speed tests for hashtables. */
+#include <ccan/htable/htable_type.h>
+#include <ccan/htable/htable.c>
+#include <ccan/hash/hash.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+static size_t hashcount;
+struct object {
+	/* The key. */
+	unsigned int key;
+
+	/* Some contents. Doubles as consistency check. */
+	struct object *self;
+};
+
+static const unsigned int *objkey(const struct object *obj)
+{
+	return &obj->key;
+}
+
+static size_t hash_obj(const unsigned int *key)
+{
+	hashcount++;
+	return hashl(key, 1, 0);
+}
+
+static bool cmp(const struct object *obj, const unsigned int *key)
+{
+	return obj->key == *key;	/* cmpfn gets (element, key) */
+}
+
+HTABLE_DEFINE_TYPE(struct object, objkey, hash_obj, cmp, obj);
+
+/*
+ * popcount - count the bits set in @val.
+ *
+ * Without the compiler builtin, sum adjacent fields of 1, 2, 4, 8, 16 and
+ * 32 bits in turn.  Working in 64 bits is right for any sizeof(long): a
+ * narrower value is zero-extended, which contributes no set bits.
+ */
+static unsigned int popcount(unsigned long val)
+{
+#if HAVE_BUILTIN_POPCOUNTL
+	return __builtin_popcountl(val);
+#else
+	uint64_t v = val;
+
+	v = (v & 0x5555555555555555ULL)
+		+ ((v >> 1) & 0x5555555555555555ULL);
+	v = (v & 0x3333333333333333ULL)
+		+ ((v >> 2) & 0x3333333333333333ULL);
+	v = (v & 0x0F0F0F0F0F0F0F0FULL)
+		+ ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL);
+	v = (v & 0x00FF00FF00FF00FFULL)
+		+ ((v >> 8) & 0x00FF00FF00FF00FFULL);
+	v = (v & 0x0000FFFF0000FFFFULL)
+		+ ((v >> 16) & 0x0000FFFF0000FFFFULL);
+	v = (v & 0x00000000FFFFFFFFULL)
+		+ ((v >> 32) & 0x00000000FFFFFFFFULL);
+	return v;
+#endif
+}
+
+static size_t perfect(const struct htable *ht)
+{
+	size_t i, placed_perfect = 0;
+
+	for (i = 0; i < ((size_t)1 << ht->bits); i++) {
+		if (!entry_is_valid(ht->table[i]))
+			continue;
+		if (hash_bucket(ht, ht->rehash(get_raw_ptr(ht, ht->table[i]),
+					       ht->priv)) == i) {
+			assert((ht->table[i] & ht->perfect_bit)
+			       == ht->perfect_bit);
+			placed_perfect++;
+		}
+	}
+	return placed_perfect;
+}
+
+static size_t count_deleted(const struct htable *ht)
+{
+	size_t i, delete_markers = 0;
+
+	for (i = 0; i < ((size_t)1 << ht->bits); i++) {
+		if (ht->table[i] == HTABLE_DELETED)
+			delete_markers++;
+	}
+	return delete_markers;
+}
+
+/* Nanoseconds per operation */
+static size_t normalize(const struct timeval *start,
+			const struct timeval *stop,
+			unsigned int num)
+{
+	struct timeval diff;
+
+	timersub(stop, start, &diff);
+
+	/* Floating point is more accurate here. */
+	return (double)(diff.tv_sec * 1000000 + diff.tv_usec)
+		/ num * 1000;
+}
+
+static size_t worst_run(struct htable *ht, size_t *deleted)
+{
+	size_t longest = 0, len = 0, this_del = 0, i;
+
+	*deleted = 0;
+	/* This doesn't take into account end-wrap, but gives an idea. */
+	for (i = 0; i < ((size_t)1 << ht->bits); i++) {
+		if (ht->table[i]) {
+			len++;
+			if (ht->table[i] == HTABLE_DELETED)
+				this_del++;
+		} else {
+			if (len > longest) {
+				longest = len;
+				*deleted = this_del;
+			}
+			len = 0;
+			this_del = 0;
+		}
+	}
+	return longest;
+}
+
+int main(int argc, char *argv[])
+{
+	struct object *objs;
+	size_t i, j, num, deleted;
+	unsigned int key;	/* lookups take a const unsigned int *key */
+	struct timeval start, stop;
+	struct htable_obj *ht;
+	struct htable *htr;
+	bool make_dumb = false;
+
+	if (argv[1] && strcmp(argv[1], "--dumb") == 0) {
+		argv++;
+		make_dumb = true;
+	}
+	num = argv[1] ? atoi(argv[1]) : 1000000;
+	objs = calloc(num, sizeof(objs[0]));
+	for (i = 0; i < num; i++) {
+		objs[i].key = i;
+		objs[i].self = &objs[i];
+	}
+
+	ht = htable_obj_new();
+	htr = (void *)ht;
+
+	printf("Initial insert: ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		htable_obj_add(ht, objs[i].self);
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+	printf("Details: hash size %u, mask bits %u, perfect %.0f%%\n",
+	       1U << htr->bits, popcount(htr->common_mask),
+	       perfect(htr) * 100.0 / htr->elems);
+
+	if (make_dumb) {
+		/* Screw with mask, to hobble us. */
+		update_common(htr, (void *)~htr->common_bits);
+		printf("Details: DUMB MODE: mask bits %u\n",
+		       popcount(htr->common_mask));
+	}
+
+	printf("Initial lookup (match): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (key = 0; key < num; key++)
+		if (htable_obj_get(ht, &key)->self != objs[key].self)
+			abort();
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Initial lookup (miss): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		unsigned int n = i + num;
+		if (htable_obj_get(ht, &n))
+			abort();
+	}
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	/* Lookups in order are very cache-friendly for judy; try random */
+	printf("Initial lookup (random): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0, key = 0; i < num; i++, key = (key + 10007) % num)
+		if (htable_obj_get(ht, &key)->self != &objs[key])
+			abort();
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	hashcount = 0;
+	printf("Initial delete all: ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (!htable_obj_del(ht, objs[i].self))
+			abort();
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+	printf("Details: rehashes %zu\n", hashcount);
+
+	printf("Initial re-inserting: ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		htable_obj_add(ht, objs[i].self);
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	hashcount = 0;
+	printf("Deleting first half: ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i+=2)
+		if (!htable_obj_del(ht, objs[i].self))
+			abort();
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Details: rehashes %zu, delete markers %zu\n",
+	       hashcount, count_deleted(htr));
+
+	printf("Adding (a different) half: ");
+	fflush(stdout);
+
+	for (i = 0; i < num; i+=2)
+		objs[i].key = num+i;
+
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i+=2)
+		htable_obj_add(ht, objs[i].self);
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Details: delete markers %zu, perfect %.0f%%\n",
+	       count_deleted(htr), perfect(htr) * 100.0 / htr->elems);
+
+	printf("Lookup after half-change (match): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (key = 1; key < num; key+=2)
+		if (htable_obj_get(ht, &key)->self != objs[key].self)
+			abort();
+	for (i = 0; i < num; i+=2) {
+		unsigned int n = i + num;
+		if (htable_obj_get(ht, &n)->self != objs[i].self)
+			abort();
+	}
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Lookup after half-change (miss): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		unsigned int n = i + num * 2;
+		if (htable_obj_get(ht, &n))
+			abort();
+	}
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	/* Hashtables with delete markers can fill with markers over time.
+	 * so do some changes to see how it operates in long-term. */
+	for (i = 0; i < 5; i++) {
+		if (i == 0) {
+			/* We don't measure this: jmap is different. */
+			printf("Details: initial churn\n");
+		} else {
+			printf("Churning %s time: ",
+			       i == 1 ? "second"
+			       : i == 2 ? "third"
+			       : i == 3 ? "fourth"
+			       : "fifth");
+			fflush(stdout);
+		}
+		gettimeofday(&start, NULL);
+		for (j = 0; j < num; j++) {
+			if (!htable_obj_del(ht, &objs[j]))
+				abort();
+			objs[j].key = num*i+j;
+			if (!htable_obj_add(ht, &objs[j]))
+				abort();
+		}
+		gettimeofday(&stop, NULL);
+		if (i != 0)
+			printf(" %zu ns\n", normalize(&start, &stop, num));
+	}
+
+	/* Spread out the keys more to try to make it harder. */
+	printf("Details: reinserting with spread\n");
+	for (i = 0; i < num; i++) {
+		if (!htable_obj_del(ht, objs[i].self))
+			abort();
+		objs[i].key = num * 5 + i * 9;
+		if (!htable_obj_add(ht, objs[i].self))
+			abort();
+	}
+	printf("Details: delete markers %zu, perfect %.0f%%\n",
+	       count_deleted(htr), perfect(htr) * 100.0 / htr->elems);
+	i = worst_run(htr, &deleted);
+	printf("Details: worst run %zu (%zu deleted)\n", i, deleted);
+
+	printf("Lookup after churn & spread (match): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		unsigned int n = num * 5 + i * 9;
+		if (htable_obj_get(ht, &n)->self != objs[i].self)
+			abort();
+	}
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Lookup after churn & spread (miss): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		unsigned int n = num * (5 + 9) + i * 9;
+		if (htable_obj_get(ht, &n))
+			abort();
+	}
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Lookup after churn & spread (random): ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0, j = 0; i < num; i++, j = (j + 10007) % num) {
+		unsigned int n = num * 5 + j * 9;
+		if (htable_obj_get(ht, &n)->self != &objs[j])
+			abort();
+	}
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	hashcount = 0;
+	printf("Deleting half after churn & spread: ");
+	fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i+=2)
+		if (!htable_obj_del(ht, objs[i].self))
+			abort();
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Adding (a different) half after churn & spread: ");
+	fflush(stdout);
+
+	for (i = 0; i < num; i+=2)
+		objs[i].key = num*6+i*9;
+
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i+=2)
+		htable_obj_add(ht, objs[i].self);
+	gettimeofday(&stop, NULL);
+	printf(" %zu ns\n", normalize(&start, &stop, num));
+
+	printf("Details: delete markers %zu, perfect %.0f%%\n",
+	       count_deleted(htr), perfect(htr) * 100.0 / htr->elems);
+
+	return 0;
+}
diff --git a/lib/ccan/ilog/LICENSE b/lib/ccan/ilog/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/ilog/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/ilog/_info b/lib/ccan/ilog/_info
new file mode 100644
index 0000000000..56de50d610
--- /dev/null
+++ b/lib/ccan/ilog/_info
@@ -0,0 +1,47 @@
+/**
+ * ilog - Integer logarithm.
+ *
+ * ilog32() and ilog64() compute the minimum number of bits required to store
+ *  an unsigned 32-bit or 64-bit value without any leading zero bits.
+ * This can also be thought of as the location of the highest set bit, with
+ *  counting starting from one (so that 0 returns 0, 1 returns 1, and 2**31
+ *  returns 32).
+ * When the value is known to be non-zero ilog32_nz() and ilog64_nz() can
+ *  compile into as few as two instructions, one of which may get optimized out
+ *  later.
+ * STATIC_ILOG_32 and STATIC_ILOG_64 allow computation on compile-time
+ *  constants, so other compile-time constants can be derived from them.
+ *
+ * Example:
+ *  #include <stdio.h>
+ *  #include <limits.h>
+ *  #include <ccan/ilog/ilog.h>
+ *
+ *  int main(void){
+ *    int i;
+ *    printf("ilog32(0x%08X)=%i\n",0,ilog32(0));
+ *    for(i=1;i<=STATIC_ILOG_32(USHRT_MAX);i++){
+ *      uint32_t v;
+ *      v=(uint32_t)1U<<(i-1);
+ *      //Here we know v is non-zero, so we can use ilog32_nz().
+ *      printf("ilog32(0x%08X)=%i\n",v,ilog32_nz(v));
+ *    }
+ *    return 0;
+ *  }
+ *
+ * License: LGPL (v2 or later)
+ * Author: Timothy B. Terriberry <tterribe@xiph.org>
+ */
+#include <string.h>
+#include <stdio.h>
+#include "config.h"
+
+int main(int _argc,const char *_argv[]){ /*Reports this module's dependencies on request.*/
+  /*Expect exactly one argument.*/
+  if(_argc!=2)return 1;
+  if(strcmp(_argv[1],"depends")==0){ /*The only query handled is depends.*/
+    printf("ccan/compiler\n"); /*One dependency per line; ilog.h includes ccan/compiler/compiler.h.*/
+    return 0;
+  }
+  return 1; /*Any other argument yields exit status 1.*/
+}
diff --git a/lib/ccan/ilog/ilog.c b/lib/ccan/ilog/ilog.c
new file mode 100644
index 0000000000..40c5a6fd50
--- /dev/null
+++ b/lib/ccan/ilog/ilog.c
@@ -0,0 +1,139 @@
+/*(C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 LGPL (v2 or later).*/
+#include "ilog.h"
+#include <limits.h>
+
+/*The fastest fallback strategy for platforms with fast multiplication appears
+   to be based on de Bruijn sequences~\cite{LP98}.
+  Tests confirmed this to be true even on an ARM11, where it is actually faster
+   than using the native clz instruction.
+  Define ILOG_NODEBRUIJN to use a simpler fallback on platforms where
+   multiplication or table lookups are too expensive.
+
+  @UNPUBLISHED{LP98,
+    author="Charles E. Leiserson and Harald Prokop",
+    title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
+    month=Jun,
+    year=1998,
+    note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
+  }*/
+static UNNEEDED const unsigned char DEBRUIJN_IDX32[32]={ /*Looked up with the top 5 bits of (2**k)*0x77CB531U; the entry is k.*/
+   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
+  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
+};
+
+/* We always compile these in, in case someone takes address of function; the #undefs remove any macros with these names (ilog.h mentions fast macro versions) so the definitions below are real functions. */
+#undef ilog32_nz
+#undef ilog32
+#undef ilog64_nz
+#undef ilog64
+
+int ilog32(uint32_t _v){ /*Returns floor(log2(_v))+1, or 0 if _v==0.*/
+/*On a Pentium M, this branchless version tested as the fastest version without
+   multiplications on 1,000,000,000 random 32-bit integers, edging out a
+   similar version with branches, and a 256-entry LUT version.*/
+# if defined(ILOG_NODEBRUIJN)
+  int ret;
+  int m;
+  ret=_v>0; /*Start at 1 for any non-zero input; zero stays 0.*/
+  m=(_v>0xFFFFU)<<4; /*m=16 if a bit above bit 15 is set, else 0.*/
+  _v>>=m; /*Move the half holding the highest set bit into the low 16 bits.*/
+  ret|=m; /*Each step's m is a distinct power of two, so OR accumulates the shifts.*/
+  m=(_v>0xFFU)<<3; /*Same narrowing with 8 bits...*/
+  _v>>=m;
+  ret|=m;
+  m=(_v>0xFU)<<2; /*...then with 4 bits...*/
+  _v>>=m;
+  ret|=m;
+  m=(_v>3)<<1; /*...then with 2 bits.*/
+  _v>>=m;
+  ret|=m;
+  ret+=_v>1; /*_v is now at most 3; add one more if its bit 1 is set.*/
+  return ret;
+/*This de Bruijn sequence version is faster if you have a fast multiplier.*/
+# else
+  int ret;
+  ret=_v>0; /*Start at 1 for any non-zero input; zero stays 0.*/
+  _v|=_v>>1; /*These five steps copy the highest set bit into every lower bit.*/
+  _v|=_v>>2;
+  _v|=_v>>4;
+  _v|=_v>>8;
+  _v|=_v>>16;
+  _v=(_v>>1)+1; /*Now _v==2**k, k being the index of the highest set bit (k=0 for input 0).*/
+  ret+=DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F]; /*Top 5 bits of the product select the table entry holding k.*/
+  return ret;
+# endif
+}
+
+int ilog32_nz(uint32_t _v) /*Out-of-line version: simply defers to ilog32().*/
+{
+  return ilog32(_v);
+}
+
+int ilog64(uint64_t _v){ /*Returns floor(log2(_v))+1, or 0 if _v==0.*/
+# if defined(ILOG_NODEBRUIJN)
+  uint32_t v;
+  int      ret;
+  int      m;
+  ret=_v>0; /*Start at 1 for any non-zero input; zero stays 0.*/
+  m=(_v>0xFFFFFFFFU)<<5; /*m=32 if the upper 32 bits are non-zero, else 0.*/
+  v=(uint32_t)(_v>>m); /*Continue with whichever 32-bit half holds the highest set bit.*/
+  ret|=m; /*Each step's m is a distinct power of two, so OR accumulates the shifts.*/
+  m=(v>0xFFFFU)<<4; /*Same narrowing with 16 bits...*/
+  v>>=m;
+  ret|=m;
+  m=(v>0xFFU)<<3; /*...then with 8 bits...*/
+  v>>=m;
+  ret|=m;
+  m=(v>0xFU)<<2; /*...then with 4 bits...*/
+  v>>=m;
+  ret|=m;
+  m=(v>3)<<1; /*...then with 2 bits.*/
+  v>>=m;
+  ret|=m;
+  ret+=v>1; /*v is now at most 3; add one more if its bit 1 is set.*/
+  return ret;
+# else
+/*If we don't have a 64-bit word, split it into two 32-bit halves.*/
+#  if LONG_MAX<9223372036854775807LL
+  uint32_t v;
+  int      ret;
+  int      m;
+  ret=_v>0; /*Start at 1 for any non-zero input; zero stays 0.*/
+  m=(_v>0xFFFFFFFFU)<<5; /*m=32 if the upper 32 bits are non-zero, else 0.*/
+  v=(uint32_t)(_v>>m); /*Continue with whichever 32-bit half holds the highest set bit.*/
+  ret|=m; /*Account for the 32 bits skipped when the upper half was chosen.*/
+  v|=v>>1; /*These five steps copy the highest set bit of v into every lower bit.*/
+  v|=v>>2;
+  v|=v>>4;
+  v|=v>>8;
+  v|=v>>16;
+  v=(v>>1)+1; /*Now v==2**k, k being the index of the highest set bit within the half.*/
+  ret+=DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F]; /*Top 5 bits of the product select the table entry holding k.*/
+  return ret;
+/*Otherwise do it in one 64-bit operation.*/
+#  else
+  static const unsigned char DEBRUIJN_IDX64[64]={ /*Looked up with the top 6 bits of (2**k)*0x218A392CD3D5DBF; the entry is k.*/
+     0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
+     5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
+    63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
+    62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
+  };
+  int ret;
+  ret=_v>0; /*Start at 1 for any non-zero input; zero stays 0.*/
+  _v|=_v>>1; /*These six steps copy the highest set bit into every lower bit.*/
+  _v|=_v>>2;
+  _v|=_v>>4;
+  _v|=_v>>8;
+  _v|=_v>>16;
+  _v|=_v>>32;
+  _v=(_v>>1)+1; /*Now _v==2**k, k being the index of the highest set bit (k=0 for input 0).*/
+  ret+=DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F]; /*Top 6 bits of the product select the table entry holding k.*/
+  return ret;
+#  endif
+# endif
+}
+
+int ilog64_nz(uint64_t _v) /*Out-of-line version: simply defers to ilog64().*/
+{
+  return ilog64(_v);
+}
diff --git a/lib/ccan/ilog/ilog.h b/lib/ccan/ilog/ilog.h
new file mode 100644
index 0000000000..55dd009885
--- /dev/null
+++ b/lib/ccan/ilog/ilog.h
@@ -0,0 +1,150 @@
+#if !defined(_ilog_H)
+# define _ilog_H (1)
+# include "config.h"
+# include <stdint.h>
+# include <limits.h>
+# include <ccan/compiler/compiler.h>
+
+/**
+ * ilog32 - Integer binary logarithm of a 32-bit value.
+ * @_v: A 32-bit value.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * Note that many uses will resolve to the fast macro version instead.
+ *
+ * See Also:
+ *	ilog32_nz(), ilog64()
+ *
+ * Example:
+ *	// Rounds up to next power of 2 (if not a power of 2).
+ *	static uint32_t round_up32(uint32_t i)
+ *	{
+ *		assert(i != 0 && i <= 0x80000000U);
+ *		return 1U << ilog32(i-1);
+ *	}
+ */
+int ilog32(uint32_t _v) IDEMPOTENT;
+
+/**
+ * ilog32_nz - Integer binary logarithm of a non-zero 32-bit value.
+ * @_v: A 32-bit value.
+ * Returns floor(log2(_v))+1, or undefined if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * Note that many uses will resolve to the fast macro version instead.
+ * See Also:
+ *	ilog32(), ilog64_nz()
+ * Example:
+ *	// Find Last Set (i.e. index of the highest bit set, 0 to 31).
+ *	static uint32_t fls32(uint32_t i)
+ *	{
+ *		assert(i != 0);
+ *		return ilog32_nz(i) - 1;
+ *	}
+ */
+int ilog32_nz(uint32_t _v) IDEMPOTENT;
+
+/**
+ * ilog64 - Integer binary logarithm of a 64-bit value.
+ * @_v: A 64-bit value.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * Note that many uses will resolve to the fast macro version instead.
+ * See Also:
+ *	ilog64_nz(), ilog32()
+ */
+int ilog64(uint64_t _v) IDEMPOTENT;
+
+/**
+ * ilog64_nz - Integer binary logarithm of a non-zero 64-bit value.
+ * @_v: A 64-bit value.
+ * Returns floor(log2(_v))+1, or undefined if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * Note that many uses will resolve to the fast macro version instead.
+ * See Also:
+ *	ilog64(), ilog32_nz()
+ */
+int ilog64_nz(uint64_t _v) IDEMPOTENT;
+
+/**
+ * STATIC_ILOG_32 - The integer logarithm of an (unsigned, 32-bit) constant.
+ * @_v: A non-negative 32-bit constant.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * This macro should only be used when you need a compile-time constant,
+ * otherwise ilog32 or ilog32_nz are just as fast and more flexible.
+ *
+ * Example:
+ *	#define MY_PAGE_SIZE	4096
+ *	#define MY_PAGE_BITS	(STATIC_ILOG_32(MY_PAGE_SIZE) - 1)
+ */
+#define STATIC_ILOG_32(_v) (STATIC_ILOG5((uint32_t)(_v)))
+
+/**
+ * STATIC_ILOG_64 - The integer logarithm of an (unsigned, 64-bit) constant.
+ * @_v: A non-negative 64-bit constant.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * This macro should only be used when you need a compile-time constant,
+ * otherwise ilog64 or ilog64_nz are just as fast and more flexible.
+ */
+#define STATIC_ILOG_64(_v) (STATIC_ILOG6((uint64_t)(_v)))
+
+/* Private implementation details */
+
+/*Each macro is the bit width of the builtin's operand type minus the builtin's
+   result; the (int) cast stops the expression becoming an unsigned size_t.*/
+#if INT_MAX>=2147483647 && HAVE_BUILTIN_CLZ
+#define builtin_ilog32_nz(v) \
+	(((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v))
+#elif LONG_MAX>=2147483647L && HAVE_BUILTIN_CLZL
+#define builtin_ilog32_nz(v) \
+	(((int)sizeof(unsigned long)*CHAR_BIT) - __builtin_clzl(v))
+#endif
+
+#if INT_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZ
+#define builtin_ilog64_nz(v) \
+	(((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v))
+#elif LONG_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZL
+#define builtin_ilog64_nz(v) \
+	(((int)sizeof(unsigned long)*CHAR_BIT) - __builtin_clzl(v))
+#elif HAVE_BUILTIN_CLZLL
+#define builtin_ilog64_nz(v) \
+	(((int)sizeof(unsigned long long)*CHAR_BIT) - __builtin_clzll(v))
+#endif
+
+#ifdef builtin_ilog32_nz /*-!!(_v) is all ones for non-zero _v and 0 for zero, so it masks the result to 0 when _v==0.*/
+#define ilog32(_v) (builtin_ilog32_nz(_v)&-!!(_v))
+#define ilog32_nz(_v) builtin_ilog32_nz(_v)
+#else /*No builtin: the ilog32 named inside the macro is not re-expanded, so it is the function.*/
+#define ilog32_nz(_v) ilog32(_v)
+#define ilog32(_v) (IS_COMPILE_CONSTANT(_v) ? STATIC_ILOG_32(_v) : ilog32(_v))
+#endif /* builtin_ilog32_nz */
+
+#ifdef builtin_ilog64_nz /*Same zero-masking trick as for ilog32 above.*/
+#define ilog64(_v) (builtin_ilog64_nz(_v)&-!!(_v))
+#define ilog64_nz(_v) builtin_ilog64_nz(_v)
+#else /*No builtin: the ilog64 named inside the macro is not re-expanded, so it is the function.*/
+#define ilog64_nz(_v) ilog64(_v)
+#define ilog64(_v) (IS_COMPILE_CONSTANT(_v) ? STATIC_ILOG_64(_v) : ilog64(_v))
+#endif /* builtin_ilog64_nz */
+
+/* Compile-time ilog: STATIC_ILOGn covers 2**n-bit values by testing the upper half, then deferring to STATIC_ILOG(n-1). */
+# define STATIC_ILOG0(_v) (!!(_v))
+# define STATIC_ILOG1(_v) (((_v)&0x2)?2:STATIC_ILOG0(_v))
+# define STATIC_ILOG2(_v) (((_v)&0xC)?2+STATIC_ILOG1((_v)>>2):STATIC_ILOG1(_v))
+# define STATIC_ILOG3(_v) \
+ (((_v)&0xF0)?4+STATIC_ILOG2((_v)>>4):STATIC_ILOG2(_v))
+# define STATIC_ILOG4(_v) \
+ (((_v)&0xFF00)?8+STATIC_ILOG3((_v)>>8):STATIC_ILOG3(_v))
+# define STATIC_ILOG5(_v) \
+ (((_v)&0xFFFF0000)?16+STATIC_ILOG4((_v)>>16):STATIC_ILOG4(_v))
+# define STATIC_ILOG6(_v) \
+ (((_v)&0xFFFFFFFF00000000ULL)?32+STATIC_ILOG5((_v)>>32):STATIC_ILOG5(_v))
+
+#endif /* _ilog_H */
diff --git a/lib/ccan/ilog/test/run-out-of-line.c b/lib/ccan/ilog/test/run-out-of-line.c
new file mode 100644
index 0000000000..48205d380e
--- /dev/null
+++ b/lib/ccan/ilog/test/run-out-of-line.c
@@ -0,0 +1,65 @@
+#include <ccan/ilog/ilog.h>
+#include <ccan/ilog/ilog.c>
+#include <stdio.h>
+#include <ccan/tap/tap.h>
+
+/*Dead simple (but slow) versions to compare against.*/
+
+static int test_ilog32(uint32_t _v){ /*Reference version: count shifts until _v is zero.*/
+  int nbits=0;
+  while(_v!=0){_v>>=1;nbits++;}
+  return nbits;
+}
+
+static int test_ilog64(uint64_t _v){ /*Reference version: count shifts until _v is zero.*/
+  int nbits=0;
+  while(_v!=0){_v>>=1;nbits++;}
+  return nbits;
+}
+
+#define NTRIALS (64)
+
+int main(int _argc,const char *_argv[]){
+  int i;
+  int j;
+  int (*il32)(uint32_t) = ilog32; /*No "(" follows the name, so any function-like macro is bypassed.*/
+  int (*il64)(uint64_t) = ilog64;
+  int (*il32_nz)(uint32_t) = ilog32_nz;
+  int (*il64_nz)(uint64_t) = ilog64_nz;
+
+  /*33 (resp. 65) starting values, NTRIALS rounds each, 3 checks per round.*/
+  plan_tests(33 * NTRIALS * 3 + 65 * NTRIALS * 3);
+  for(i=0;i<=32;i++){
+    uint32_t v;
+    /*i==0 starts from the value 0; otherwise from the single bit i-1.*/
+    v=i?(uint32_t)1U<<(i-1):0;
+    for(j=0;j<NTRIALS;j++){
+      int l;
+      l=test_ilog32(v);
+      ok1(STATIC_ILOG_32(v)==l);
+      ok1(il32(v)==l);
+      ok1(il32_nz(v) == l || v == 0);
+      /*Step a linear congruential generator and mask the result to at most i
+         bits; the 32-i bit shift is split in two so neither count reaches 32.*/
+      v=(1103515245U*v+12345U)&0xFFFFFFFFU>>((33-i)>>1)>>((32-i)>>1);
+    }
+  }
+
+  for(i=0;i<=64;i++){
+    uint64_t v;
+    /*i==0 starts from the value 0; otherwise from the single bit i-1.*/
+    v=i?(uint64_t)1U<<(i-1):0;
+    for(j=0;j<NTRIALS;j++){
+      int l;
+      l=test_ilog64(v);
+      ok1(STATIC_ILOG_64(v)==l);
+      ok1(il64(v)==l);
+      ok1(il64_nz(v) == l || v == 0);
+      /*Step a linear congruential generator and mask the result to at most i
+         bits; the 64-i bit shift is split in two so neither count reaches 64.*/
+      v=(uint64_t)((2862933555777941757ULL*v+3037000493ULL)
+	&0xFFFFFFFFFFFFFFFFULL>>((65-i)>>1)>>((64-i)>>1));
+    }
+  }
+  return exit_status();
+}
diff --git a/lib/ccan/ilog/test/run.c b/lib/ccan/ilog/test/run.c
new file mode 100644
index 0000000000..bda59f920a
--- /dev/null
+++ b/lib/ccan/ilog/test/run.c
@@ -0,0 +1,60 @@
+#include <ccan/ilog/ilog.h>
+#include <ccan/ilog/ilog.c>
+#include <stdio.h>
+#include <ccan/tap/tap.h>
+
+/*Dead simple (but slow) versions to compare against.*/
+
+static int test_ilog32(uint32_t _v){ /*Reference version: count shifts until _v is zero.*/
+  int nbits=0;
+  while(_v!=0){_v>>=1;nbits++;}
+  return nbits;
+}
+
+static int test_ilog64(uint64_t _v){ /*Reference version: count shifts until _v is zero.*/
+  int nbits=0;
+  while(_v!=0){_v>>=1;nbits++;}
+  return nbits;
+}
+
+#define NTRIALS (64)
+
+int main(int _argc,const char *_argv[]){
+  int i;
+  int j;
+  /*33 (resp. 65) starting values, NTRIALS rounds each, 3 checks per round.*/
+  plan_tests(33 * NTRIALS * 3 + 65 * NTRIALS * 3);
+  for(i=0;i<=32;i++){
+    uint32_t v;
+    /*i==0 starts from the value 0; otherwise from the single bit i-1.*/
+    v=i?(uint32_t)1U<<(i-1):0;
+    for(j=0;j<NTRIALS;j++){
+      int l;
+      l=test_ilog32(v);
+      ok1(STATIC_ILOG_32(v)==l);
+      ok1(ilog32(v)==l);
+      ok1(ilog32_nz(v) == l || v == 0);
+      /*Step a linear congruential generator and mask the result to at most i
+         bits; the 32-i bit shift is split in two so neither count reaches 32.*/
+      v=(1103515245U*v+12345U)&0xFFFFFFFFU>>((33-i)>>1)>>((32-i)>>1);
+    }
+  }
+
+  for(i=0;i<=64;i++){
+    uint64_t v;
+    /*i==0 starts from the value 0; otherwise from the single bit i-1.*/
+    v=i?(uint64_t)1U<<(i-1):0;
+    for(j=0;j<NTRIALS;j++){
+      int l;
+      l=test_ilog64(v);
+      ok1(STATIC_ILOG_64(v)==l);
+      ok1(ilog64(v)==l);
+      ok1(ilog64_nz(v) == l || v == 0);
+      /*Step a linear congruential generator and mask the result to at most i
+         bits; the 64-i bit shift is split in two so neither count reaches 64.*/
+      v=(uint64_t)((2862933555777941757ULL*v+3037000493ULL)
+	&0xFFFFFFFFFFFFFFFFULL>>((65-i)>>1)>>((64-i)>>1));
+    }
+  }
+  return exit_status();
+}
diff --git a/lib/ccan/libccan.m4 b/lib/ccan/libccan.m4
new file mode 100644
index 0000000000..92676c3184
--- /dev/null
+++ b/lib/ccan/libccan.m4
@@ -0,0 +1,315 @@
+dnl Find the ccan sources: the first $ccanpaths entry holding str/str.c replaces the default below.
+ccandir="../lib/ccan"
+for d in $ccanpaths; do
+	if test -f "$srcdir/$d/str/str.c"; then
+		ccandir="$d"
+		AC_SUBST(ccandir)
+		break
+	fi
+done
+if test -f "$ccandir/str/str.c"; then :; else # NOTE(review): the loop looks under $srcdir but this test does not -- confirm intended
+   AC_MSG_ERROR([cannot find ccan source in $ccandir])
+fi
+CCAN_OBJ="$ccandir/hash/hash.o $ccandir/htable/htable.o $ccandir/ilog/ilog.o $ccandir/likely/likely.o $ccandir/str/debug.o $ccandir/str/str.o $ccandir/tally/tally.o"
+
+AC_SUBST(CCAN_OBJ)
+
+# Preferred method for including ccan modules is #include <ccan/module/...>, hence the parent directory.
+CCAN_CFLAGS="-I$ccandir/.."
+AC_SUBST(CCAN_CFLAGS)
+
+# All the configuration checks.  These probes only compile, and an old GCC
+# merely warns about an __attribute__ it does not know, so they can report
+# false positives there.  But that's fairly harmless.
+AC_CACHE_CHECK([whether we can compile with __attribute__((cold))],
+	       samba_cv_attribute_cold,
+	       [
+	         AC_COMPILE_IFELSE(
+			[
+				static void __attribute__((cold))
+				cleanup(void) { }
+			],
+			samba_cv_attribute_cold=yes)
+		])
+
+if test x"$samba_cv_attribute_cold" = xyes ; then
+   AC_DEFINE(HAVE_ATTRIBUTE_COLD, 1,
+	     [whether we can compile with __attribute__((cold))])
+fi
+
+AC_CACHE_CHECK([whether we can compile with __attribute__((const))],
+	       samba_cv_attribute_const,
+	       [
+	         AC_COMPILE_IFELSE(
+			[
+				static void __attribute__((const))
+				cleanup(void) { }
+			],
+			samba_cv_attribute_const=yes)
+		])
+
+if test x"$samba_cv_attribute_const" = xyes ; then
+   AC_DEFINE(HAVE_ATTRIBUTE_CONST, 1,
+	     [whether we can compile with __attribute__((const))])
+fi
+
+AC_CACHE_CHECK([whether we can compile with __attribute__((noreturn))],
+	       samba_cv_attribute_noreturn,
+	       [
+	         AC_COMPILE_IFELSE(
+			[
+				static void __attribute__((noreturn))
+				cleanup(void) { for (;;) { } } /* loops forever; calling exit here would need a declaration */
+			],
+			samba_cv_attribute_noreturn=yes)
+		])
+
+if test x"$samba_cv_attribute_noreturn" = xyes ; then
+   AC_DEFINE(HAVE_ATTRIBUTE_NORETURN, 1,
+	     [whether we can compile with __attribute__((noreturn))])
+fi
+
+AC_CACHE_CHECK([whether we can compile with __attribute__((printf))],
+	[samba_cv_attribute_printf],
+	[
+		AC_COMPILE_IFELSE(
+			[
+				static void __attribute__((format(__printf__, 1, 2)))
+				cleanup(const char *fmt, ...) { }
+			],
+			[samba_cv_attribute_printf=yes])
+	])
+
+if test "x$samba_cv_attribute_printf" = "xyes"; then
+	AC_DEFINE([HAVE_ATTRIBUTE_PRINTF], [1],
+		[whether we can compile with __attribute__((format(printf)))])
+fi # defined only when the cached probe result is yes
+
+AC_CACHE_CHECK([whether we can compile with __attribute__((unused))],
+	[samba_cv_attribute_unused],
+	[
+		AC_COMPILE_IFELSE(
+			[
+				static void __attribute__((unused))
+				cleanup(void) { }
+			],
+			[samba_cv_attribute_unused=yes])
+	])
+
+if test "x$samba_cv_attribute_unused" = "xyes"; then
+	AC_DEFINE([HAVE_ATTRIBUTE_UNUSED], [1],
+		[whether we can compile with __attribute__((unused))])
+fi # defined only when the cached probe result is yes
+
+AC_CACHE_CHECK([whether we can compile with __attribute__((used))],
+	[samba_cv_attribute_used],
+	[
+		AC_COMPILE_IFELSE(
+			[
+				static void __attribute__((used))
+				cleanup(void) { }
+			],
+			[samba_cv_attribute_used=yes])
+	])
+
+if test "x$samba_cv_attribute_used" = "xyes"; then
+	AC_DEFINE([HAVE_ATTRIBUTE_USED], [1],
+		[whether we can compile with __attribute__((used))])
+fi # defined only when the cached probe result is yes
+
+# FIXME: the run-test below has to execute the probe, which rules out cross-compiling;
+# endian.h or sys/endian.h and __BYTE_ORDER could be used there instead.
+AC_CACHE_CHECK([whether we are big endian],samba_cv_big_endian,[
+AC_TRY_RUN([int main(void) {
+union { int i; char c[sizeof(int)]; } u;
+	  u.i = 0x01020304;
+	  return u.c[0] == 0x01 && u.c[1] == 0x02 && u.c[2] == 0x03 && u.c[3] == 0x04 ? 0 : 1;
+}],
+samba_cv_big_endian=yes,
+samba_cv_big_endian=no)])
+if test x"$samba_cv_big_endian" = xyes ; then
+   AC_DEFINE(HAVE_BIG_ENDIAN, 1,
+	     [whether we are big endian])
+fi
+
+AC_CACHE_CHECK([whether we have __builtin_clz],
+	       samba_cv_builtin_clz,
+	       [
+	         AC_COMPILE_IFELSE(
+			[int main(void) {
+				return __builtin_clz(1) == (sizeof(int)*8 - 1) ? 0 : 1;
+			}],
+			samba_cv_builtin_clz=yes)
+		])
+
+if test x"$samba_cv_builtin_clz" = xyes ; then
+   AC_DEFINE(HAVE_BUILTIN_CLZ, 1,
+	     [whether we have __builtin_clz])
+fi # the probes in this group are only compiled, never run
+
+AC_CACHE_CHECK([whether we have __builtin_clzl],
+	       samba_cv_builtin_clzl,
+	       [
+	         AC_COMPILE_IFELSE(
+			[int main(void) {
+				return __builtin_clzl(1) == (sizeof(long)*8 - 1) ? 0 : 1;
+			}],
+			samba_cv_builtin_clzl=yes)
+		])
+
+if test x"$samba_cv_builtin_clzl" = xyes ; then
+   AC_DEFINE(HAVE_BUILTIN_CLZL, 1,
+	     [whether we have __builtin_clzl])
+fi
+AC_CACHE_CHECK([whether we have __builtin_clzll],
+	       samba_cv_builtin_clzll,
+	       [
+	         AC_COMPILE_IFELSE(
+			[int main(void) {
+				return __builtin_clzll(1) == (sizeof(long long)*8 - 1) ? 0 : 1;
+			}],
+			samba_cv_builtin_clzll=yes)
+		])
+
+if test x"$samba_cv_builtin_clzll" = xyes ; then
+   AC_DEFINE(HAVE_BUILTIN_CLZLL, 1,
+	     [whether we have __builtin_clzll])
+fi
+
+AC_CACHE_CHECK([whether we have __builtin_constant_p],
+	[samba_cv_builtin_constant_p],
+	[
+		AC_COMPILE_IFELSE(
+			[int main(void) {
+				return __builtin_constant_p(1) ? 0 : 1;
+			}],
+			[samba_cv_builtin_constant_p=yes])
+	])
+
+if test "x$samba_cv_builtin_constant_p" = "xyes"; then
+	AC_DEFINE([HAVE_BUILTIN_CONSTANT_P], [1],
+		[whether we have __builtin_constant_p])
+fi # defined only when the cached probe result is yes
+
+AC_CACHE_CHECK([whether we have __builtin_expect],
+	       samba_cv_builtin_expect,
+	       [
+	         AC_COMPILE_IFELSE(
+			[int main(void) {
+				return __builtin_expect(main != 0, 1) ? 0 : 1;
+			}],
+			samba_cv_builtin_expect=yes)
+		])
+
+if test x"$samba_cv_builtin_expect" = xyes ; then
+   AC_DEFINE(HAVE_BUILTIN_EXPECT, 1,
+	     [whether we have __builtin_expect])
+fi # the builtin takes two arguments: the expression and its expected value
+
+AC_CACHE_CHECK([whether we have __builtin_popcountl],
+	[samba_cv_builtin_popcountl],
+	[
+		AC_COMPILE_IFELSE(
+			[int main(void) {
+				return __builtin_popcountl(255L) == 8 ? 0 : 1;
+			}],
+			[samba_cv_builtin_popcountl=yes])
+	])
+
+if test "x$samba_cv_builtin_popcountl" = "xyes"; then
+	AC_DEFINE([HAVE_BUILTIN_POPCOUNTL], [1],
+		[whether we have __builtin_popcountl])
+fi # defined only when the cached probe result is yes
+
+AC_CACHE_CHECK([whether we have __builtin_types_compatible_p],
+	[samba_cv_builtin_types_compatible_p],
+	[
+		AC_COMPILE_IFELSE(
+			[int main(void) {
+				return __builtin_types_compatible_p(char *, int) ? 1 : 0;
+			}],
+			[samba_cv_builtin_types_compatible_p=yes])
+	])
+
+if test "x$samba_cv_builtin_types_compatible_p" = "xyes"; then
+	AC_DEFINE([HAVE_BUILTIN_TYPES_COMPATIBLE_P], [1],
+		[whether we have __builtin_types_compatible_p])
+fi # defined only when the cached probe result is yes
+
+AC_CACHE_CHECK([whether we have compound literals],
+	       samba_cv_builtin_compound_literals,
+	       [
+	         AC_COMPILE_IFELSE(
+			[int main(void) {
+				int *foo = (int[]) { 1, 2, 3, 4 };
+				return foo[0] == 1 ? 0 : 1;
+			}],
+			samba_cv_builtin_compound_literals=yes)
+		])
+
+if test x"$samba_cv_builtin_compound_literals" = xyes ; then
+   AC_DEFINE(HAVE_BUILTIN_COMPOUND_LITERALS, 1,
+	     [whether we have compound literals])
+fi # NOTE(review): ccan modules presumably test HAVE_COMPOUND_LITERALS -- confirm the macro name
+
+AC_CACHE_CHECK([whether we have isblank],
+	       samba_cv_builtin_have_isblank,
+	       [
+	         AC_COMPILE_IFELSE(
+			[#include <ctype.h>
+			 int main(void) { return isblank(' ') ? 0 : 1; }
+			],
+			samba_cv_builtin_have_isblank=yes)
+		])
+
+if test x"$samba_cv_builtin_have_isblank" = xyes ; then
+   AC_DEFINE(HAVE_BUILTIN_HAVE_ISBLANK, 1,
+	     [whether we have isblank])
+fi # NOTE(review): ccan modules presumably test HAVE_ISBLANK -- confirm the macro name
+
+# FIXME: the run-test below has to execute the probe, which rules out cross-compiling;
+# endian.h or sys/endian.h and __BYTE_ORDER could be used there instead.
+AC_CACHE_CHECK([whether we are little endian],samba_cv_little_endian,[
+AC_TRY_RUN([int main(void) {
+union { int i; char c[sizeof(int)]; } u;
+	  u.i = 0x01020304;
+	  return u.c[0] == 0x04 && u.c[1] == 0x03 && u.c[2] == 0x02 && u.c[3] == 0x01 ? 0 : 1;
+}],
+samba_cv_little_endian=yes,
+samba_cv_little_endian=no)])
+if test x"$samba_cv_little_endian" = xyes ; then
+   AC_DEFINE(HAVE_LITTLE_ENDIAN, 1,
+	     [whether we are little endian])
+fi
+
+AC_CACHE_CHECK([whether we have __typeof__],
+	[samba_cv_typeof],
+	[
+		AC_COMPILE_IFELSE(
+			[int main(void) {
+				int x = 1;
+				__typeof__(x) i;
+				i = x;
+				return i == x ? 0 : 1;
+			}],
+			[samba_cv_typeof=yes])
+	])
+
+if test "x$samba_cv_typeof" = "xyes"; then
+	AC_DEFINE([HAVE_TYPEOF], [1],
+		[whether we have __typeof__])
+fi # defined only when the cached probe result is yes
+
+AC_CACHE_CHECK([whether we have __attribute__((warn_unused_result))],
+	[samba_cv_warn_unused_result],
+	[
+		AC_COMPILE_IFELSE(
+			[int __attribute__((warn_unused_result)) func(int x)
+			    { return x; }],
+			[samba_cv_warn_unused_result=yes])
+	])
+
+if test "x$samba_cv_warn_unused_result" = "xyes"; then
+	AC_DEFINE([HAVE_WARN_UNUSED_RESULT], [1],
+		[whether we have __attribute__((warn_unused_result))])
+fi # defined only when the cached probe result is yes
diff --git a/lib/ccan/likely/LICENSE b/lib/ccan/likely/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/likely/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/likely/_info b/lib/ccan/likely/_info
new file mode 100644
index 0000000000..471c1ffc78
--- /dev/null
+++ b/lib/ccan/likely/_info
@@ -0,0 +1,45 @@
+#include <string.h>
+#include <stdio.h>
+#include "config.h"
+
+/**
+ * likely - macros for annotating likely/unlikely branches in the code
+ *
+ * Inspired by Andi Kleen's macros for the Linux Kernel, these macros
+ * help you annotate rare paths in your code for the convenience of the
+ * compiler and the reader.
+ *
+ * License: LGPL (2 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Example:
+ *	#include <ccan/likely/likely.h>
+ *	#include <stdio.h>
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		// This example is silly: the compiler knows exit() is unlikely.
+ *		if (unlikely(argc == 1)) {
+ *			fprintf(stderr, "Usage: %s <args>...\n", argv[0]);
+ *			return 1;
+ *		}
+ *		for (argc = 1; argv[argc]; argc++)
+ *			printf("%s\n", argv[argc]);
+ *		return 0;
+ *	}
+ */
+/* ccan module metadata: "depends" lists the modules that likely requires. */
+int main(int argc, char *argv[])
+{
+	const char *query = (argc == 2) ? argv[1] : NULL;
+
+	/* Anything other than exactly one "depends" argument is an error. */
+	if (!query || strcmp(query, "depends") != 0)
+		return 1;
+
+	/* One dependency per line. */
+	printf("ccan/str\n");
+	printf("ccan/htable\n");
+	printf("ccan/hash\n");
+	return 0;
+}
diff --git a/lib/ccan/likely/likely.c b/lib/ccan/likely/likely.c
new file mode 100644
index 0000000000..8893d0b6d2
--- /dev/null
+++ b/lib/ccan/likely/likely.c
@@ -0,0 +1,141 @@
+#ifdef CCAN_LIKELY_DEBUG
+#include <ccan/likely/likely.h>
+#include <ccan/hash/hash.h>
+#include <ccan/htable/htable.h>
+#include <stdlib.h>
+#include <stdio.h>
+static struct htable *htable;
+
+struct trace {	/* one record per likely()/unlikely() call site */
+	const char *condstr;	/* stringified condition text */
+	const char *file;	/* file name of the call site */
+	unsigned int line;	/* line number of the call site */
+	bool expect;	/* outcome the annotation predicts */
+	unsigned long count, right;	/* evaluations seen / those where cond == expect */
+};
+
+/* Hash a call site by identity: the two string pointers, line and hint. */
+static unsigned long hash_trace(const struct trace *trace)
+{
+	unsigned int seed = trace->line + trace->expect;
+
+	return hash_pointer(trace->condstr, hash_pointer(trace->file, seed));
+}
+
+/* htable_get() comparator: do both traces describe the same call site? */
+static bool hash_cmp(const void *htelem, void *cmpdata)
+{
+	const struct trace *stored = htelem, *wanted = cmpdata;
+	if (stored->condstr != wanted->condstr || stored->file != wanted->file)
+		return false;
+	return stored->line == wanted->line && stored->expect == wanted->expect;
+}
+/* Callback given to htable_new(): hashes a stored trace; @priv is unused. */
+static size_t rehash(const void *elem, void *priv)
+{
+	return hash_trace(elem);
+}
+
+/* Fill in @trace for the given call site, with both counters zeroed. */
+static void init_trace(struct trace *trace, const char *condstr,
+		       const char *file, unsigned int line, bool expect)
+{
+	trace->right = trace->count = 0;
+	trace->expect = expect;
+	trace->line = line;
+	trace->file = file;
+	trace->condstr = condstr;
+}
+
+/* Allocate and register a record for a new call site; NULL if malloc fails. */
+static struct trace *add_trace(const char *condstr,
+			       const char *file, unsigned int line, bool expect)
+{
+	struct trace *trace = malloc(sizeof(*trace));
+
+	if (!trace)
+		return NULL;
+	init_trace(trace, condstr, file, line, expect);
+	htable_add(htable, hash_trace(trace), trace);
+	return trace;
+}
+
+/* Count one likely()/unlikely() evaluation; skipped if allocation fails. */
+long _likely_trace(bool cond, bool expect,
+		   const char *condstr, const char *file, unsigned int line)
+{
+	struct trace *p, trace;
+
+	if (!htable && !(htable = htable_new(rehash, NULL)))
+		return cond;	/* no table: behave like the untraced macro */
+	init_trace(&trace, condstr, file, line, expect);
+	p = htable_get(htable, hash_trace(&trace), hash_cmp, &trace);
+	if (!p && !(p = add_trace(condstr, file, line, expect)))
+		return cond;
+	p->count++;
+	p->right += (cond == expect);
+	return cond;
+}
+
+struct get_stats_info {	/* accumulator for the scan in likely_stats() */
+	struct trace *worst;	/* lowest-ratio trace found so far, or NULL */
+	unsigned int min_hits;	/* traces with a smaller count are skipped */
+	double worst_ratio;	/* right_ratio() of @worst; likely_stats() starts it at 2 */
+};
+/* Fraction of the evaluations at @t whose outcome matched the hint. */
+static double right_ratio(const struct trace *t)
+{
+	return (double)t->right / (double)t->count;
+}
+
+/* Remember @trace in @info if it has enough hits and the lowest ratio yet. */
+static void get_stats(struct trace *trace, struct get_stats_info *info)
+{
+	bool enough_hits = trace->count >= info->min_hits;
+
+	if (enough_hits && right_ratio(trace) < info->worst_ratio) {
+		info->worst_ratio = right_ratio(trace);
+		info->worst = trace;
+	}
+}
+
+/* Return a malloc'ed report on the worst call site with >= @min_hits hits and
+ * <= @percent% correct, and forget that site; NULL if none or out of memory. */
+const char *likely_stats(unsigned int min_hits, unsigned int percent)
+{
+	struct get_stats_info info;
+	struct htable_iter i;
+	struct trace *trace;
+	char *ret;
+
+	if (!htable)
+		return NULL;
+	info.min_hits = min_hits;
+	info.worst = NULL;
+	info.worst_ratio = 2;	/* above any real ratio */
+
+	/* This is O(n), but it's not likely called that often. */
+	for (trace = htable_first(htable, &i); trace;
+	     trace = htable_next(htable, &i))
+		get_stats(trace, &info);
+
+	/* No trace reached min_hits: worst is NULL whatever @percent says. */
+	if (!info.worst || info.worst_ratio * 100 > percent)
+		return NULL;
+
+	ret = malloc(strlen(info.worst->condstr) +
+		     strlen(info.worst->file) +
+		     sizeof(long int) * 8 +
+		     sizeof("%s:%u:%slikely(%s) correct %u%% (%lu/%lu)"));
+	if (!ret)
+		return NULL;	/* trace stays in the table for a later call */
+	sprintf(ret, "%s:%u:%slikely(%s) correct %u%% (%lu/%lu)",
+		info.worst->file, info.worst->line,
+		info.worst->expect ? "" : "un", info.worst->condstr,
+		(unsigned)(info.worst_ratio * 100),
+		info.worst->right, info.worst->count);
+	htable_del(htable, hash_trace(info.worst), info.worst);
+	free(info.worst);
+	return ret;
+}
+#endif /*CCAN_LIKELY_DEBUG*/
diff --git a/lib/ccan/likely/likely.h b/lib/ccan/likely/likely.h
new file mode 100644
index 0000000000..80d695c842
--- /dev/null
+++ b/lib/ccan/likely/likely.h
@@ -0,0 +1,105 @@
+#ifndef CCAN_LIKELY_H
+#define CCAN_LIKELY_H
+#include "config.h"
+#include <ccan/str/str.h>
+#include <stdbool.h>
+
+#ifndef CCAN_LIKELY_DEBUG
+#if HAVE_BUILTIN_EXPECT
+/**
+ * likely - indicate that a condition is likely to be true.
+ * @cond: the condition
+ *
+ * This uses a compiler extension where available to indicate a likely
+ * code path and optimize appropriately; it's also useful for readers
+ * to quickly identify exceptional paths through functions.  The
+ * threshold for "likely" is usually considered to be between 90 and
+ * 99%; marginal cases should not be marked either way.
+ *
+ * See Also:
+ *	unlikely(), likely_stats()
+ *
+ * Example:
+ *	// Returns false if we overflow.
+ *	static inline bool inc_int(unsigned int *val)
+ *	{
+ *		(*val)++;
+ *		if (likely(*val))
+ *			return true;
+ *		return false;
+ *	}
+ */
+#define likely(cond) __builtin_expect(!!(cond), 1)
+
+/**
+ * unlikely - indicate that a condition is unlikely to be true.
+ * @cond: the condition
+ *
+ * This uses a compiler extension where available to indicate an unlikely
+ * code path and optimize appropriately; see likely() above.
+ *
+ * See Also:
+ *	likely(), likely_stats(), COLD (compiler.h)
+ *
+ * Example:
+ *	// Prints a warning if we overflow.
+ *	static inline void inc_int(unsigned int *val)
+ *	{
+ *		(*val)++;
+ *		if (unlikely(*val == 0))
+ *			fprintf(stderr, "Overflow!");
+ *	}
+ */
+#define unlikely(cond) __builtin_expect(!!(cond), 0)
+#else /* no __builtin_expect(): both macros just normalize @cond to 0 or 1 */
+#define likely(cond) (!!(cond))
+#define unlikely(cond) (!!(cond))
+#endif /* HAVE_BUILTIN_EXPECT */
+#else /* CCAN_LIKELY_DEBUG versions */
+#define likely(cond) \
+	(_likely_trace(!!(cond), 1, stringify(cond), __FILE__, __LINE__))
+#define unlikely(cond) \
+	(_likely_trace(!!(cond), 0, stringify(cond), __FILE__, __LINE__))
+/* Tracer behind the debug macros above; not meant to be called directly. */
+long _likely_trace(bool cond, bool expect,
+		   const char *condstr,
+		   const char *file, unsigned int line);
+#endif /* !CCAN_LIKELY_DEBUG */
+
+#ifdef CCAN_LIKELY_DEBUG
+/**
+ * likely_stats - return description of abused likely()/unlikely()
+ * @min_hits: minimum number of hits
+ * @percent: maximum percentage correct
+ *
+ * When CCAN_LIKELY_DEBUG is defined, likely() and unlikely() trace their
+ * results: this causes a significant slowdown, but allows analysis of
+ * whether the branches are labelled correctly.
+ *
+ * This function returns a malloc'ed description of the least-correct
+ * usage of likely() or unlikely().  It ignores places which have been
+ * called fewer than @min_hits times, and those which were predicted
+ * correctly more than @percent percent of the time.  It returns NULL
+ * when nothing meets those criteria.
+ *
+ * Note that this call is destructive; the returned offender is
+ * removed from the trace so that the next call to likely_stats() will
+ * return the next-worst likely()/unlikely() usage.
+ *
+ * Example:
+ *	// Print every place hit at least twice which was wrong >= 5%.
+ *	static void report_stats(void)
+ *	{
+ *	#ifdef CCAN_LIKELY_DEBUG
+ *		const char *bad;
+ *
+ *		while ((bad = likely_stats(2, 95)) != NULL) {
+ *			printf("Suspicious likely: %s", bad);
+ *			free((char *)bad);
+ *		}
+ *	#endif
+ *	}
+ */
+const char *likely_stats(unsigned int min_hits, unsigned int percent);
+#endif /* CCAN_LIKELY_DEBUG */
+#endif /* CCAN_LIKELY_H */
diff --git a/lib/ccan/likely/test/run-debug.c b/lib/ccan/likely/test/run-debug.c
new file mode 100644
index 0000000000..df78619271
--- /dev/null
+++ b/lib/ccan/likely/test/run-debug.c
@@ -0,0 +1,87 @@
+#define CCAN_LIKELY_DEBUG 1
+#include <ccan/likely/likely.c>
+#include <ccan/likely/likely.h>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+
+static bool one_seems_likely(unsigned int val)
+{
+	if (!likely(val == 1))
+		return false;	/* the condition hinted as likely did not hold */
+	return true;
+}
+
+static bool one_seems_unlikely(unsigned int val)
+{
+	if (!unlikely(val == 1))
+		return false;	/* the common case: the unlikely hint was right */
+	return true;
+}
+
+static bool likely_one_unlikely_two(unsigned int val1, unsigned int val2)
+{
+	/* Both annotations share a line on purpose: they must stay distinct. */
+	if (!likely(val1 == 1) || !unlikely(val2 == 2))
+		return false;
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	const char *bad;
+	/* NOTE: the expected strings embed this file's line numbers (9, 16, 24). */
+	plan_tests(13);
+
+	/* Correct guesses. */
+	one_seems_likely(1);
+	ok1(likely_stats(0, 90) == NULL);
+	one_seems_unlikely(2);
+	ok1(likely_stats(0, 90) == NULL);
+
+	/* Incorrect guesses. */
+	one_seems_likely(0);
+	one_seems_likely(2);
+	/* Hit only 3 times so far, so asking for 4 hits returns nothing */
+	ok1(!likely_stats(4, 90));
+	bad = likely_stats(3, 90);
+	ok(strends(bad, "run-debug.c:9:likely(val == 1) correct 33% (1/3)"),
+	   "likely_stats returned %s", bad);
+
+	/* Nothing else is at or below 90% correct */
+	ok1(!likely_stats(0, 90));
+
+	/* This should get everything. */
+	bad = likely_stats(0, 100);
+	ok(strends(bad, "run-debug.c:16:unlikely(val == 1) correct 100% (1/1)"),
+	   "likely_stats returned %s", bad);
+
+	/* Nothing left (table is actually cleared) */
+	ok1(!likely_stats(0, 100));
+
+	/* Make sure unlikely works */
+	one_seems_unlikely(0);
+	one_seems_unlikely(2);
+	one_seems_unlikely(1);
+
+	bad = likely_stats(0, 90);
+	ok(strends(bad, "run-debug.c:16:unlikely(val == 1) correct 66% (2/3)"),
+	   "likely_stats returned %s", bad);
+	ok1(!likely_stats(0, 100));
+	/* Two annotations on one source line must be tracked separately. */
+	likely_one_unlikely_two(1, 1);
+	likely_one_unlikely_two(1, 1);
+	likely_one_unlikely_two(1, 1);
+	ok1(!likely_stats(0, 90));
+	likely_one_unlikely_two(1, 2);
+
+	bad = likely_stats(0, 90);
+	ok(strends(bad, "run-debug.c:24:unlikely(val2 == 2) correct 75% (3/4)"),
+	   "likely_stats returned %s", bad);
+	bad = likely_stats(0, 100);
+	ok(strends(bad, "run-debug.c:24:likely(val1 == 1) correct 100% (4/4)"),
+	   "likely_stats returned %s", bad);
+
+	ok1(!likely_stats(0, 100));
+
+	exit(exit_status());
+}
diff --git a/lib/ccan/likely/test/run.c b/lib/ccan/likely/test/run.c
new file mode 100644
index 0000000000..fa1dc9f6ea
--- /dev/null
+++ b/lib/ccan/likely/test/run.c
@@ -0,0 +1,30 @@
+#include <ccan/likely/likely.c>
+#include <ccan/likely/likely.h>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+
+static bool one_seems_likely(unsigned int val)
+{
+	if (!likely(val == 1))
+		return false;	/* the condition hinted as likely did not hold */
+	return true;
+}
+
+static bool one_seems_unlikely(unsigned int val)
+{
+	if (!unlikely(val == 1))
+		return false;	/* the common case: the unlikely hint was right */
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	plan_tests(4);
+
+	/* Without debug, we can only check that it doesn't affect results. */
+	ok1(one_seems_likely(1));
+	ok1(!one_seems_likely(2));
+	ok1(one_seems_unlikely(1));
+	ok1(!one_seems_unlikely(2));
+	exit(exit_status());
+}
diff --git a/lib/ccan/str/LICENSE b/lib/ccan/str/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/str/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/str/_info b/lib/ccan/str/_info
new file mode 100644
index 0000000000..ea314dbf79
--- /dev/null
+++ b/lib/ccan/str/_info
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * str - string helper routines
+ *
+ * This is a grab bag of functions for string operations, designed to enhance
+ * the standard string.h.
+ *
+ * Note that if you define CCAN_STR_DEBUG, you will get extra compile
+ * checks on common misuses of the following functions (they will now
+ * be out-of-line, so there is a runtime penalty!).
+ *
+ *	strstr, strchr, strrchr:
+ *		Return const char * if first argument is const (gcc only).
+ *
+ *	isalnum, isalpha, isascii, isblank, iscntrl, isdigit, isgraph,
+ *	    islower, isprint, ispunct, isspace, isupper, isxdigit:
+ *		Static and runtime check that input is EOF or an *unsigned*
+ *		char, as per C standard (really!).
+ *
+ * Example:
+ *	#include <stdio.h>
+ *	#include <ccan/str/str.h>
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		if (argv[1] && streq(argv[1], "--verbose"))
+ *			printf("verbose set\n");
+ *		if (argv[1] && strstarts(argv[1], "--"))
+ *			printf("Some option set\n");
+ *		if (argv[1] && strends(argv[1], "cow-powers"))
+ *			printf("Magic option set\n");
+ *		return 0;
+ *	}
+ *
+ * License: LGPL (2 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+	/* Exactly one argument is expected: the name of the query. */
+	if (argc == 2 && strcmp(argv[1], "depends") == 0) {
+		/* The only dependency reported for this module. */
+		printf("ccan/build_assert\n");
+		return 0;
+	}
+
+	/* Wrong usage or an unsupported query. */
+	return 1;
+}
diff --git a/lib/ccan/str/debug.c b/lib/ccan/str/debug.c
new file mode 100644
index 0000000000..9ef756766a
--- /dev/null
+++ b/lib/ccan/str/debug.c
@@ -0,0 +1,104 @@
+#include "config.h"
+#include <ccan/str/str_debug.h>
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+
+#ifdef CCAN_STR_DEBUG
+/* Real ones are mugged by macros; wrappers assert ctype input is -1..255. */
+int str_isalnum(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isalnum(i);
+}
+
+int str_isalpha(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isalpha(i);
+}
+
+int str_isascii(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isascii(i);
+}
+
+#if HAVE_ISBLANK
+int str_isblank(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isblank(i);
+}
+#endif
+
+int str_iscntrl(int i)
+{
+	assert(i >= -1 && i < 256);
+	return iscntrl(i);
+}
+
+int str_isdigit(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isdigit(i);
+}
+
+int str_isgraph(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isgraph(i);
+}
+
+int str_islower(int i)
+{
+	assert(i >= -1 && i < 256);
+	return islower(i);
+}
+
+int str_isprint(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isprint(i);
+}
+
+int str_ispunct(int i)
+{
+	assert(i >= -1 && i < 256);
+	return ispunct(i);
+}
+
+int str_isspace(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isspace(i);
+}
+
+int str_isupper(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isupper(i);
+}
+
+int str_isxdigit(int i)
+{
+	assert(i >= -1 && i < 256);
+	return isxdigit(i);
+}
+
+/* The string search wrappers simply forward to the <string.h> functions. */
+char *str_strstr(const char *haystack, const char *needle)
+{
+	return strstr(haystack, needle);
+}
+
+char *str_strchr(const char *haystack, int c)
+{
+	return strchr(haystack, c);
+}
+
+char *str_strrchr(const char *haystack, int c)
+{
+	return strrchr(haystack, c);
+}
+#endif
diff --git a/lib/ccan/str/str.c b/lib/ccan/str/str.c
new file mode 100644
index 0000000000..fa9809fbd9
--- /dev/null
+++ b/lib/ccan/str/str.c
@@ -0,0 +1,12 @@
+#include <ccan/str/str.h>
+
+/* Count non-overlapping occurrences of needle; an empty needle counts as 0. */
+size_t strcount(const char *haystack, const char *needle)
+{
+	size_t count = 0, nlen = strlen(needle);
+
+	/* Without the nlen check an empty needle would match forever. */
+	for (; nlen && (haystack = strstr(haystack, needle)); haystack += nlen)
+		count++;
+	return count;
+}
diff --git a/lib/ccan/str/str.h b/lib/ccan/str/str.h
new file mode 100644
index 0000000000..ae51cdcc99
--- /dev/null
+++ b/lib/ccan/str/str.h
@@ -0,0 +1,200 @@
+#ifndef CCAN_STR_H
+#define CCAN_STR_H
+#include "config.h"
+#include <string.h>
+#include <stdbool.h>
+#include <ctype.h>
+
+/**
+ * streq - Are two strings equal?
+ * @a: first string
+ * @b: second string
+ *
+ * This macro is arguably more readable than "!strcmp(a, b)".
+ *
+ * Example:
+ *	if (streq(somestring, ""))
+ *		printf("String is empty!\n");
+ */
+#define streq(a,b) (strcmp((a),(b)) == 0)
+
+/**
+ * strstarts - Does this string start with this prefix?
+ * @str: string to test
+ * @prefix: prefix to look for at start of str
+ *
+ * Example:
+ *	if (strstarts(somestring, "foo"))
+ *		printf("String %s begins with 'foo'!\n", somestring);
+ */
+#define strstarts(str,prefix) (strncmp((str),(prefix),strlen(prefix)) == 0)
+
+/**
+ * strends - Does this string end with this postfix?
+ * @str: string to test
+ * @postfix: postfix to look for at end of str
+ *
+ * Example:
+ *	if (strends(somestring, "foo"))
+ *		printf("String %s ends with 'foo'!\n", somestring);
+ */
+static inline bool strends(const char *str, const char *postfix)
+{
+	size_t len = strlen(str), plen = strlen(postfix);
+
+	/* A postfix longer than str cannot match; else compare the tails. */
+	return len >= plen && strcmp(str + (len - plen), postfix) == 0;
+}
+
+/**
+ * stringify - Turn expression into a string literal
+ * @expr: any C expression
+ *
+ * Example:
+ *	#define PRINT_COND_IF_FALSE(cond) \
+ *		((cond) || printf("%s is false!", stringify(cond)))
+ */
+#define stringify(expr)		stringify_1(expr)
+/* Double-indirection required to stringify expansions */
+#define stringify_1(expr)	#expr
+
+/**
+ * strcount - Count number of (non-overlapping) occurrences of a substring.
+ * @haystack: a C string
+ * @needle: a substring
+ *
+ * Example:
+ *	int i;
+ *      i = strcount("aaa aaa", "a");  // i = 6;
+ *      i = strcount("aaa aaa", "ab"); // i = 0;
+ *      i = strcount("aaa aaa", "aa"); // i = 2;
+ */
+size_t strcount(const char *haystack, const char *needle);
+
+/**
+ * cisalnum - isalnum() which takes a char (and doesn't accept EOF)
+ * @c: a character
+ *
+ * Surprisingly, the standard ctype.h isalnum() takes an int, which
+ * must have the value of EOF (-1) or an unsigned char.  This variant
+ * takes a real char, and doesn't accept EOF.
+ */
+static inline bool cisalnum(char c)
+{
+	return isalnum((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisalpha(char c)
+{
+	return isalpha((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisascii(char c)
+{
+	return isascii((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+#if HAVE_ISBLANK
+static inline bool cisblank(char c)
+{
+	return isblank((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+#endif
+static inline bool ciscntrl(char c)
+{
+	return iscntrl((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisdigit(char c)
+{
+	return isdigit((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisgraph(char c)
+{
+	return isgraph((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cislower(char c)
+{
+	return islower((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisprint(char c)
+{
+	return isprint((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cispunct(char c)
+{
+	return ispunct((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisspace(char c)
+{
+	return isspace((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisupper(char c)
+{
+	return isupper((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+static inline bool cisxdigit(char c)
+{
+	return isxdigit((unsigned char)c) != 0;	/* cast: never pass a negative */
+}
+
+#include <ccan/str/str_debug.h>
+
+/* These checks force things out of line, hence they are under DEBUG. */
+#ifdef CCAN_STR_DEBUG
+#include <ccan/build_assert/build_assert.h>
+
+/* These are commonly misused: they take -1 or an *unsigned* char value. */
+#undef isalnum
+#undef isalpha
+#undef isascii
+#undef isblank
+#undef iscntrl
+#undef isdigit
+#undef isgraph
+#undef islower
+#undef isprint
+#undef ispunct
+#undef isspace
+#undef isupper
+#undef isxdigit
+
+/* You can use a char if char is unsigned. */
+#if HAVE_BUILTIN_TYPES_COMPATIBLE_P && HAVE_TYPEOF
+#define str_check_arg_(i)						\
+	((i) + BUILD_ASSERT_OR_ZERO(!__builtin_types_compatible_p(typeof(i), \
+								  char)	\
+				    || (char)255 > 0))
+#else
+#define str_check_arg_(i) (i)
+#endif
+
+#define isalnum(i) str_isalnum(str_check_arg_(i))
+#define isalpha(i) str_isalpha(str_check_arg_(i))
+#define isascii(i) str_isascii(str_check_arg_(i))
+#if HAVE_ISBLANK
+#define isblank(i) str_isblank(str_check_arg_(i))
+#endif
+#define iscntrl(i) str_iscntrl(str_check_arg_(i))
+#define isdigit(i) str_isdigit(str_check_arg_(i))
+#define isgraph(i) str_isgraph(str_check_arg_(i))
+#define islower(i) str_islower(str_check_arg_(i))
+#define isprint(i) str_isprint(str_check_arg_(i))
+#define ispunct(i) str_ispunct(str_check_arg_(i))
+#define isspace(i) str_isspace(str_check_arg_(i))
+#define isupper(i) str_isupper(str_check_arg_(i))
+#define isxdigit(i) str_isxdigit(str_check_arg_(i))
+
+#if HAVE_TYPEOF
+/* With GNU magic, we can make const-respecting standard string functions. */
+#undef strstr
+#undef strchr
+#undef strrchr
+
+/* + 0 is needed to decay array into pointer. */
+#define strstr(haystack, needle)					\
+	((typeof((haystack) + 0))str_strstr((haystack), (needle)))
+#define strchr(haystack, c)					\
+	((typeof((haystack) + 0))str_strchr((haystack), (c)))
+#define strrchr(haystack, c)					\
+	((typeof((haystack) + 0))str_strrchr((haystack), (c)))
+#endif
+#endif /* CCAN_STR_DEBUG */
+
+#endif /* CCAN_STR_H */
diff --git a/lib/ccan/str/str_debug.h b/lib/ccan/str/str_debug.h
new file mode 100644
index 0000000000..6b56477689
--- /dev/null
+++ b/lib/ccan/str/str_debug.h
@@ -0,0 +1,29 @@
+#ifndef CCAN_STR_DEBUG_H
+#define CCAN_STR_DEBUG_H
+
+/* #define CCAN_STR_DEBUG 1 */
+
+#ifdef CCAN_STR_DEBUG
+/* Because we mug the real ones with macros, we need our own wrappers. */
+int str_isalnum(int i);
+int str_isalpha(int i);
+int str_isascii(int i);
+#if HAVE_ISBLANK
+int str_isblank(int i);
+#endif
+int str_iscntrl(int i);
+int str_isdigit(int i);
+int str_isgraph(int i);
+int str_islower(int i);
+int str_isprint(int i);
+int str_ispunct(int i);
+int str_isspace(int i);
+int str_isupper(int i);
+int str_isxdigit(int i);
+
+char *str_strstr(const char *haystack, const char *needle);
+char *str_strchr(const char *s, int c);
+char *str_strrchr(const char *s, int c);
+#endif /* CCAN_STR_DEBUG */
+
+#endif /* CCAN_STR_DEBUG_H */
diff --git a/lib/ccan/str/test/compile_fail-isalnum.c b/lib/ccan/str/test/compile_fail-isalnum.c
new file mode 100644
index 0000000000..930defffa0
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isalnum.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isalnum.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isalnum(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isalpha.c b/lib/ccan/str/test/compile_fail-isalpha.c
new file mode 100644
index 0000000000..2005109829
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isalpha.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isalpha.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isalpha(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isascii.c b/lib/ccan/str/test/compile_fail-isascii.c
new file mode 100644
index 0000000000..ee55e49974
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isascii.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isascii.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isascii(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isblank.c b/lib/ccan/str/test/compile_fail-isblank.c
new file mode 100644
index 0000000000..f4cb961d74
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isblank.c
@@ -0,0 +1,26 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF || !HAVE_ISBLANK
+#error We need typeof to check isblank.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+#if HAVE_ISBLANK
+	return isblank(c);
+#else
+	return c;	/* no isblank() here: nothing to check */
+#endif
+}
diff --git a/lib/ccan/str/test/compile_fail-iscntrl.c b/lib/ccan/str/test/compile_fail-iscntrl.c
new file mode 100644
index 0000000000..bc74146542
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-iscntrl.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check iscntrl.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return iscntrl(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isdigit.c b/lib/ccan/str/test/compile_fail-isdigit.c
new file mode 100644
index 0000000000..71d1c71433
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isdigit.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isdigit.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isdigit(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-islower.c b/lib/ccan/str/test/compile_fail-islower.c
new file mode 100644
index 0000000000..ca3f9907e5
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-islower.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check islower.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return islower(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isprint.c b/lib/ccan/str/test/compile_fail-isprint.c
new file mode 100644
index 0000000000..6432e41d2b
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isprint.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isprint.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isprint(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-ispunct.c b/lib/ccan/str/test/compile_fail-ispunct.c
new file mode 100644
index 0000000000..5d941fcba6
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-ispunct.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check ispunct.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return ispunct(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isspace.c b/lib/ccan/str/test/compile_fail-isspace.c
new file mode 100644
index 0000000000..bfee1f89f1
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isspace.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isspace.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isspace(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isupper.c b/lib/ccan/str/test/compile_fail-isupper.c
new file mode 100644
index 0000000000..4cf9fd3578
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isupper.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isupper.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isupper(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-isxdigit.c b/lib/ccan/str/test/compile_fail-isxdigit.c
new file mode 100644
index 0000000000..65e6006a88
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-isxdigit.c
@@ -0,0 +1,22 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_BUILTIN_TYPES_COMPATIBLE_P || !HAVE_TYPEOF
+#error We need typeof to check isxdigit.
+#endif
+	char		/* FAIL build: a plain char argument */
+#else
+	unsigned char	/* normal build */
+#endif
+		c = argv[0][0];
+
+#ifdef FAIL
+	/* If plain char is unsigned the real check passes: fail by hand. */
+	BUILD_ASSERT((char)255 < 0);
+#endif
+
+	return isxdigit(c);
+}
diff --git a/lib/ccan/str/test/compile_fail-strchr.c b/lib/ccan/str/test/compile_fail-strchr.c
new file mode 100644
index 0000000000..74a7314d06
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-strchr.c
@@ -0,0 +1,18 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_TYPEOF
+	#error We need typeof to check strchr.
+#endif
+#else
+	const
+#endif
+		char *ret;	/* FAIL build omits the const qualifier */
+	const char *str = "hello";
+
+	ret = strchr(str, 'l');	/* typed like str under CCAN_STR_DEBUG + typeof */
+	return ret ? 0 : 1;
+}
diff --git a/lib/ccan/str/test/compile_fail-strrchr.c b/lib/ccan/str/test/compile_fail-strrchr.c
new file mode 100644
index 0000000000..ba7d17e031
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-strrchr.c
@@ -0,0 +1,18 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_TYPEOF
+	#error We need typeof to check strrchr.
+#endif
+#else
+	const
+#endif
+		char *ret;	/* FAIL build omits the const qualifier */
+	const char *str = "hello";
+
+	ret = strrchr(str, 'l');	/* typed like str under CCAN_STR_DEBUG + typeof */
+	return ret ? 0 : 1;
+}
diff --git a/lib/ccan/str/test/compile_fail-strstr.c b/lib/ccan/str/test/compile_fail-strstr.c
new file mode 100644
index 0000000000..deefef6542
--- /dev/null
+++ b/lib/ccan/str/test/compile_fail-strstr.c
@@ -0,0 +1,18 @@
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/str.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+#if !HAVE_TYPEOF
+	#error We need typeof to check strstr.
+#endif
+#else
+	const
+#endif
+		char *ret;	/* FAIL build omits the const qualifier */
+	const char *str = "hello";
+
+	ret = strstr(str, "hell");	/* typed like str under CCAN_STR_DEBUG + typeof */
+	return ret ? 0 : 1;
+}
diff --git a/lib/ccan/str/test/debug.c b/lib/ccan/str/test/debug.c
new file mode 100644
index 0000000000..4bd384f2c4
--- /dev/null
+++ b/lib/ccan/str/test/debug.c
@@ -0,0 +1,5 @@
+/* We can't use the normal "#include the .c file" trick, since this is
+   contaminated by str.h's macro overrides.  So we put it in all tests
+   like this. */
+#define CCAN_STR_DEBUG 1
+#include <ccan/str/debug.c>
diff --git a/lib/ccan/str/test/run.c b/lib/ccan/str/test/run.c
new file mode 100644
index 0000000000..a15654f8f3
--- /dev/null
+++ b/lib/ccan/str/test/run.c
@@ -0,0 +1,105 @@
+#include <ccan/str/str.h>
+#include <ccan/str/str.c>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ccan/tap/tap.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+
+static char *substrings[] = { "far", "bar", "baz", "b", "ba", "z", "ar", NULL };
+
+#define NUM_SUBSTRINGS (ARRAY_SIZE(substrings) - 1)
+
+static char *strdup_rev(const char *s)
+{
+	size_t i, len = strlen(s);	/* measured once, not on every iteration */
+	char *ret = strdup(s);		/* the copy already ends in a NUL */
+
+	for (i = 0; ret != NULL && i < len; i++)
+		ret[i] = s[len - i - 1];
+	return ret;			/* caller frees; NULL if strdup() failed */
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, n;
+	char *strings[NUM_SUBSTRINGS * NUM_SUBSTRINGS];	/* every substring pair, joined */
+
+	n = 0;
+	for (i = 0; i < NUM_SUBSTRINGS; i++) {
+		for (j = 0; j < NUM_SUBSTRINGS; j++) {
+			strings[n] = malloc(strlen(substrings[i])
+					    + strlen(substrings[j]) + 1);
+			sprintf(strings[n++], "%s%s",
+				substrings[i], substrings[j]);
+		}
+	}
+
+	plan_tests(n * n * 5 + 16);	/* 5 checks per pair + 16 fixed checks below */
+	for (i = 0; i < n; i++) {
+		for (j = 0; j < n; j++) {
+			unsigned int k, identical = 0;
+			char *reva, *revb;
+
+			/* Find first difference; k is left at that index. */
+			for (k = 0; strings[i][k]==strings[j][k]; k++) {
+				if (k == strlen(strings[i])) {
+					identical = 1;
+					break;
+				}
+			}
+
+			if (identical)
+				ok1(streq(strings[i], strings[j]));
+			else
+				ok1(!streq(strings[i], strings[j]));
+
+			/* Postfix test should be equivalent to prefix
+			 * test on reversed string. */
+			reva = strdup_rev(strings[i]);
+			revb = strdup_rev(strings[j]);
+
+			if (!strings[i][k]) {	/* strings[i] ran out first: it is a prefix */
+				ok1(strstarts(strings[j], strings[i]));
+				ok1(strends(revb, reva));
+			} else {
+				ok1(!strstarts(strings[j], strings[i]));
+				ok1(!strends(revb, reva));
+			}
+			if (!strings[j][k]) {	/* strings[j] ran out first: it is a prefix */
+				ok1(strstarts(strings[i], strings[j]));
+				ok1(strends(reva, revb));
+			} else {
+				ok1(!strstarts(strings[i], strings[j]));
+				ok1(!strends(reva, revb));
+			}
+			free(reva);
+			free(revb);
+		}
+	}
+
+	for (i = 0; i < n; i++)
+		free(strings[i]);
+
+	ok1(streq(stringify(NUM_SUBSTRINGS),
+		  "((sizeof(substrings) / sizeof(substrings[0])) - 1)"));
+	ok1(streq(stringify(ARRAY_SIZE(substrings)),
+		  "(sizeof(substrings) / sizeof(substrings[0]))"));
+	ok1(streq(stringify(i == 0), "i == 0"));
+
+	ok1(strcount("aaaaaa", "b") == 0);
+	ok1(strcount("aaaaaa", "a") == 6);
+	ok1(strcount("aaaaaa", "aa") == 3);
+	ok1(strcount("aaaaaa", "aaa") == 2);
+	ok1(strcount("aaaaaa", "aaaa") == 1);
+	ok1(strcount("aaaaaa", "aaaaa") == 1);
+	ok1(strcount("aaaaaa", "aaaaaa") == 1);
+	ok1(strcount("aaa aaa", "b") == 0);
+	ok1(strcount("aaa aaa", "a") == 6);
+	ok1(strcount("aaa aaa", "aa") == 2);
+	ok1(strcount("aaa aaa", "aaa") == 2);
+	ok1(strcount("aaa aaa", "aaaa") == 0);
+	ok1(strcount("aaa aaa", "aaaaa") == 0);
+
+	return exit_status();
+}
diff --git a/lib/ccan/tally/LICENSE b/lib/ccan/tally/LICENSE
new file mode 100644
index 0000000000..cca7fc278f
--- /dev/null
+++ b/lib/ccan/tally/LICENSE
@@ -0,0 +1,165 @@
+		   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/lib/ccan/tally/_info b/lib/ccan/tally/_info
new file mode 100644
index 0000000000..1d67274f5c
--- /dev/null
+++ b/lib/ccan/tally/_info
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * tally - running tally of integers
+ *
+ * The tally module implements simple analysis of a stream of integers.
+ * Numbers are fed in via tally_add(), and then the mean, median, mode and
+ * a histogram can be read out.
+ *
+ * Example:
+ *	#include <stdio.h>
+ *	#include <err.h>
+ *	#include <ccan/tally/tally.h>
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		struct tally *t;
+ *		unsigned int i;
+ *		size_t err;
+ *		ssize_t val;
+ *		char *histogram;
+ *
+ *		if (argc < 2)
+ *			errx(1, "Usage: %s <number>...\n", argv[0]);
+ *
+ *		t = tally_new(100);
+ *		for (i = 1; i < argc; i++)
+ *			tally_add(t, atol(argv[i]));
+ *
+ *		printf("Mean = %zi\n", tally_mean(t));
+ *		val = tally_approx_median(t, &err);
+ *		printf("Median = %zi (+/- %zu)\n", val, err);
+ *		val = tally_approx_mode(t, &err);
+ *		printf("Mode = %zi (+/- %zu)\n", val, err);
+ *		histogram = tally_histogram(t, 50, 10);
+ *		printf("Histogram:\n%s", histogram);
+ *		free(histogram);
+ *		return 0;
+ *	}
+ *
+ * License: LGPL (3 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+	/* Only the single-argument "depends" query is answered. */
+	if (argc == 2 && !strcmp(argv[1], "depends")) {
+		/* One dependency per line. */
+		printf("ccan/build_assert\n");
+		printf("ccan/likely\n");
+		return 0;
+	}
+
+	/* Wrong argument count or an unsupported query. */
+	return 1;
+}
diff --git a/lib/ccan/tally/tally.c b/lib/ccan/tally/tally.c
new file mode 100644
index 0000000000..b1839befe3
--- /dev/null
+++ b/lib/ccan/tally/tally.c
@@ -0,0 +1,490 @@
+#include <ccan/tally/tally.h>
+#include <ccan/build_assert/build_assert.h>
+#include <ccan/likely/likely.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#define SIZET_BITS (sizeof(size_t)*CHAR_BIT)
+
+/* We use power of 2 steps.  I tried being tricky, but it got buggy. */
+struct tally {
+	ssize_t min, max;
+	size_t total[2];
+	/* This allows limited frequency analysis. */
+	unsigned buckets, step_bits;
+	size_t counts[1 /* Actually: [buckets] */ ];
+};
+
+/* Allocate an empty tally (max < min marks "nothing added"); NULL on failure. */
+struct tally *tally_new(unsigned buckets)
+{
+	struct tally *tally;
+
+	/* There is always 1 bucket. */
+	if (buckets == 0)
+		buckets = 1;
+	/* counts[] beyond the one in the struct must not wrap the size. */
+	if (buckets - 1 > (SIZE_MAX - sizeof(*tally)) / sizeof(tally->counts[0]))
+		return NULL;
+	tally = malloc(sizeof(*tally) + sizeof(tally->counts[0])*(buckets-1));
+	if (tally) {
+		tally->max = ((size_t)1 << (SIZET_BITS - 1));
+		tally->min = ~tally->max;
+		tally->total[0] = tally->total[1] = 0;
+		tally->buckets = buckets;
+		tally->step_bits = 0;
+		memset(tally->counts, 0, sizeof(tally->counts[0])*buckets);
+	}
+	return tally;
+}
+
+/* Bucket index of val; unsigned math since val - min may exceed SSIZE_MAX. */
+static unsigned bucket_of(ssize_t min, unsigned step_bits, ssize_t val)
+{
+	if (step_bits == SIZET_BITS)	/* Don't over-shift. */
+		return 0;
+	assert(step_bits < SIZET_BITS);
+	return ((size_t)val - (size_t)min) >> step_bits;
+}
+
+/* Return the min value in bucket b.  The sum is formed as size_t so that a
+ * large offset wraps instead of overflowing a signed shift or add. */
+static ssize_t bucket_min(ssize_t min, unsigned step_bits, unsigned b)
+{
+	if (step_bits == SIZET_BITS)	/* Don't over-shift. */
+		return min;
+	assert(step_bits < SIZET_BITS);
+	return (ssize_t)((size_t)min + ((size_t)b << step_bits));
+}
+
+/* Does shifting by this many bits truncate the number?  Only a bit lost by
+ * the last single-bit step is detected, so raise bits one at a time. */
+static bool shift_overflows(size_t num, unsigned bits)
+{
+	if (bits >= sizeof(num) * CHAR_BIT)	/* Over-wide shifts are undefined. */
+		return num != 0;
+	return bits != 0 && ((num << bits) >> 1) != (num << (bits - 1));
+}
+
+/* Widen the tally to [new_min, new_max], shuffling the frequency counts. */
+static void renormalize(struct tally *tally, ssize_t new_min, ssize_t new_max)
+{
+	size_t span, spill;
+	unsigned int i, old_min;
+
+	/* Uninitialized?  Don't do anything... */
+	if (tally->max < tally->min)
+		goto update;
+
+	/* If we don't have sufficient range, increase step bits until buckets
+	 * cover it.  span is unsigned so the full ssize_t range can't wrap. */
+	span = (size_t)new_max - (size_t)new_min;
+	while (tally->step_bits < SIZET_BITS
+	       && !shift_overflows(tally->buckets, tally->step_bits)
+	       && span >= ((size_t)tally->buckets << tally->step_bits)) {
+		/* Collapse down. */
+		for (i = 1; i < tally->buckets; i++) {
+			tally->counts[i/2] += tally->counts[i];
+			tally->counts[i] = 0;
+		}
+		tally->step_bits++;
+	}
+
+	/* Now if minimum has dropped, move buckets up. */
+	old_min = bucket_of(new_min, tally->step_bits, tally->min);
+	memmove(tally->counts + old_min, tally->counts,
+		sizeof(tally->counts[0]) * (tally->buckets - old_min));
+	memset(tally->counts, 0, sizeof(tally->counts[0]) * old_min);
+
+	/* If we moved boundaries, adjust buckets to that ratio (size_t math). */
+	spill = tally->step_bits >= SIZET_BITS ? 0
+		: ((size_t)tally->min - new_min) % ((size_t)1 << tally->step_bits);
+	for (i = 0; spill && i < tally->buckets-1; i++) {
+		size_t adjust = (tally->counts[i] >> tally->step_bits) * spill;
+		tally->counts[i] -= adjust;
+		tally->counts[i+1] += adjust;
+	}
+
+update:
+	tally->min = new_min;
+	tally->max = new_max;
+}
+
+void tally_add(struct tally *tally, ssize_t val)
+{
+	ssize_t new_min = tally->min, new_max = tally->max;
+	bool need_renormalize = false;
+
+	if (val < tally->min) {
+		new_min = val;
+		need_renormalize = true;
+	}
+	if (val > tally->max) {
+		new_max = val;
+		need_renormalize = true;
+	}
+	if (need_renormalize)
+		renormalize(tally, new_min, new_max);
+
+	/* 128-bit arithmetic!  If we didn't want exact mean, we could just
+	 * pull it out of counts. */
+	if (val > 0 && tally->total[0] + val < tally->total[0])
+		tally->total[1]++;
+	else if (val < 0 && tally->total[0] + val > tally->total[0])
+		tally->total[1]--;
+	tally->total[0] += val;
+	tally->counts[bucket_of(tally->min, tally->step_bits, val)]++;
+}
+
+size_t tally_num(const struct tally *tally)
+{
+	size_t i, num = 0;
+	for (i = 0; i < tally->buckets; i++)
+		num += tally->counts[i];
+	return num;
+}
+
+ssize_t tally_min(const struct tally *tally)
+{
+	return tally->min;
+}
+
+ssize_t tally_max(const struct tally *tally)
+{
+	return tally->max;
+}
+
+/* FIXME: Own ccan module please! */
+static unsigned fls64(uint64_t val)
+{
+#if HAVE_BUILTIN_CLZL
+	if (val <= ULONG_MAX) {
+		/* This is significantly faster! */
+		return val ? sizeof(long) * CHAR_BIT - __builtin_clzl(val) : 0;
+	} else {
+#endif
+	uint64_t r = 64;
+
+	if (!val)
+		return 0;
+	if (!(val & 0xffffffff00000000ull)) {
+		val <<= 32;
+		r -= 32;
+	}
+	if (!(val & 0xffff000000000000ull)) {
+		val <<= 16;
+		r -= 16;
+	}
+	if (!(val & 0xff00000000000000ull)) {
+		val <<= 8;
+		r -= 8;
+	}
+	if (!(val & 0xf000000000000000ull)) {
+		val <<= 4;
+		r -= 4;
+	}
+	if (!(val & 0xc000000000000000ull)) {
+		val <<= 2;
+		r -= 2;
+	}
+	if (!(val & 0x8000000000000000ull)) {
+		val <<= 1;
+		r -= 1;
+	}
+	return r;
+#if HAVE_BUILTIN_CLZL
+	}
+#endif
+}
+
+/* This is stolen straight from Hacker's Delight. */
+static uint64_t divlu64(uint64_t u1, uint64_t u0, uint64_t v)
+{
+	const uint64_t b = 4294967296ULL; // Number base (32 bits).
+	uint32_t un[4],		  // Dividend and divisor
+		vn[2];		  // normalized and broken
+				  // up into halfwords.
+	uint32_t q[2];		  // Quotient as halfwords.
+	uint64_t un1, un0,	  // Dividend and divisor
+		vn0;		  // as fullwords.
+	uint64_t qhat;		  // Estimated quotient digit.
+	uint64_t rhat;		  // A remainder.
+	uint64_t p;		  // Product of two digits.
+	int64_t s, i, j, t, k;
+
+	if (u1 >= v)		  // If overflow, return the largest
+		return (uint64_t)-1; // possible quotient.
+
+	s = 64 - fls64(v);		  // 0 <= s <= 63.
+	vn0 = v << s;		  // Normalize divisor.
+	vn[1] = vn0 >> 32;	  // Break divisor up into
+	vn[0] = vn0 & 0xFFFFFFFF; // two 32-bit halves.
+
+	// Shift dividend left.
+	un1 = ((u1 << s) | (u0 >> (64 - s))) & (-s >> 63);
+	un0 = u0 << s;
+	un[3] = un1 >> 32;	  // Break dividend up into
+	un[2] = un1;		  // four 32-bit halfwords
+	un[1] = un0 >> 32;	  // Note: storing into
+	un[0] = un0;		  // halfwords truncates.
+
+	for (j = 1; j >= 0; j--) {
+		// Compute estimate qhat of q[j].
+		qhat = (un[j+2]*b + un[j+1])/vn[1];
+		rhat = (un[j+2]*b + un[j+1]) - qhat*vn[1];
+	again:
+		if (qhat >= b || qhat*vn[0] > b*rhat + un[j]) {
+			qhat = qhat - 1;
+			rhat = rhat + vn[1];
+			if (rhat < b) goto again;
+		}
+
+		// Multiply and subtract.
+		k = 0;
+		for (i = 0; i < 2; i++) {
+			p = qhat*vn[i];
+			t = un[i+j] - k - (p & 0xFFFFFFFF);
+			un[i+j] = t;
+			k = (p >> 32) - (t >> 32);
+		}
+		t = un[j+2] - k;
+		un[j+2] = t;
+
+		q[j] = qhat;		  // Store quotient digit.
+		if (t < 0) {		  // If we subtracted too
+			q[j] = q[j] - 1;  // much, add back.
+			k = 0;
+			for (i = 0; i < 2; i++) {
+				t = un[i+j] + vn[i] + k;
+				un[i+j] = t;
+				k = t >> 32;
+			}
+			un[j+2] = un[j+2] + k;
+		}
+	} // End j.
+
+	return q[1]*b + q[0];
+}
+
+static int64_t divls64(int64_t u1, uint64_t u0, int64_t v)
+{
+	int64_t q, uneg, vneg, diff, borrow;
+
+	uneg = u1 >> 63;	  // -1 if u < 0.
+	if (uneg) {		  // Compute the absolute
+		u0 = -u0;	  // value of the dividend u.
+		borrow = (u0 != 0);
+		u1 = -u1 - borrow;
+	}
+
+	vneg = v >> 63;		  // -1 if v < 0.
+	v = (v ^ vneg) - vneg;	  // Absolute value of v.
+
+	if ((uint64_t)u1 >= (uint64_t)v)
+		goto overflow;
+
+	q = divlu64(u1, u0, v);
+
+	diff = uneg ^ vneg;	  // Negate q if signs of
+	q = (q ^ diff) - diff;	  // u and v differed.
+
+	if ((diff ^ q) < 0 && q != 0) {	   // If overflow, return the largest
+	overflow:			   // possible neg. quotient.
+		q = 0x8000000000000000ULL;
+	}
+	return q;
+}
+
+ssize_t tally_mean(const struct tally *tally)
+{
+	size_t count = tally_num(tally);
+	if (!count)
+		return 0;
+
+	if (sizeof(tally->total[0]) == sizeof(uint32_t)) {
+		/* Use standard 64-bit arithmetic. */
+		int64_t total = tally->total[0]
+			| (((uint64_t)tally->total[1]) << 32);
+		return total / count;
+	}
+	return divls64(tally->total[1], tally->total[0], count);
+}
+
+/* Sum of all values; without @overflow it saturates at the ssize_t limits. */
+ssize_t tally_total(const struct tally *tally, ssize_t *overflow)
+{
+	if (overflow) {
+		*overflow = tally->total[1];
+		return tally->total[0];
+	}
+
+	/* If result is negative, make sure we can represent it. */
+	if (tally->total[1] & ((size_t)1 << (SIZET_BITS-1))) {
+		/* Must have only underflowed once (upper word all ones), and
+		 * must be able to represent result at ssize_t. */
+		if (tally->total[1] + 1 != 0 || (ssize_t)tally->total[0] >= 0) {
+			/* Underflow, return minimum. */
+			return (ssize_t)((size_t)1 << (SIZET_BITS - 1));
+		}
+	} else {
+		/* Result is positive, must not have overflowed, and must be
+		 * able to represent as ssize_t. */
+		if (tally->total[1] || (ssize_t)tally->total[0] < 0) {
+			/* Overflow.  Return maximum. */
+			return (ssize_t)~((size_t)1 << (SIZET_BITS - 1));
+		}
+	}
+	return tally->total[0];
+}
+
+/* Midpoint of bucket b; *err gets half the bucket's width, rounded up. */
+static ssize_t bucket_range(const struct tally *tally, unsigned b, size_t *err)
+{
+	ssize_t min, max;
+	size_t span;
+
+	min = bucket_min(tally->min, tally->step_bits, b);
+	max = b == tally->buckets - 1 ? tally->max
+		: bucket_min(tally->min, tally->step_bits, b+1) - 1;
+	/* Unsigned: max - min overflows ssize_t when one bucket spans it all. */
+	span = (size_t)max - (size_t)min;
+	/* FIXME: Think harder about cumulative error; is this enough? */
+	*err = span / 2 + (span & 1);
+	return min + (ssize_t)(span / 2);
+}
+
+ssize_t tally_approx_median(const struct tally *tally, size_t *err)
+{
+	size_t count = tally_num(tally), total = 0;
+	unsigned int i;
+
+	for (i = 0; i < tally->buckets; i++) {
+		total += tally->counts[i];
+		if (total * 2 >= count)
+			break;
+	}
+	return bucket_range(tally, i, err);
+}
+
+ssize_t tally_approx_mode(const struct tally *tally, size_t *err)
+{
+	unsigned int i, min_best = 0, max_best = 0;
+
+	for (i = 0; i < tally->buckets; i++) {
+		if (tally->counts[i] > tally->counts[min_best]) {
+			min_best = max_best = i;
+		} else if (tally->counts[i] == tally->counts[min_best]) {
+			max_best = i;
+		}
+	}
+
+	/* We can have more than one best, making our error huge. */
+	if (min_best != max_best) {
+		ssize_t min, max;
+		min = bucket_range(tally, min_best, err);
+		max = bucket_range(tally, max_best, err);
+		max += *err;
+		*err += (size_t)(max - min);
+		return min + (max - min) / 2;
+	}
+
+	return bucket_range(tally, min_best, err);
+}
+
+static unsigned get_max_bucket(const struct tally *tally)
+{
+	unsigned int i;
+
+	for (i = tally->buckets; i > 0; i--)
+		if (tally->counts[i-1])
+			break;
+	return i;
+}
+
+/* Draw the tally as ASCII rows, minimum at the bottom; NULL if out of memory. */
+char *tally_histogram(const struct tally *tally,
+		      unsigned width, unsigned height)
+{
+	unsigned int i, count, max_bucket;
+	size_t largest_bucket = 0;
+	struct tally *tmp;
+	char *graph, *p;
+
+	assert(width >= TALLY_MIN_HISTO_WIDTH);
+	assert(height >= TALLY_MIN_HISTO_HEIGHT);
+
+	/* Ignore unused buckets. */
+	max_bucket = get_max_bucket(tally);
+
+	/* FIXME: It'd be nice to smooth here... */
+	if (height >= max_bucket) {
+		height = max_bucket;
+		tmp = NULL;
+	} else {
+		/* Halve a copy's resolution until it fits.  FIXME: Antialias! */
+		tmp = tally_new(tally->buckets);
+		if (!tmp)
+			return NULL;
+		tmp->min = tally->min;
+		tmp->max = tally->max;
+		tmp->step_bits = tally->step_bits;
+		memcpy(tmp->counts, tally->counts,
+		       sizeof(tally->counts[0]) * tmp->buckets);
+		while ((max_bucket = get_max_bucket(tmp)) >= height) {
+			for (i = 1; i < tmp->buckets; i++) {
+				tmp->counts[i/2] += tmp->counts[i];
+				tmp->counts[i] = 0;
+			}
+			tmp->step_bits++;
+		}
+		tally = tmp;
+		height = max_bucket;
+	}
+
+	/* Figure out longest line, for scale. */
+	for (i = 0; i < tally->buckets; i++)
+		if (tally->counts[i] > largest_bucket)
+			largest_bucket = tally->counts[i];
+
+	p = graph = malloc((size_t)height * (width + 1) + 1);
+	if (!graph) {
+		free(tmp);
+		return NULL;
+	}
+
+	for (i = 0; i < height; i++) {
+		unsigned covered = 1, row;
+
+		/* People expect minimum at the bottom. */
+		row = height - i - 1;
+		count = (double)tally->counts[row] / largest_bucket * (width-1)+1;
+
+		if (row == 0)
+			covered = snprintf(p, width, "%zi", tally->min);
+		else if (row == height - 1)
+			covered = snprintf(p, width, "%zi", tally->max);
+		else if (row == bucket_of(tally->min, tally->step_bits, 0))
+			*p = '+';
+		else
+			*p = '|';
+
+		/* A truncated label is width-1 chars; don't step over its NUL. */
+		if (covered >= width)
+			covered = width - 1;
+		p += covered;
+
+		count = count > covered ? count - covered : 0;
+
+		memset(p, '*', count);
+		p += count;
+		*p = '\n';
+		p++;
+	}
+	*p = '\0';
+	free(tmp);
+	return graph;
+}
diff --git a/lib/ccan/tally/tally.h b/lib/ccan/tally/tally.h
new file mode 100644
index 0000000000..650e2656cd
--- /dev/null
+++ b/lib/ccan/tally/tally.h
@@ -0,0 +1,104 @@
+#ifndef CCAN_TALLY_H
+#define CCAN_TALLY_H
+#include "config.h"
+#include <sys/types.h>
+
+struct tally;
+
+/**
+ * tally_new - allocate the tally structure.
+ * @buckets: the number of frequency buckets.
+ *
+ * This allocates a tally structure using malloc().  The greater the value
+ * of @buckets, the more accurate tally_approx_median() and tally_approx_mode()
+ * and tally_histogram() will be, but more memory is consumed.  If you want
+ * to use tally_histogram(), the optimal bucket value is the same as its
+ * @height argument.
+ */
+struct tally *tally_new(unsigned int buckets);
+
+/**
+ * tally_add - add a value.
+ * @tally: the tally structure.
+ * @val: the value to add.
+ */
+void tally_add(struct tally *tally, ssize_t val);
+
+/**
+ * tally_num - how many times has tally_add been called?
+ * @tally: the tally structure.
+ */
+size_t tally_num(const struct tally *tally);
+
+/**
+ * tally_min - the minimum value passed to tally_add.
+ * @tally: the tally structure.
+ *
+ * Undefined if tally_num() == 0.
+ */
+ssize_t tally_min(const struct tally *tally);
+
+/**
+ * tally_max - the maximum value passed to tally_add.
+ * @tally: the tally structure.
+ *
+ * Undefined if tally_num() == 0.
+ */
+ssize_t tally_max(const struct tally *tally);
+
+/**
+ * tally_mean - the mean value passed to tally_add.
+ * @tally: the tally structure.
+ *
+ * Undefined if tally_num() == 0, but will not crash.
+ */
+ssize_t tally_mean(const struct tally *tally);
+
+/**
+ * tally_total - the total value passed to tally_add.
+ * @tally: the tally structure.
+ * @overflow: the overflow value (or NULL).
+ *
+ * If your total can't overflow a ssize_t, you don't need @overflow.
+ * Otherwise, @overflow is the upper ssize_t, and the return value should
+ * be treated as the lower size_t (ie. the sign bit is in @overflow).
+ */
+ssize_t tally_total(const struct tally *tally, ssize_t *overflow);
+
+/**
+ * tally_approx_median - the approximate median value passed to tally_add.
+ * @tally: the tally structure.
+ * @err: the error in the returned value (ie. real median is +/- @err).
+ *
+ * Undefined if tally_num() == 0, but will not crash.  Because we
+ * don't reallocate, we don't store all values, so this median cannot be
+ * exact.
+ */
+ssize_t tally_approx_median(const struct tally *tally, size_t *err);
+
+/**
+ * tally_approx_mode - the approximate mode value passed to tally_add.
+ * @tally: the tally structure.
+ * @err: the error in the returned value (ie. real mode is +/- @err).
+ *
+ * Undefined if tally_num() == 0, but will not crash.  Because we
+ * don't reallocate, we don't store all values, so this mode cannot be
+ * exact.  It could well be a value which was never passed to tally_add!
+ */
+ssize_t tally_approx_mode(const struct tally *tally, size_t *err);
+
+#define TALLY_MIN_HISTO_WIDTH 8
+#define TALLY_MIN_HISTO_HEIGHT 3
+
+/**
+ * tally_histogram - return an ASCII image of the tally_add distribution
+ * @tally: the tally structure.
+ * @width: the maximum string width to use (>= TALLY_MIN_HISTO_WIDTH)
+ * @height: the maximum string height to use (>= TALLY_MIN_HISTO_HEIGHT)
+ *
+ * Returns a malloc()ed string which draws a multi-line graph of the
+ * distribution of values.  On out of memory returns NULL.
+ */
+char *tally_histogram(const struct tally *tally,
+		      unsigned width, unsigned height);
+#endif /* CCAN_TALLY_H */
diff --git a/lib/ccan/tally/test/run-bucket_of.c b/lib/ccan/tally/test/run-bucket_of.c
new file mode 100644
index 0000000000..5e12725757
--- /dev/null
+++ b/lib/ccan/tally/test/run-bucket_of.c
@@ -0,0 +1,71 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	unsigned int i, max_step;
+	ssize_t min, max;
+
+	max = (ssize_t)~(1ULL << (sizeof(max)*CHAR_BIT - 1));
+	min = (ssize_t)(1ULL << (sizeof(max)*CHAR_BIT - 1));
+	max_step = sizeof(max)*CHAR_BIT;
+
+	plan_tests(2 + 100 + 10 + 5
+		   + 2 + 100 + 5 + 4
+		   + (1 << 7) * (max_step - 7));
+
+	/* Single step, single bucket == easy. */
+	ok1(bucket_of(0, 0, 0) == 0);
+
+	/* Double step, still in first bucket. */
+	ok1(bucket_of(0, 1, 0) == 0);
+
+	/* Step 8. */
+	for (i = 0; i < 100; i++)
+		ok1(bucket_of(0, 3, i) == i >> 3);
+
+	/* 10 values in 5 buckets, step 2. */
+	for (i = 0; i < 10; i++)
+		ok1(bucket_of(0, 1, i) == i >> 1);
+
+	/* Extreme cases. */
+	ok1(bucket_of(min, 0, min) == 0);
+	ok1(bucket_of(min, max_step-1, min) == 0);
+	ok1(bucket_of(min, max_step-1, max) == 1);
+	ok1(bucket_of(min, max_step, min) == 0);
+	ok1(bucket_of(min, max_step, max) == 0);
+
+	/* Now, bucket_min() should match: */
+	ok1(bucket_min(0, 0, 0) == 0);
+
+	/* Double step, val in first bucket still 0. */
+	ok1(bucket_min(0, 1, 0) == 0);
+
+	/* Step 8. */
+	for (i = 0; i < 100; i++)
+		ok1(bucket_min(0, 3, i) == i << 3);
+
+	/* 10 values in 5 buckets, step 2. */
+	for (i = 0; i < 5; i++)
+		ok1(bucket_min(0, 1, i) == i << 1);
+
+	/* Extreme cases. */
+	ok1(bucket_min(min, 0, 0) == min);
+	ok1(bucket_min(min, max_step-1, 0) == min);
+	ok1(bucket_min(min, max_step-1, 1) == 0);
+	ok1(bucket_min(min, max_step, 0) == min);
+
+	/* Now, vary step and number of buckets, but bucket_min and bucket_of
+	 * must agree. */
+	for (i = 0; i < (1 << 7); i++) {
+		unsigned int j;
+		for (j = 0; j < max_step - 7; j++) {
+			ssize_t val;
+
+			val = bucket_min(-(ssize_t)i, j, i);
+			ok1(bucket_of(-(ssize_t)i, j, val) == i);
+		}
+	}
+
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-divlu64.c b/lib/ccan/tally/test/run-divlu64.c
new file mode 100644
index 0000000000..057e47432c
--- /dev/null
+++ b/lib/ccan/tally/test/run-divlu64.c
@@ -0,0 +1,31 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	unsigned int i, j;
+
+	plan_tests(5985);
+	/* Simple tests. */
+	for (i = 0; i < 127; i++) {
+		uint64_t u1, u0;
+		if (i < 64) {
+			u1 = 0;
+			u0 = 1ULL << i;
+			j = 0;
+		} else {
+			u1 = 1ULL << (i - 64);
+			u0 = 0;
+			j = i - 63;
+		}
+		for (; j < 63; j++) {
+			uint64_t answer;
+			if (j > i)
+				answer = 0;
+			else
+				answer = 1ULL << (i - j);
+			ok1(divlu64(u1, u0, 1ULL << j) == answer);
+		}
+	}
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-histogram.c b/lib/ccan/tally/test/run-histogram.c
new file mode 100644
index 0000000000..a9894ecd85
--- /dev/null
+++ b/lib/ccan/tally/test/run-histogram.c
@@ -0,0 +1,108 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	int i;
+	struct tally *tally;
+	char *graph, *p;
+
+	plan_tests(100 + 1 + 10 + 1 + 100 + 1 + 10 + 1 + 10 * 2 + 1);
+
+	/* Uniform distribution, easy. */
+	tally = tally_new(100);
+	for (i = 0; i < 100; i++)
+		tally_add(tally, i);
+
+	/* 1:1 height. */
+	graph = p = tally_histogram(tally, 20, 100);
+	for (i = 0; i < 100; i++) {
+		char *eol = strchr(p, '\n');
+
+		/* We expect it filled all way to the end. */
+		ok1(eol - p == 20);
+		p = eol + 1;
+	}
+	ok1(!*p);
+	free(graph);
+
+	/* Reduced height. */
+	graph = p = tally_histogram(tally, 20, 10);
+	for (i = 0; i < 10; i++) {
+		char *eol = strchr(p, '\n');
+
+		/* First once can be truncated (bucket aliasing) */
+		if (eol) {
+			ok1(eol - p == 20 || (eol - p < 20 && i == 0));
+		} else
+			/* We should, at worst, half-fill graph */
+			ok1(i > 5);
+
+		if (eol)
+			p = eol + 1;
+	}
+	ok1(!*p);
+	free(graph);
+
+	/* Enlarged height (gets capped). */
+	graph = p = tally_histogram(tally, 20, 1000);
+	for (i = 0; i < 100; i++) {
+		char *eol = strchr(p, '\n');
+		/* We expect it filled all way to the end. */
+		ok1(eol - p == 20);
+		p = eol + 1;
+	}
+	ok1(!*p);
+	free(graph);
+	free(tally);
+
+	/* Distinctive increasing pattern. */
+	tally = tally_new(10);
+	for (i = 0; i < 10; i++) {
+		unsigned int j;
+		for (j = 0; j <= i; j++)
+			tally_add(tally, i);
+	}
+
+	graph = p = tally_histogram(tally, 10, 10);
+	for (i = 0; i < 10; i++) {
+		char *eol = strchr(p, '\n');
+		ok1(eol - p == 10 - i);
+		p = eol + 1;
+	}
+	ok1(!*p);
+	diag("Here's the pretty: %s", graph);
+	free(graph);
+	free(tally);
+
+	/* With negative values. */
+	tally = tally_new(10);
+	for (i = 0; i < 10; i++) {
+		tally_add(tally, i - 5);
+	}
+
+	graph = p = tally_histogram(tally, 10, 10);
+	for (i = 0; i < 10; i++) {
+		char *eol = strchr(p, '\n');
+
+		/* We expect it filled all way to the end. */
+		ok1(eol - p == 10);
+
+		/* Check min/max labels. */
+		if (i == 0)
+			ok1(strncmp(p, "4*", 2) == 0);
+		else if (i == 9)
+			ok1(strncmp(p, "-5*", 3) == 0);
+		else if (i == 4)
+			ok1(p[0] == '+'); /* 0 marker */
+		else
+			ok1(p[0] == '|');
+		p = eol + 1;
+	}
+	ok1(!*p);
+	diag("Here's the pretty: %s", graph);
+	free(graph);
+	free(tally);
+
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-mean.c b/lib/ccan/tally/test/run-mean.c
new file mode 100644
index 0000000000..b43dea6b28
--- /dev/null
+++ b/lib/ccan/tally/test/run-mean.c
@@ -0,0 +1,30 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	int i;
+	struct tally *tally = tally_new(0);
+	ssize_t min, max;
+
+	max = (ssize_t)~(1ULL << (sizeof(max)*CHAR_BIT - 1));
+	min = (ssize_t)(1ULL << (sizeof(max)*CHAR_BIT - 1));
+
+	plan_tests(100 + 100);
+	/* Simple mean test: should always be 0. */
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, i);
+		tally_add(tally, -i);
+		ok1(tally_mean(tally) == 0);
+	}
+
+	/* Works for big values too... */
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, max - i);
+		tally_add(tally, min + 1 + i);
+		ok1(tally_mean(tally) == 0);
+	}
+
+	free(tally);
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-median.c b/lib/ccan/tally/test/run-median.c
new file mode 100644
index 0000000000..b12fd8a021
--- /dev/null
+++ b/lib/ccan/tally/test/run-median.c
@@ -0,0 +1,46 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	int i;
+	struct tally *tally = tally_new(100);
+	ssize_t min, max, median;
+	size_t err;
+
+	max = (ssize_t)~(1ULL << (sizeof(max)*CHAR_BIT - 1));
+	min = (ssize_t)(1ULL << (sizeof(max)*CHAR_BIT - 1));
+
+	plan_tests(100*2 + 100*2 + 100*2);
+	/* Simple median test: should always be around 0. */
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, i);
+		tally_add(tally, -i);
+		median = tally_approx_median(tally, &err);
+		ok1(err <= 4);
+		ok1(median - (ssize_t)err <= 0 && median + (ssize_t)err >= 0);
+	}
+
+	/* Works for big values too... */
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, max - i);
+		tally_add(tally, min + 1 + i);
+		median = tally_approx_median(tally, &err);
+		/* Error should be < 100th of max - min. */
+		ok1(err <= max / 100 * 2);
+		ok1(median - (ssize_t)err <= 0 && median + (ssize_t)err >= 0);
+	}
+	free(tally);
+
+	tally = tally_new(10);
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, i);
+		median = tally_approx_median(tally, &err);
+		ok1(err <= i / 10 + 1);
+		ok1(median - (ssize_t)err <= i/2
+		    && median + (ssize_t)err >= i/2);
+	}
+	free(tally);
+
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-min-max.c b/lib/ccan/tally/test/run-min-max.c
new file mode 100644
index 0000000000..c92f6d382a
--- /dev/null
+++ b/lib/ccan/tally/test/run-min-max.c
@@ -0,0 +1,21 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	int i;
+	struct tally *tally = tally_new(0);
+
+	plan_tests(100 * 4);
+	/* Test max, min and num. */
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, i);
+		ok1(tally_num(tally) == i*2 + 1);
+		tally_add(tally, -i);
+		ok1(tally_num(tally) == i*2 + 2);
+		ok1(tally_max(tally) == i);
+		ok1(tally_min(tally) == -i);
+	}
+	free(tally);
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-mode.c b/lib/ccan/tally/test/run-mode.c
new file mode 100644
index 0000000000..cd2f230443
--- /dev/null
+++ b/lib/ccan/tally/test/run-mode.c
@@ -0,0 +1,46 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	int i;
+	struct tally *tally = tally_new(100);
+	ssize_t min, max, mode;
+	size_t err;
+
+	max = (ssize_t)~(1ULL << (sizeof(max)*CHAR_BIT - 1));
+	min = (ssize_t)(1ULL << (sizeof(max)*CHAR_BIT - 1));
+
+	plan_tests(100 + 50 + 100 + 100 + 10);
+	/* Simple mode test: should always be around 0 (we add that twice). */
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, i);
+		tally_add(tally, -i);
+		mode = tally_approx_mode(tally, &err);
+		if (i < 50)
+			ok1(err == 0);
+		ok1(mode - (ssize_t)err <= 0 && mode + (ssize_t)err >= 0);
+	}
+
+	/* Works for big values too... */
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, max - i);
+		tally_add(tally, min + 1 + i);
+		mode = tally_approx_mode(tally, &err);
+		ok1(mode - (ssize_t)err <= 0 && mode + (ssize_t)err >= 0);
+	}
+	free(tally);
+
+	tally = tally_new(10);
+	tally_add(tally, 0);
+	for (i = 0; i < 100; i++) {
+		tally_add(tally, i);
+		mode = tally_approx_mode(tally, &err);
+		if (i < 10)
+			ok1(err == 0);
+		ok1(mode - (ssize_t)err <= 0 && mode + (ssize_t)err >= 0);
+	}
+
+	free(tally);
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-renormalize.c b/lib/ccan/tally/test/run-renormalize.c
new file mode 100644
index 0000000000..8fe9dbce32
--- /dev/null
+++ b/lib/ccan/tally/test/run-renormalize.c
@@ -0,0 +1,26 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	struct tally *tally = tally_new(2);
+
+	plan_tests(4);
+	tally->min = 0;
+	tally->max = 0;
+	tally->counts[0] = 1;
+
+	/* This renormalize should do nothing. */
+	renormalize(tally, 0, 1);
+	ok1(tally->counts[0] == 1);
+	ok1(tally->counts[1] == 0);
+	tally->counts[1]++;
+
+	/* This renormalize should collapse both into bucket 0. */
+	renormalize(tally, 0, 3);
+	ok1(tally->counts[0] == 2);
+	ok1(tally->counts[1] == 0);
+
+	free(tally);
+	return exit_status();
+}
diff --git a/lib/ccan/tally/test/run-total.c b/lib/ccan/tally/test/run-total.c
new file mode 100644
index 0000000000..d7d73e58a5
--- /dev/null
+++ b/lib/ccan/tally/test/run-total.c
@@ -0,0 +1,56 @@
+#include <ccan/tally/tally.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	struct tally *tally;
+	ssize_t total, overflow;
+	ssize_t min, max;
+
+	max = (ssize_t)~(1ULL << (sizeof(max)*CHAR_BIT - 1));
+	min = (ssize_t)(1ULL << (sizeof(max)*CHAR_BIT - 1));
+
+	plan_tests(15);
+
+	/* Simple case. */
+	tally = tally_new(0);
+	tally_add(tally, min);
+	ok1(tally_total(tally, NULL) == min);
+	ok1(tally_total(tally, &overflow) == min);
+	ok1(overflow == -1);
+
+	/* Underflow. */
+	tally_add(tally, min);
+	total = tally_total(tally, &overflow);
+	ok1(overflow == -1);
+	ok1((size_t)total == 0);
+	ok1(tally_total(tally, NULL) == min);
+	free(tally);
+
+	/* Simple case. */
+	tally = tally_new(0);
+	tally_add(tally, max);
+	ok1(tally_total(tally, NULL) == max);
+	ok1(tally_total(tally, &overflow) == max);
+	ok1(overflow == 0);
+
+	/* Overflow into sign bit... */
+	tally_add(tally, max);
+	total = tally_total(tally, &overflow);
+	ok1(overflow == 0);
+	ok1((size_t)total == (size_t)-2);
+	ok1(tally_total(tally, NULL) == max);
+
+	/* Overflow into upper size_t. */
+	tally_add(tally, max);
+	total = tally_total(tally, &overflow);
+	ok1(overflow == 1);
+	if (sizeof(size_t) == 4)
+		ok1((size_t)total == 0x7FFFFFFD);
+	else if (sizeof(size_t) == 8)
+		ok1((size_t)total == 0x7FFFFFFFFFFFFFFDULL);
+	ok1(tally_total(tally, NULL) == max);
+	free(tally);
+
+	return exit_status();
+}
diff --git a/lib/ccan/typesafe_cb/LICENSE b/lib/ccan/typesafe_cb/LICENSE
new file mode 100644
index 0000000000..5522aa5f33
--- /dev/null
+++ b/lib/ccan/typesafe_cb/LICENSE
@@ -0,0 +1,508 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/lib/ccan/typesafe_cb/_info b/lib/ccan/typesafe_cb/_info
new file mode 100644
index 0000000000..4f4570afc9
--- /dev/null
+++ b/lib/ccan/typesafe_cb/_info
@@ -0,0 +1,151 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * typesafe_cb - macros for safe callbacks.
+ *
+ * The basis of the typesafe_cb header is typesafe_cb_cast(): a
+ * conditional cast macro.   If an expression exactly matches a given
+ * type, it is cast to the target type, otherwise it is left alone.
+ *
+ * This allows us to create functions which take a small number of
+ * specific types, rather than being forced to use a void *.  In
+ * particular, it is useful for creating typesafe callbacks as the
+ * helpers typesafe_cb(), typesafe_cb_preargs() and
+ * typesafe_cb_postargs() demonstrate.
+ *
+ * The standard way of passing arguments to callback functions in C is
+ * to use a void pointer, which the callback then casts back to the
+ * expected type.  This unfortunately subverts the type checking the
+ * compiler would perform if it were a direct call.  Here's an example:
+ *
+ *	static void my_callback(void *_obj)
+ *	{
+ *		struct obj *obj = _obj;
+ *		...
+ *	}
+ *	...
+ *		register_callback(my_callback, &my_obj);
+ *
+ * If we wanted to use the natural type for my_callback (ie. "void
+ * my_callback(struct obj *obj)"), we could make register_callback()
+ * take a void * as its first argument, but this would subvert all
+ * type checking.  We really want register_callback() to accept only
+ * the exactly correct function type to match the argument, or a
+ * function which takes a void *.
+ *
+ * This is where typesafe_cb() comes in: it uses typesafe_cb_cast() to
+ * cast the callback function if it matches the argument type:
+ *
+ *	void _register_callback(void (*cb)(void *arg), void *arg);
+ *	#define register_callback(cb, arg)				\
+ *		_register_callback(typesafe_cb(void, void *, (cb), (arg)), \
+ *				   (arg))
+ *
+ * On compilers which don't support the required extensions,
+ * typesafe_cb_cast() and friends become unconditional casts, so your
+ * code will compile but you won't get type checking.
+ *
+ * Example:
+ *	#include <ccan/typesafe_cb/typesafe_cb.h>
+ *	#include <stdlib.h>
+ *	#include <stdio.h>
+ *
+ *	// Generic callback infrastructure.
+ *	struct callback {
+ *		struct callback *next;
+ *		int value;
+ *		int (*callback)(int value, void *arg);
+ *		void *arg;
+ *	};
+ *	static struct callback *callbacks;
+ *
+ *	static void _register_callback(int value, int (*cb)(int, void *),
+ *				       void *arg)
+ *	{
+ *		struct callback *new = malloc(sizeof(*new));
+ *		new->next = callbacks;
+ *		new->value = value;
+ *		new->callback = cb;
+ *		new->arg = arg;
+ *		callbacks = new;
+ *	}
+ *	#define register_callback(value, cb, arg)			\
+ *		_register_callback(value,				\
+ *				   typesafe_cb_preargs(int, void *,	\
+ *						       (cb), (arg), int),\
+ *				   (arg))
+ *
+ *	static struct callback *find_callback(int value)
+ *	{
+ *		struct callback *i;
+ *
+ *		for (i = callbacks; i; i = i->next)
+ *			if (i->value == value)
+ *				return i;
+ *		return NULL;
+ *	}
+ *
+ *	// Define several silly callbacks.  Note they don't use void *!
+ *	#define DEF_CALLBACK(name, op)			\
+ *		static int name(int val, int *arg)	\
+ *		{					\
+ *			printf("%s", #op);		\
+ *			return val op *arg;		\
+ *		}
+ *	DEF_CALLBACK(multiply, *);
+ *	DEF_CALLBACK(add, +);
+ *	DEF_CALLBACK(divide, /);
+ *	DEF_CALLBACK(sub, -);
+ *	DEF_CALLBACK(or, |);
+ *	DEF_CALLBACK(and, &);
+ *	DEF_CALLBACK(xor, ^);
+ *	DEF_CALLBACK(assign, =);
+ *
+ *	// Silly game to find the longest chain of values.
+ *	int main(int argc, char *argv[])
+ *	{
+ *		int i, run = 1, num = argv[1] ? atoi(argv[1]) : 0;
+ *
+ *		for (i = 1; i < 1024;) {
+ *			// Since &run is an int *, the compiler checks "add" takes one too.
+ *			register_callback(i++, add, &run);
+ *			register_callback(i++, divide, &run);
+ *			register_callback(i++, sub, &run);
+ *			register_callback(i++, multiply, &run);
+ *			register_callback(i++, or, &run);
+ *			register_callback(i++, and, &run);
+ *			register_callback(i++, xor, &run);
+ *			register_callback(i++, assign, &run);
+ *		}
+ *
+ *		printf("%i ", num);
+ *		while (run < 56) {
+ *			struct callback *cb = find_callback(num % i);
+ *			if (!cb) {
+ *				printf("-> STOP\n");
+ *				return 1;
+ *			}
+ *			num = cb->callback(num, cb->arg);
+ *			printf("->%i ", num);
+ *			run++;
+ *		}
+ *		printf("-> Winner!\n");
+ *		return 0;
+ *	}
+ *
+ * License: LGPL (2 or any later version)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+	/* Exactly one argument (the query name) is required. */
+	if (argc != 2)
+		return 1;
+
+	/* Only "depends" is recognised; it succeeds without printing anything. */
+	if (strcmp(argv[1], "depends") != 0)
+		return 1;
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-cast_if_any.c b/lib/ccan/typesafe_cb/test/compile_fail-cast_if_any.c
new file mode 100644
index 0000000000..dfb51167ff
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-cast_if_any.c
@@ -0,0 +1,42 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+struct foo {
+	int x;
+};
+
+struct bar {
+	int x;
+};
+
+struct baz {
+	int x;
+};
+
+struct any {
+	int x;
+};
+
+struct other {
+	int x;
+};
+
+static void take_any(struct any *any)
+{
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	struct other
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_if_type is a noop."
+#endif
+#else
+	struct foo
+#endif
+		*arg = NULL;
+	take_any(cast_if_any(struct any *, arg, arg,
+			     struct foo *, struct bar *, struct baz *));
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-cast_if_type-promotable.c b/lib/ccan/typesafe_cb/test/compile_fail-cast_if_type-promotable.c
new file mode 100644
index 0000000000..11d42f4c6b
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-cast_if_type-promotable.c
@@ -0,0 +1,23 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdbool.h>
+
+static void _set_some_value(void *val)
+{
+}
+
+#define set_some_value(expr)						\
+	_set_some_value(typesafe_cb_cast(void *, long, (expr)))
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	bool x = 0;
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if typesafe_cb_cast is a noop."
+#endif
+#else
+	long x = 0;
+#endif
+	set_some_value(x);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-cast_if_type.c b/lib/ccan/typesafe_cb/test/compile_fail-cast_if_type.c
new file mode 100644
index 0000000000..610793514f
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-cast_if_type.c
@@ -0,0 +1,25 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+
+void _set_some_value(void *val);
+
+void _set_some_value(void *val)
+{
+}
+
+#define set_some_value(expr)						\
+	_set_some_value(cast_if_type(void *, (expr), (expr), unsigned long))
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	int x = 0;
+	set_some_value(x);
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_if_type is a noop."
+#endif
+#else
+	void *p = 0;
+	set_some_value(p);
+#endif
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb-int.c b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb-int.c
new file mode 100644
index 0000000000..c4033364d4
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb-int.c
@@ -0,0 +1,27 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+void _callback(void (*fn)(void *arg), void *arg);
+void _callback(void (*fn)(void *arg), void *arg)
+{
+	fn(arg);
+}
+
+/* Callback is set up to warn if arg isn't a pointer (since it won't
+ * pass cleanly to _callback's second arg). */
+#define callback(fn, arg)						\
+	_callback(typesafe_cb(void, void *, (fn), (arg)), (arg))
+
+void my_callback(int something);
+void my_callback(int something)
+{
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	/* This fails due to arg, not due to cast. */
+	callback(my_callback, 100);
+#endif
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb.c b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb.c
new file mode 100644
index 0000000000..81e36d7b87
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb.c
@@ -0,0 +1,34 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+static void _register_callback(void (*cb)(void *arg), void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb(void, void *, (cb), (arg)), (arg))
+
+static void my_callback(char *p)
+{
+}
+
+int main(int argc, char *argv[])
+{
+	char str[] = "hello world";
+#ifdef FAIL
+	int *p;
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if typesafe_cb_cast is a noop."
+#endif
+#else
+	char *p;
+#endif
+	p = NULL;
+
+	/* This should work always. */
+	register_callback(my_callback, str);
+
+	/* This will fail with FAIL defined */
+	register_callback(my_callback, p);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_cast-multi.c b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_cast-multi.c
new file mode 100644
index 0000000000..62b5f91e18
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_cast-multi.c
@@ -0,0 +1,43 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+struct foo {
+	int x;
+};
+
+struct bar {
+	int x;
+};
+
+struct baz {
+	int x;
+};
+
+struct any {
+	int x;
+};
+
+struct other {
+	int x;
+};
+
+static void take_any(struct any *any)
+{
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	struct other
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if typesafe_cb_cast is a noop."
+#endif
+#else
+	struct foo
+#endif
+		*arg = NULL;
+	take_any(typesafe_cb_cast3(struct any *,
+				   struct foo *, struct bar *, struct baz *,
+				   arg));
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_cast.c b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_cast.c
new file mode 100644
index 0000000000..d2e6f2ab40
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_cast.c
@@ -0,0 +1,25 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+
+void _set_some_value(void *val);
+
+void _set_some_value(void *val)
+{
+}
+
+#define set_some_value(expr)						\
+	_set_some_value(typesafe_cb_cast(void *, unsigned long, (expr)))
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	int x = 0;
+	set_some_value(x);
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if typesafe_cb_cast is a noop."
+#endif
+#else
+	void *p = 0;
+	set_some_value(p);
+#endif
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_exact.c b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_exact.c
new file mode 100644
index 0000000000..0f61d5decd
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_exact.c
@@ -0,0 +1,33 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+static void _register_callback(void (*cb)(void *arg), const void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb_exact(void, (cb), (arg)), (arg))
+
+static void my_callback(const char *p)
+{
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	char *p;
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if cast_if_type is a noop."
+#endif
+#else
+	const char *p;
+#endif
+	p = NULL;
+
+	/* This should work always. */
+	register_callback(my_callback, (const char *)"hello world");
+
+	/* This will fail with FAIL defined */
+	register_callback(my_callback, p);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_postargs.c b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_postargs.c
new file mode 100644
index 0000000000..7d3530851d
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_postargs.c
@@ -0,0 +1,27 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+static void _register_callback(void (*cb)(void *arg, int x), void *arg)
+{
+}
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb_postargs(void, void *, (cb), (arg), int), (arg))
+
+static void my_callback(char *p, int x)
+{
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	int *p;
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if typesafe_cb_cast is a noop."
+#endif
+#else
+	char *p;
+#endif
+	p = NULL;
+	register_callback(my_callback, p);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_preargs.c b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_preargs.c
new file mode 100644
index 0000000000..bd55c6722c
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_fail-typesafe_cb_preargs.c
@@ -0,0 +1,28 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+static void _register_callback(void (*cb)(int x, void *arg), void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb_preargs(void, void *, (cb), (arg), int), (arg))
+
+static void my_callback(int x, char *p)
+{
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+	int *p;
+#if !HAVE_TYPEOF||!HAVE_BUILTIN_CHOOSE_EXPR||!HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#error "Unfortunately we don't fail if typesafe_cb_cast is a noop."
+#endif
+#else
+	char *p;
+#endif
+	p = NULL;
+	register_callback(my_callback, p);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-cast_if_any.c b/lib/ccan/typesafe_cb/test/compile_ok-cast_if_any.c
new file mode 100644
index 0000000000..e8f3c49406
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-cast_if_any.c
@@ -0,0 +1,41 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+struct foo {
+	int x;
+};
+
+struct bar {
+	int x;
+};
+
+struct baz {
+	int x;
+};
+
+struct any {
+	int x;
+};
+
+static void take_any(struct any *any)
+{
+}
+
+int main(int argc, char *argv[])
+{
+#if HAVE_TYPEOF
+	/* Otherwise we get unused warnings for these. */
+	struct foo *foo = NULL;
+	struct bar *bar = NULL;
+	struct baz *baz = NULL;
+#endif
+	struct other *arg = NULL;
+
+	take_any(cast_if_any(struct any *, arg, foo,
+			     struct foo *, struct bar *, struct baz *));
+	take_any(cast_if_any(struct any *, arg, bar,
+			     struct foo *, struct bar *, struct baz *));
+	take_any(cast_if_any(struct any *, arg, baz,
+			     struct foo *, struct bar *, struct baz *));
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-NULL.c b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-NULL.c
new file mode 100644
index 0000000000..265de8b14e
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-NULL.c
@@ -0,0 +1,17 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+/* A NULL callback function should be OK. */
+
+static void _register_callback(void (*cb)(const void *arg), const void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb(void, const void *, (cb), (arg)), (arg))
+
+int main(int argc, char *argv[])
+{
+	register_callback(NULL, "hello world");
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-const.c b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-const.c
new file mode 100644
index 0000000000..7c2d62ef23
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-const.c
@@ -0,0 +1,50 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+/* const args in callbacks should be OK. */
+
+static void _register_callback(void (*cb)(void *arg), void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb(void, (cb), (arg)), (arg))
+
+#define register_callback_def(cb, arg)				\
+	_register_callback(typesafe_cb_def(void, (cb), (arg)), (arg))
+
+static void _register_callback_pre(void (*cb)(int x, void *arg), void *arg)
+{
+}
+
+#define register_callback_pre(cb, arg)					\
+	_register_callback_pre(typesafe_cb_preargs(void, (cb), (arg), int), (arg))
+
+static void _register_callback_post(void (*cb)(void *arg, int x), void *arg)
+{
+}
+
+#define register_callback_post(cb, arg)					\
+	_register_callback_post(typesafe_cb_postargs(void, (cb), (arg), int), (arg))
+
+static void my_callback(const char *p)
+{
+}
+
+static void my_callback_pre(int x, /*const*/ char *p)
+{
+}
+
+static void my_callback_post(/*const*/ char *p, int x)
+{
+}
+
+int main(int argc, char *argv[])
+{
+	char p[] = "hello world";
+	register_callback(my_callback, p);
+	register_callback_def(my_callback, p);
+	register_callback_pre(my_callback_pre, p);
+	register_callback_post(my_callback_post, p);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-undefined.c b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-undefined.c
new file mode 100644
index 0000000000..aa50bad6a9
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-undefined.c
@@ -0,0 +1,49 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+/* Pointers to an undefined (incomplete) struct type should be OK. */
+
+static void _register_callback(void (*cb)(void *arg), void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb(void, void *, (cb), (arg)), (arg))
+
+static void _register_callback_pre(void (*cb)(int x, void *arg), void *arg)
+{
+}
+
+#define register_callback_pre(cb, arg)					\
+	_register_callback_pre(typesafe_cb_preargs(void, void *, (cb), (arg), int), (arg))
+
+static void _register_callback_post(void (*cb)(void *arg, int x), void *arg)
+{
+}
+
+#define register_callback_post(cb, arg)					\
+	_register_callback_post(typesafe_cb_postargs(void, void *, (cb), (arg), int), (arg))
+
+struct undefined;
+
+static void my_callback(struct undefined *undef)
+{
+}
+
+static void my_callback_pre(int x, struct undefined *undef)
+{
+}
+
+static void my_callback_post(struct undefined *undef, int x)
+{
+}
+
+int main(int argc, char *argv[])
+{
+	struct undefined *handle = NULL;
+
+	register_callback(my_callback, handle);
+	register_callback_pre(my_callback_pre, handle);
+	register_callback_post(my_callback_post, handle);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-vars.c b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-vars.c
new file mode 100644
index 0000000000..f6a2bfecbc
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-vars.c
@@ -0,0 +1,52 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+/* Callbacks passed via function-pointer variables should be OK. */
+
+static void _register_callback(void (*cb)(void *arg), void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb(void, void *, (cb), (arg)), (arg))
+
+static void _register_callback_pre(void (*cb)(int x, void *arg), void *arg)
+{
+}
+
+#define register_callback_pre(cb, arg)					\
+	_register_callback_pre(typesafe_cb_preargs(void, void *, (cb), (arg), int), (arg))
+
+static void _register_callback_post(void (*cb)(void *arg, int x), void *arg)
+{
+}
+
+#define register_callback_post(cb, arg)					\
+	_register_callback_post(typesafe_cb_postargs(void, void *, (cb), (arg), int), (arg))
+
+struct undefined;
+
+static void my_callback(struct undefined *undef)
+{
+}
+
+static void my_callback_pre(int x, struct undefined *undef)
+{
+}
+
+static void my_callback_post(struct undefined *undef, int x)
+{
+}
+
+int main(int argc, char *argv[])
+{
+	struct undefined *handle = NULL;
+	void (*cb)(struct undefined *undef) = my_callback;
+	void (*pre)(int x, struct undefined *undef) = my_callback_pre;
+	void (*post)(struct undefined *undef, int x) = my_callback_post;
+
+	register_callback(cb, handle);
+	register_callback_pre(pre, handle);
+	register_callback_post(post, handle);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-volatile.c b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-volatile.c
new file mode 100644
index 0000000000..3fcb1ff656
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb-volatile.c
@@ -0,0 +1,47 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+/* volatile args in callbacks should be OK. */
+
+static void _register_callback(void (*cb)(void *arg), void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb(void, (cb), (arg)), (arg))
+
+static void _register_callback_pre(void (*cb)(int x, void *arg), void *arg)
+{
+}
+
+#define register_callback_pre(cb, arg)					\
+	_register_callback_pre(typesafe_cb_preargs(void, (cb), (arg), int), (arg))
+
+static void _register_callback_post(void (*cb)(void *arg, int x), void *arg)
+{
+}
+
+#define register_callback_post(cb, arg)					\
+	_register_callback_post(typesafe_cb_postargs(void, (cb), (arg), int), (arg))
+
+static void my_callback(volatile char *p)
+{
+}
+
+/* FIXME: Can't handle volatile for these */
+static void my_callback_pre(int x, /* volatile */ char *p)
+{
+}
+
+static void my_callback_post(/* volatile */ char *p, int x)
+{
+}
+
+int main(int argc, char *argv[])
+{
+	char p[] = "hello world";
+	register_callback(my_callback, p);
+	register_callback_pre(my_callback_pre, p);
+	register_callback_post(my_callback_post, p);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb_cast.c b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb_cast.c
new file mode 100644
index 0000000000..b7f21dc094
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb_cast.c
@@ -0,0 +1,41 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+struct foo {
+	int x;
+};
+
+struct bar {
+	int x;
+};
+
+struct baz {
+	int x;
+};
+
+struct any {
+	int x;
+};
+
+static void take_any(struct any *any)
+{
+}
+
+int main(int argc, char *argv[])
+{
+	/* One pointer of each type that typesafe_cb_cast3() accepts below. */
+	struct foo *foo = NULL;
+	struct bar *bar = NULL;
+	struct baz *baz = NULL;
+
+	take_any(typesafe_cb_cast3(struct any *,
+				   struct foo *, struct bar *, struct baz *,
+				   foo));
+	take_any(typesafe_cb_cast3(struct any *,
+				   struct foo *, struct bar *, struct baz *,
+				   bar));
+	take_any(typesafe_cb_cast3(struct any *,
+				   struct foo *, struct bar *, struct baz *,
+				   baz));
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb_def-const.c b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb_def-const.c
new file mode 100644
index 0000000000..01e090f1dc
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/compile_ok-typesafe_cb_def-const.c
@@ -0,0 +1,46 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+
+/* const args in callbacks should be OK. */
+
+static void _register_callback(void (*cb)(void *arg), void *arg)
+{
+}
+
+#define register_callback(cb, arg)				\
+	_register_callback(typesafe_cb(void, (cb), (arg)), (arg))
+
+static void _register_callback_pre(void (*cb)(int x, void *arg), void *arg)
+{
+}
+
+#define register_callback_pre(cb, arg)					\
+	_register_callback_pre(typesafe_cb_preargs(void, (cb), (arg), int), (arg))
+
+static void _register_callback_post(void (*cb)(void *arg, int x), void *arg)
+{
+}
+
+#define register_callback_post(cb, arg)					\
+	_register_callback_post(typesafe_cb_postargs(void, (cb), (arg), int), (arg))
+
+static void my_callback(const char *p)
+{
+}
+
+static void my_callback_pre(int x, /*const*/ char *p)
+{
+}
+
+static void my_callback_post(/*const*/ char *p, int x)
+{
+}
+
+int main(int argc, char *argv[])
+{
+	char p[] = "hello world";
+	register_callback(my_callback, p);
+	register_callback_pre(my_callback_pre, p);
+	register_callback_post(my_callback_post, p);
+	return 0;
+}
diff --git a/lib/ccan/typesafe_cb/test/run.c b/lib/ccan/typesafe_cb/test/run.c
new file mode 100644
index 0000000000..116e7d1946
--- /dev/null
+++ b/lib/ccan/typesafe_cb/test/run.c
@@ -0,0 +1,109 @@
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <string.h>
+#include <stdint.h>
+#include <ccan/tap/tap.h>
+
+static char dummy = 0;
+
+/* The example usage. */
+static void _set_some_value(void *val)
+{
+	ok1(val == &dummy);
+}
+
+#define set_some_value(expr)						\
+	_set_some_value(typesafe_cb_cast(void *, unsigned long, (expr)))
+
+static void _callback_onearg(void (*fn)(void *arg), void *arg)
+{
+	fn(arg);
+}
+
+static void _callback_preargs(void (*fn)(int a, int b, void *arg), void *arg)
+{
+	fn(1, 2, arg);
+}
+
+static void _callback_postargs(void (*fn)(void *arg, int a, int b), void *arg)
+{
+	fn(arg, 1, 2);
+}
+
+#define callback_onearg(cb, arg)					\
+	_callback_onearg(typesafe_cb(void, void *, (cb), (arg)), (arg))
+
+#define callback_preargs(cb, arg)					\
+	_callback_preargs(typesafe_cb_preargs(void, void *, (cb), (arg), int, int), (arg))
+
+#define callback_postargs(cb, arg)					\
+	_callback_postargs(typesafe_cb_postargs(void, void *, (cb), (arg), int, int), (arg))
+
+static void my_callback_onearg(char *p)
+{
+	ok1(strcmp(p, "hello world") == 0);
+}
+
+static void my_callback_preargs(int a, int b, char *p)
+{
+	ok1(a == 1);
+	ok1(b == 2);
+	ok1(strcmp(p, "hello world") == 0);
+}
+
+static void my_callback_postargs(char *p, int a, int b)
+{
+	ok1(a == 1);
+	ok1(b == 2);
+	ok1(strcmp(p, "hello world") == 0);
+}
+
+/* This is simply a compile test; we promised typesafe_cb_cast can be in a
+ * static initializer. */
+struct callback_onearg
+{
+	void (*fn)(void *arg);
+	const void *arg;
+};
+
+struct callback_onearg cb_onearg
+= { typesafe_cb(void, void *, my_callback_onearg, (char *)(intptr_t)"hello world"),
+    "hello world" };
+
+struct callback_preargs
+{
+	void (*fn)(int a, int b, void *arg);
+	const void *arg;
+};
+
+struct callback_preargs cb_preargs
+= { typesafe_cb_preargs(void, void *, my_callback_preargs,
+			(char *)(intptr_t)"hi", int, int), "hi" };
+
+struct callback_postargs
+{
+	void (*fn)(void *arg, int a, int b);
+	const void *arg;
+};
+
+struct callback_postargs cb_postargs
+= { typesafe_cb_postargs(void, void *, my_callback_postargs,
+			 (char *)(intptr_t)"hi", int, int), "hi" };
+
+int main(int argc, char *argv[])
+{
+	void *p = &dummy;
+	unsigned long l = (unsigned long)p;
+	char str[] = "hello world";
+
+	plan_tests(2 + 1 + 3 + 3);
+	set_some_value(p);
+	set_some_value(l);
+
+	callback_onearg(my_callback_onearg, str);
+
+	callback_preargs(my_callback_preargs, str);
+
+	callback_postargs(my_callback_postargs, str);
+
+	return exit_status();
+}
diff --git a/lib/ccan/typesafe_cb/typesafe_cb.h b/lib/ccan/typesafe_cb/typesafe_cb.h
new file mode 100644
index 0000000000..40cfa39798
--- /dev/null
+++ b/lib/ccan/typesafe_cb/typesafe_cb.h
@@ -0,0 +1,133 @@
+#ifndef CCAN_TYPESAFE_CB_H
+#define CCAN_TYPESAFE_CB_H
+#include "config.h"
+
+#if HAVE_TYPEOF && HAVE_BUILTIN_CHOOSE_EXPR && HAVE_BUILTIN_TYPES_COMPATIBLE_P
+/**
+ * typesafe_cb_cast - only cast an expression if it matches a given type
+ * @desttype: the type to cast to
+ * @oktype: the type we allow
+ * @expr: the expression to cast
+ *
+ * This macro is used to create functions which allow multiple types.
+ * The result of this macro is used somewhere that a @desttype type is
+ * expected: if @expr is exactly of type @oktype, then it will be
+ * cast to @desttype type, otherwise left alone.
+ *
+ * This macro can be used in static initializers.
+ *
+ * This is merely useful for warnings: if the compiler does not
+ * support the primitives required for typesafe_cb_cast(), it becomes an
+ * unconditional cast, and the @oktype argument is not used.  In
+ * particular, this means that @oktype can be a type which uses the
+ * "typeof": it will not be evaluated if typeof is not supported.
+ *
+ * Example:
+ *	// We can take either an unsigned long or a void *.
+ *	void _set_some_value(void *val);
+ *	#define set_some_value(e)			\
+ *		_set_some_value(typesafe_cb_cast(void *, unsigned long, (e)))
+ */
+#define typesafe_cb_cast(desttype, oktype, expr)			\
+	__builtin_choose_expr(						\
+		__builtin_types_compatible_p(__typeof__(0?(expr):(expr)), \
+					     oktype),			\
+		(desttype)(expr), (expr))
+#else
+#define typesafe_cb_cast(desttype, oktype, expr) ((desttype)(expr))
+#endif
+
+/**
+ * typesafe_cb_cast3 - only cast an expression if it matches given types
+ * @desttype: the type to cast to
+ * @ok1: the first type we allow
+ * @ok2: the second type we allow
+ * @ok3: the third type we allow
+ * @expr: the expression to cast
+ *
+ * This is a convenient wrapper for multiple typesafe_cb_cast() calls.
+ * You can chain them inside each other (ie. use typesafe_cb_cast()
+ * for expr) if you need more than 3 arguments.
+ *
+ * Example:
+ *	// We can take either a long, unsigned long, void * or a const void *.
+ *	void _set_some_value(void *val);
+ *	#define set_some_value(expr)					\
+ *		_set_some_value(typesafe_cb_cast3(void *,		\
+ *					    long, unsigned long, const void *,\
+ *					    (expr)))
+ */
+#define typesafe_cb_cast3(desttype, ok1, ok2, ok3, expr)		\
+	typesafe_cb_cast(desttype, ok1,					\
+			 typesafe_cb_cast(desttype, ok2,		\
+					  typesafe_cb_cast(desttype, ok3, \
+							   (expr))))
+
+/**
+ * typesafe_cb - cast a callback function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @atype: the (pointer) type which the callback function expects.
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * If a callback function takes a single argument, this macro does
+ * appropriate casts to a function which takes a single atype argument if the
+ * callback provided matches the @arg.
+ *
+ * It is assumed that @arg is of pointer type: usually @arg is passed
+ * or assigned to a void * elsewhere anyway.
+ *
+ * Example:
+ *	void _register_callback(void (*fn)(void *arg), void *arg);
+ *	#define register_callback(fn, arg) \
+ *		_register_callback(typesafe_cb(void, void *, (fn), (arg)), (arg))
+ */
+#define typesafe_cb(rtype, atype, fn, arg)			\
+	typesafe_cb_cast(rtype (*)(atype),			\
+			 rtype (*)(__typeof__(arg)),		\
+			 (fn))
+
+/**
+ * typesafe_cb_preargs - cast a callback function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @atype: the (pointer) type which the callback function expects.
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * This is a version of typesafe_cb() for callbacks that take other arguments
+ * before the @arg.
+ *
+ * Example:
+ *	void _register_callback(void (*fn)(int, void *arg), void *arg);
+ *	#define register_callback(fn, arg)				   \
+ *		_register_callback(typesafe_cb_preargs(void, void *, (fn), \
+ *				   (arg), int),				   \
+ *				   (arg))
+ */
+#define typesafe_cb_preargs(rtype, atype, fn, arg, ...)			\
+	typesafe_cb_cast(rtype (*)(__VA_ARGS__, atype),			\
+			 rtype (*)(__VA_ARGS__, __typeof__(arg)),	\
+			 (fn))
+
+/**
+ * typesafe_cb_postargs - cast a callback function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @atype: the (pointer) type which the callback function expects.
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * This is a version of typesafe_cb() for callbacks that take other arguments
+ * after the @arg.
+ *
+ * Example:
+ *	void _register_callback(void (*fn)(void *arg, int), void *arg);
+ *	#define register_callback(fn, arg) \
+ *		_register_callback(typesafe_cb_postargs(void, void *, (fn), \
+ *				   (arg), int),				    \
+ *				   (arg))
+ */
+#define typesafe_cb_postargs(rtype, atype, fn, arg, ...)		\
+	typesafe_cb_cast(rtype (*)(atype, __VA_ARGS__),			\
+			 rtype (*)(__typeof__(arg), __VA_ARGS__),	\
+			 (fn))
+#endif /* CCAN_TYPESAFE_CB_H */
diff --git a/lib/ccan/wscript b/lib/ccan/wscript
new file mode 100644
index 0000000000..0543a4de07
--- /dev/null
+++ b/lib/ccan/wscript
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+import Logs, sys
+
+def configure(conf):
+    # FIXME: if they don't have -Werror, these will all fail.  But they
+    # probably will anyway...
+    conf.CHECK_CODE('int __attribute__((cold)) func(int x) { return x; }',
+                    addmain=False, link=False, cflags="-Werror",
+                    define='HAVE_ATTRIBUTE_COLD')
+    conf.CHECK_CODE('int __attribute__((const)) func(int x) { return x; }',
+                    addmain=False, link=False, cflags="-Werror",
+                    define='HAVE_ATTRIBUTE_CONST')
+    conf.CHECK_CODE('void __attribute__((noreturn)) func(int x) { exit(x); }',
+                    addmain=False, link=False, cflags="-Werror",
+                    define='HAVE_ATTRIBUTE_NORETURN')
+    conf.CHECK_CODE('void __attribute__((format(__printf__, 1, 2))) func(const char *fmt, ...) { }',
+                    addmain=False, link=False, cflags="-Werror",
+                    define='HAVE_ATTRIBUTE_PRINTF')
+    conf.CHECK_CODE('int __attribute__((unused)) func(int x) { return x; }',
+                    addmain=False, link=False, cflags="-Werror",
+                    define='HAVE_ATTRIBUTE_UNUSED')
+    conf.CHECK_CODE('int __attribute__((used)) func(int x) { return x; }',
+                    addmain=False, link=False, cflags="-Werror",
+                    define='HAVE_ATTRIBUTE_USED')
+    # We try to use headers for a compile-time test.
+    conf.CHECK_CODE(code = """#ifdef __BYTE_ORDER
+                        #define B __BYTE_ORDER
+                        #elif defined(BYTE_ORDER)
+                        #define B BYTE_ORDER
+                        #endif
+
+                        #ifdef __LITTLE_ENDIAN
+                        #define LITTLE __LITTLE_ENDIAN
+                        #elif defined(LITTLE_ENDIAN)
+                        #define LITTLE LITTLE_ENDIAN
+                        #endif
+
+                        #if !defined(LITTLE) || !defined(B) || LITTLE != B
+                        #error Not little endian.
+                        #endif""",
+                           headers="endian.h sys/endian.h",
+                           define="HAVE_LITTLE_ENDIAN")
+    conf.CHECK_CODE(code = """#ifdef __BYTE_ORDER
+                        #define B __BYTE_ORDER
+                        #elif defined(BYTE_ORDER)
+                        #define B BYTE_ORDER
+                        #endif
+
+                        #ifdef __BIG_ENDIAN
+                        #define BIG __BIG_ENDIAN
+                        #elif defined(BIG_ENDIAN)
+                        #define BIG BIG_ENDIAN
+                        #endif
+
+                        #if !defined(BIG) || !defined(B) || BIG != B
+                        #error Not big endian.
+                        #endif""",
+                           headers="endian.h sys/endian.h",
+                           define="HAVE_BIG_ENDIAN")
+
+    if not conf.CONFIG_SET("HAVE_BIG_ENDIAN") and not conf.CONFIG_SET("HAVE_LITTLE_ENDIAN"):
+        # That didn't work!  Do runtime test.
+        conf.CHECK_CODE("""union { int i; char c[sizeof(int)]; } u;
+	  u.i = 0x01020304;
+	  return u.c[0] == 0x04 && u.c[1] == 0x03 && u.c[2] == 0x02 && u.c[3] == 0x01 ? 0 : 1;""",
+                        addmain=True, execute=True,
+                        define='HAVE_LITTLE_ENDIAN',
+                        msg="Checking for HAVE_LITTLE_ENDIAN - runtime")
+        conf.CHECK_CODE("""union { int i; char c[sizeof(int)]; } u;
+	  u.i = 0x01020304;
+	  return u.c[0] == 0x01 && u.c[1] == 0x02 && u.c[2] == 0x03 && u.c[3] == 0x04 ? 0 : 1;""",
+                        addmain=True, execute=True,
+                        define='HAVE_BIG_ENDIAN',
+                        msg="Checking for HAVE_BIG_ENDIAN - runtime")
+
+    # Extra sanity check.
+    if conf.CONFIG_SET("HAVE_BIG_ENDIAN") == conf.CONFIG_SET("HAVE_LITTLE_ENDIAN"):
+        Logs.error("Failed endian determination.  The PDP-11 is back?")
+        sys.exit(1)
+
+    conf.CHECK_CODE('return __builtin_clz(1) == (sizeof(int)*8 - 1) ? 0 : 1;',
+                    link=True,
+                    define='HAVE_BUILTIN_CLZ')
+    conf.CHECK_CODE('return __builtin_clzl(1) == (sizeof(long)*8 - 1) ? 0 : 1;',
+                    link=True,
+                    define='HAVE_BUILTIN_CLZL')
+    conf.CHECK_CODE('return __builtin_clzll(1) == (sizeof(long long)*8 - 1) ? 0 : 1;',
+                    link=True,
+                    define='HAVE_BUILTIN_CLZLL')
+    conf.CHECK_CODE('return __builtin_constant_p(1) ? 0 : 1;',
+                    link=True,
+                    define='HAVE_BUILTIN_CONSTANT_P')
+    conf.CHECK_CODE('return __builtin_expect(main != 0, 1) ? 0 : 1;',
+                    link=True,
+                    define='HAVE_BUILTIN_EXPECT')
+    conf.CHECK_CODE('return __builtin_popcountl(255L) == 8 ? 0 : 1;',
+                    link=True,
+                    define='HAVE_BUILTIN_POPCOUNTL')
+    conf.CHECK_CODE('return __builtin_types_compatible_p(char *, int) ? 1 : 0;',
+                    link=True,
+                    define='HAVE_BUILTIN_TYPES_COMPATIBLE_P')
+    conf.CHECK_CODE('int *foo = (int[]) { 1, 2, 3, 4 }; return foo[0] ? 0 : 1;',
+                    define='HAVE_COMPOUND_LITERALS')
+    conf.CHECK_CODE("""#include <ctype.h>
+	  int main(void) { return isblank(' ') ? 0 : 1; }""",
+                    link=True, addmain=False, add_headers=False,
+                    define='HAVE_ISBLANK')
+    conf.CHECK_CODE('int x = 1; __typeof__(x) i; i = x; return i == x ? 0 : 1;',
+                    link=True,
+                    define='HAVE_TYPEOF')
+    conf.CHECK_CODE('int __attribute__((warn_unused_result)) func(int x) { return x; }',
+                    addmain=False, link=False, cflags="-Werror",
+                    define='HAVE_WARN_UNUSED_RESULT')
+
+def build(bld):
+    bld.SAMBA_LIBRARY('ccan',
+                      vnum="0.1-init-1161-g661d41f",
+                      source=bld.path.ant_glob('*/*.c'),
+                      private_library=True)
diff --git a/lib/nss_wrapper/nss_wrapper.c b/lib/nss_wrapper/nss_wrapper.c
index cfa5a68712..8767fbfd89 100644
--- a/lib/nss_wrapper/nss_wrapper.c
+++ b/lib/nss_wrapper/nss_wrapper.c
@@ -36,7 +36,9 @@
 
 /* defining this gives us the posix getpwnam_r() calls on solaris
    Thanks to heimdal for this */
+#ifndef _POSIX_PTHREAD_SEMANTICS
 #define _POSIX_PTHREAD_SEMANTICS
+#endif
 
 #define NSS_WRAPPER_NOT_REPLACE
 #include "../replace/replace.h"
diff --git a/lib/replace/libreplace_network.m4 b/lib/replace/libreplace_network.m4
index f9bca40ce9..eadcc6bfc1 100644
--- a/lib/replace/libreplace_network.m4
+++ b/lib/replace/libreplace_network.m4
@@ -240,12 +240,25 @@ if test x"$libreplace_cv_HAVE_GETADDRINFO" = x"yes"; then
 		{
 			struct addrinfo hints = {0,};
 			struct addrinfo *ppres;
-			const char hostname[] = "0.0.0.0";
+			const char hostname1[] = "0.0.0.0";
+			const char hostname2[] = "127.0.0.1";
+			const char hostname3[] = "::";
 			hints.ai_socktype = SOCK_STREAM;
-			hints.ai_family = AF_INET;
+			hints.ai_family = AF_UNSPEC;
 			hints.ai_flags =
 				AI_NUMERICHOST|AI_PASSIVE|AI_ADDRCONFIG;
-			return getaddrinfo(hostname, NULL, &hints, &ppres) != 0 ? 1 : 0;
+			/* Test for broken flag combination on AIX. */
+			if (getaddrinfo(hostname1, NULL, &hints, &ppres) == EAI_BADFLAGS) {
+				/* This fails on an IPv6-only box, but not with
+				   the EAI_BADFLAGS error. */
+				return 1;
+			}
+			if (getaddrinfo(hostname2, NULL, &hints, &ppres) == 0) {
+				/* IPv4 lookup works - good enough. */
+				return 0;
+			}
+			/* Uh-oh, no IPv4. Are we IPv6-only ? */
+			return getaddrinfo(hostname3, NULL, &hints, &ppres) != 0 ? 1 : 0;
 		}],
 		libreplace_cv_HAVE_GETADDRINFO=yes,
 		libreplace_cv_HAVE_GETADDRINFO=no)
diff --git a/lib/replace/system/kerberos.h b/lib/replace/system/kerberos.h
index bb1f1b9a09..7762d4be46 100644
--- a/lib/replace/system/kerberos.h
+++ b/lib/replace/system/kerberos.h
@@ -37,5 +37,19 @@
 #include <com_err.h>
 #endif
 
+#ifdef HAVE_GSSAPI_GSSAPI_EXT_H
+#include <gssapi/gssapi_ext.h>
+#elif HAVE_GSSAPI_GSSAPI_H
+#include <gssapi/gssapi.h>
+#elif HAVE_GSSAPI_GSSAPI_GENERIC_H
+#include <gssapi/gssapi_generic.h>
+#elif HAVE_GSSAPI_H
+#include <gssapi.h>
+#endif
+
+#if HAVE_GSSAPI_GSSAPI_KRB5_H
+#include <gssapi/gssapi_krb5.h>
+#endif
+
 #endif
 #endif
diff --git a/lib/replace/system/network.h b/lib/replace/system/network.h
index f7c1bcfacb..a4e6a7e31a 100644
--- a/lib/replace/system/network.h
+++ b/lib/replace/system/network.h
@@ -331,8 +331,6 @@ typedef unsigned short int sa_family_t;
      * which might return 512 or bigger
      */
 #   define IOV_MAX 512
-#  else
-#   error IOV_MAX and UIO_MAXIOV undefined
 #  endif
 # endif
 #endif
diff --git a/lib/smbconf/smbconf.c b/lib/smbconf/smbconf.c
index 80fe9aac37..e0441ed985 100644
--- a/lib/smbconf/smbconf.c
+++ b/lib/smbconf/smbconf.c
@@ -27,12 +27,13 @@
  *
  **********************************************************************/
 
-static WERROR smbconf_global_check(struct smbconf_ctx *ctx)
+static sbcErr smbconf_global_check(struct smbconf_ctx *ctx)
 {
 	if (!smbconf_share_exists(ctx, GLOBAL_NAME)) {
 		return smbconf_create_share(ctx, GLOBAL_NAME);
 	}
-	return WERR_OK;
+
+	return SBC_ERR_OK;
 }
 
 
@@ -42,6 +43,41 @@ static WERROR smbconf_global_check(struct smbconf_ctx *ctx)
  *
  **********************************************************************/
 
+const char *sbcErrorString(sbcErr error)
+{
+	switch (error) {
+		case SBC_ERR_OK:
+			return "SBC_ERR_OK";
+		case SBC_ERR_NOT_IMPLEMENTED:
+			return "SBC_ERR_NOT_IMPLEMENTED";
+		case SBC_ERR_NOT_SUPPORTED:
+			return "SBC_ERR_NOT_SUPPORTED";
+		case SBC_ERR_UNKNOWN_FAILURE:
+			return "SBC_ERR_UNKNOWN_FAILURE";
+		case SBC_ERR_NOMEM:
+			return "SBC_ERR_NOMEM";
+		case SBC_ERR_INVALID_PARAM:
+			return "SBC_ERR_INVALID_PARAM";
+		case SBC_ERR_BADFILE:
+			return "SBC_ERR_BADFILE";
+		case SBC_ERR_NO_SUCH_SERVICE:
+			return "SBC_ERR_NO_SUCH_SERVICE";
+		case SBC_ERR_IO_FAILURE:
+			return "SBC_ERR_IO_FAILURE";
+		case SBC_ERR_CAN_NOT_COMPLETE:
+			return "SBC_ERR_CAN_NOT_COMPLETE";
+		case SBC_ERR_NO_MORE_ITEMS:
+			return "SBC_ERR_NO_MORE_ITEMS";
+		case SBC_ERR_FILE_EXISTS:
+			return "SBC_ERR_FILE_EXISTS";
+		case SBC_ERR_ACCESS_DENIED:
+			return "SBC_ERR_ACCESS_DENIED";
+	}
+
+	return "unknown sbcErr value";
+}
+
+
 /**
  * Tell whether the backend requires messaging to be set up
  * for the backend to work correctly.
@@ -91,7 +127,7 @@ bool smbconf_changed(struct smbconf_ctx *ctx, struct smbconf_csn *csn,
 /**
  * Drop the whole configuration (restarting empty).
  */
-WERROR smbconf_drop(struct smbconf_ctx *ctx)
+sbcErr smbconf_drop(struct smbconf_ctx *ctx)
 {
 	return ctx->ops->drop(ctx);
 }
@@ -105,12 +141,12 @@ WERROR smbconf_drop(struct smbconf_ctx *ctx)
  *  param_names  : list of lists of parameter names for each share
  *  param_values : list of lists of parameter values for each share
  */
-WERROR smbconf_get_config(struct smbconf_ctx *ctx,
+sbcErr smbconf_get_config(struct smbconf_ctx *ctx,
 			  TALLOC_CTX *mem_ctx,
 			  uint32_t *num_shares,
 			  struct smbconf_service ***services)
 {
-	WERROR werr = WERR_OK;
+	sbcErr err;
 	TALLOC_CTX *tmp_ctx = NULL;
 	uint32_t tmp_num_shares;
 	char **tmp_share_names;
@@ -118,36 +154,35 @@ WERROR smbconf_get_config(struct smbconf_ctx *ctx,
 	uint32_t count;
 
 	if ((num_shares == NULL) || (services == NULL)) {
-		werr = WERR_INVALID_PARAM;
+		err = SBC_ERR_INVALID_PARAM;
 		goto done;
 	}
 
 	tmp_ctx = talloc_stackframe();
 
-	werr = smbconf_get_share_names(ctx, tmp_ctx, &tmp_num_shares,
-				       &tmp_share_names);
-	if (!W_ERROR_IS_OK(werr)) {
+	err = smbconf_get_share_names(ctx, tmp_ctx, &tmp_num_shares,
+				      &tmp_share_names);
+	if (!SBC_ERROR_IS_OK(err)) {
 		goto done;
 	}
 
 	tmp_services = talloc_array(tmp_ctx, struct smbconf_service *,
 				    tmp_num_shares);
-
 	if (tmp_services == NULL) {
-		werr = WERR_NOMEM;
+		err = SBC_ERR_NOMEM;
 		goto done;
 	}
 
 	for (count = 0; count < tmp_num_shares; count++) {
-		werr = smbconf_get_share(ctx, tmp_services,
-					 tmp_share_names[count],
-					 &tmp_services[count]);
-		if (!W_ERROR_IS_OK(werr)) {
+		err = smbconf_get_share(ctx, tmp_services,
+					tmp_share_names[count],
+					&tmp_services[count]);
+		if (!SBC_ERROR_IS_OK(err)) {
 			goto done;
 		}
 	}
 
-	werr = WERR_OK;
+	err = SBC_ERR_OK;
 
 	*num_shares = tmp_num_shares;
 	if (tmp_num_shares > 0) {
@@ -158,13 +193,13 @@ WERROR smbconf_get_config(struct smbconf_ctx *ctx,
 
 done:
 	talloc_free(tmp_ctx);
-	return werr;
+	return err;
 }
 
 /**
  * get the list of share names defined in the configuration.
  */
-WERROR smbconf_get_share_names(struct smbconf_ctx *ctx,
+sbcErr smbconf_get_share_names(struct smbconf_ctx *ctx,
 			       TALLOC_CTX *mem_ctx,
 			       uint32_t *num_shares,
 			       char ***share_names)
@@ -185,11 +220,11 @@ bool smbconf_share_exists(struct smbconf_ctx *ctx,
 /**
  * Add a service if it does not already exist.
  */
-WERROR smbconf_create_share(struct smbconf_ctx *ctx,
+sbcErr smbconf_create_share(struct smbconf_ctx *ctx,
 			    const char *servicename)
 {
 	if ((servicename != NULL) && smbconf_share_exists(ctx, servicename)) {
-		return WERR_FILE_EXISTS;
+		return SBC_ERR_FILE_EXISTS;
 	}
 
 	return ctx->ops->create_share(ctx, servicename);
@@ -198,7 +233,7 @@ WERROR smbconf_create_share(struct smbconf_ctx *ctx,
 /**
  * get a definition of a share (service) from configuration.
  */
-WERROR smbconf_get_share(struct smbconf_ctx *ctx,
+sbcErr smbconf_get_share(struct smbconf_ctx *ctx,
 			 TALLOC_CTX *mem_ctx,
 			 const char *servicename,
 			 struct smbconf_service **service)
@@ -209,10 +244,10 @@ WERROR smbconf_get_share(struct smbconf_ctx *ctx,
 /**
  * delete a service from configuration
  */
-WERROR smbconf_delete_share(struct smbconf_ctx *ctx, const char *servicename)
+sbcErr smbconf_delete_share(struct smbconf_ctx *ctx, const char *servicename)
 {
 	if (!smbconf_share_exists(ctx, servicename)) {
-		return WERR_NO_SUCH_SERVICE;
+		return SBC_ERR_NO_SUCH_SERVICE;
 	}
 
 	return ctx->ops->delete_share(ctx, servicename);
@@ -221,7 +256,7 @@ WERROR smbconf_delete_share(struct smbconf_ctx *ctx, const char *servicename)
 /**
  * set a configuration parameter to the value provided.
  */
-WERROR smbconf_set_parameter(struct smbconf_ctx *ctx,
+sbcErr smbconf_set_parameter(struct smbconf_ctx *ctx,
 			     const char *service,
 			     const char *param,
 			     const char *valstr)
@@ -235,30 +270,31 @@ WERROR smbconf_set_parameter(struct smbconf_ctx *ctx,
  *
  * This also creates [global] when it does not exist.
  */
-WERROR smbconf_set_global_parameter(struct smbconf_ctx *ctx,
+sbcErr smbconf_set_global_parameter(struct smbconf_ctx *ctx,
 				    const char *param, const char *val)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_global_check(ctx);
-	if (W_ERROR_IS_OK(werr)) {
-		werr = smbconf_set_parameter(ctx, GLOBAL_NAME, param, val);
+	err = smbconf_global_check(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
+	err = smbconf_set_parameter(ctx, GLOBAL_NAME, param, val);
 
-	return werr;
+	return err;
 }
 
 /**
  * get the value of a configuration parameter as a string
  */
-WERROR smbconf_get_parameter(struct smbconf_ctx *ctx,
+sbcErr smbconf_get_parameter(struct smbconf_ctx *ctx,
 			     TALLOC_CTX *mem_ctx,
 			     const char *service,
 			     const char *param,
 			     char **valstr)
 {
 	if (valstr == NULL) {
-		return WERR_INVALID_PARAM;
+		return SBC_ERR_INVALID_PARAM;
 	}
 
 	return ctx->ops->get_parameter(ctx, mem_ctx, service, param, valstr);
@@ -269,26 +305,28 @@ WERROR smbconf_get_parameter(struct smbconf_ctx *ctx,
  *
  * Create [global] if it does not exist.
  */
-WERROR smbconf_get_global_parameter(struct smbconf_ctx *ctx,
+sbcErr smbconf_get_global_parameter(struct smbconf_ctx *ctx,
 				    TALLOC_CTX *mem_ctx,
 				    const char *param,
 				    char **valstr)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_global_check(ctx);
-	if (W_ERROR_IS_OK(werr)) {
-		werr = smbconf_get_parameter(ctx, mem_ctx, GLOBAL_NAME, param,
-					     valstr);
+	err = smbconf_global_check(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
 
-	return werr;
+	err = smbconf_get_parameter(ctx, mem_ctx, GLOBAL_NAME, param,
+				    valstr);
+
+	return err;
 }
 
 /**
  * delete a parameter from configuration
  */
-WERROR smbconf_delete_parameter(struct smbconf_ctx *ctx,
+sbcErr smbconf_delete_parameter(struct smbconf_ctx *ctx,
 				const char *service, const char *param)
 {
 	return ctx->ops->delete_parameter(ctx, service, param);
@@ -299,20 +337,21 @@ WERROR smbconf_delete_parameter(struct smbconf_ctx *ctx,
  *
  * Create [global] if it does not exist.
  */
-WERROR smbconf_delete_global_parameter(struct smbconf_ctx *ctx,
+sbcErr smbconf_delete_global_parameter(struct smbconf_ctx *ctx,
 				       const char *param)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_global_check(ctx);
-	if (W_ERROR_IS_OK(werr)) {
-		werr = smbconf_delete_parameter(ctx, GLOBAL_NAME, param);
+	err = smbconf_global_check(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
+	err = smbconf_delete_parameter(ctx, GLOBAL_NAME, param);
 
-	return werr;
+	return err;
 }
 
-WERROR smbconf_get_includes(struct smbconf_ctx *ctx,
+sbcErr smbconf_get_includes(struct smbconf_ctx *ctx,
 			    TALLOC_CTX *mem_ctx,
 			    const char *service,
 			    uint32_t *num_includes, char ***includes)
@@ -321,72 +360,75 @@ WERROR smbconf_get_includes(struct smbconf_ctx *ctx,
 				      includes);
 }
 
-WERROR smbconf_get_global_includes(struct smbconf_ctx *ctx,
+sbcErr smbconf_get_global_includes(struct smbconf_ctx *ctx,
 				   TALLOC_CTX *mem_ctx,
 				   uint32_t *num_includes, char ***includes)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_global_check(ctx);
-	if (W_ERROR_IS_OK(werr)) {
-		werr = smbconf_get_includes(ctx, mem_ctx, GLOBAL_NAME,
-					    num_includes, includes);
+	err = smbconf_global_check(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
+	err = smbconf_get_includes(ctx, mem_ctx, GLOBAL_NAME,
+				    num_includes, includes);
 
-	return werr;
+	return err;
 }
 
-WERROR smbconf_set_includes(struct smbconf_ctx *ctx,
+sbcErr smbconf_set_includes(struct smbconf_ctx *ctx,
 			    const char *service,
 			    uint32_t num_includes, const char **includes)
 {
 	return ctx->ops->set_includes(ctx, service, num_includes, includes);
 }
 
-WERROR smbconf_set_global_includes(struct smbconf_ctx *ctx,
+sbcErr smbconf_set_global_includes(struct smbconf_ctx *ctx,
 				   uint32_t num_includes,
 				   const char **includes)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_global_check(ctx);
-	if (W_ERROR_IS_OK(werr)) {
-		werr = smbconf_set_includes(ctx, GLOBAL_NAME,
-					    num_includes, includes);
+	err = smbconf_global_check(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
+	err = smbconf_set_includes(ctx, GLOBAL_NAME,
+				   num_includes, includes);
 
-	return werr;
+	return err;
 }
 
 
-WERROR smbconf_delete_includes(struct smbconf_ctx *ctx, const char *service)
+sbcErr smbconf_delete_includes(struct smbconf_ctx *ctx, const char *service)
 {
 	return ctx->ops->delete_includes(ctx, service);
 }
 
-WERROR smbconf_delete_global_includes(struct smbconf_ctx *ctx)
+sbcErr smbconf_delete_global_includes(struct smbconf_ctx *ctx)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_global_check(ctx);
-	if (W_ERROR_IS_OK(werr)) {
-		werr = smbconf_delete_includes(ctx, GLOBAL_NAME);
+	err = smbconf_global_check(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
+	err = smbconf_delete_includes(ctx, GLOBAL_NAME);
 
-	return werr;
+	return err;
 }
 
-WERROR smbconf_transaction_start(struct smbconf_ctx *ctx)
+sbcErr smbconf_transaction_start(struct smbconf_ctx *ctx)
 {
 	return ctx->ops->transaction_start(ctx);
 }
 
-WERROR smbconf_transaction_commit(struct smbconf_ctx *ctx)
+sbcErr smbconf_transaction_commit(struct smbconf_ctx *ctx)
 {
 	return ctx->ops->transaction_commit(ctx);
 }
 
-WERROR smbconf_transaction_cancel(struct smbconf_ctx *ctx)
+sbcErr smbconf_transaction_cancel(struct smbconf_ctx *ctx)
 {
 	return ctx->ops->transaction_cancel(ctx);
 }
diff --git a/lib/smbconf/smbconf.h b/lib/smbconf/smbconf.h
index 517302ac88..7f62b06af4 100644
--- a/lib/smbconf/smbconf.h
+++ b/lib/smbconf/smbconf.h
@@ -20,6 +20,39 @@
 #ifndef __LIBSMBCONF_H__
 #define __LIBSMBCONF_H__
 
+/**
+ * @defgroup libsmbconf The smbconf API
+ *
+ * libsmbconf is a library to read or, based on the backend, modify the Samba
+ * configuration.
+ *
+ * @{
+ */
+
+/**
+ * @brief Status codes returned from smbconf functions
+ */
+enum _sbcErrType {
+	SBC_ERR_OK = 0,          /**< Successful completion **/
+	SBC_ERR_NOT_IMPLEMENTED, /**< Function not implemented **/
+	SBC_ERR_NOT_SUPPORTED,   /**< Function not supported **/
+	SBC_ERR_UNKNOWN_FAILURE, /**< General failure **/
+	SBC_ERR_NOMEM,           /**< Memory allocation error **/
+	SBC_ERR_INVALID_PARAM,   /**< An Invalid parameter was supplied **/
+	SBC_ERR_BADFILE,         /**< A bad file was supplied **/
+	SBC_ERR_NO_SUCH_SERVICE, /**< There is no such service provided **/
+	SBC_ERR_IO_FAILURE,      /**< There was an IO error **/
+	SBC_ERR_CAN_NOT_COMPLETE,/**< Can not complete action **/
+	SBC_ERR_NO_MORE_ITEMS,   /**< No more items left **/
+	SBC_ERR_FILE_EXISTS,     /**< File already exists **/
+	SBC_ERR_ACCESS_DENIED,   /**< Access has been denied **/
+};
+
+typedef enum _sbcErrType sbcErr;
+
+#define SBC_ERROR_IS_OK(x) ((x) == SBC_ERR_OK)
+#define SBC_ERROR_EQUAL(x,y) ((x) == (y))
+
 struct smbconf_ctx;
 
 /* the change sequence number */
@@ -27,75 +60,428 @@ struct smbconf_csn {
 	uint64_t csn;
 };
 
+/** Information about a service */
 struct smbconf_service {
-	char *name;
-	uint32_t num_params;
-	char **param_names;
-	char **param_values;
+	char *name;          /**< The name of the share */
+	uint32_t num_params; /**< The number of parameters in this share */
+	char **param_names;  /**< The list of parameter names of this share */
+	char **param_values; /**< The list of parameter values of this share */
 };
 
 /*
- * the smbconf API functions
+ * The smbconf API functions
+ */
+
+/**
+ * @brief Translate an error value into a string
+ *
+ * @param[in] error     The error value to translate.
+ *
+ * @return a pointer to a static string
+ **/
+const char *sbcErrorString(sbcErr error);
+
+/**
+ * @brief Check if the backend requires messaging to be set up.
+ *
+ * Tell whether the backend requires messaging to be set up
+ * for the backend to work correctly.
+ *
+ * @param[in] ctx       The smbconf context to check.
+ *
+ * @return              True if needed, false if not.
  */
 bool smbconf_backend_requires_messaging(struct smbconf_ctx *ctx);
+
+/**
+ * @brief Tell whether the source is writeable.
+ *
+ * @param[in] ctx       The smbconf context to check.
+ *
+ * @return              True if it is writeable, false if not.
+ */
 bool smbconf_is_writeable(struct smbconf_ctx *ctx);
+
+/**
+ * @brief Close the configuration.
+ *
+ * @param[in] ctx       The smbconf context to close.
+ */
 void smbconf_shutdown(struct smbconf_ctx *ctx);
+
+/**
+ * @brief Detect changes in the configuration.
+ *
+ * Get the change sequence number of the given service/parameter. Service and
+ * parameter strings may be NULL.
+ *
+ * The given change sequence number (csn) struct is filled with the current
+ * csn. smbconf_changed() can also be used for initial retrieval of the csn.
+ *
+ * @param[in] ctx       The smbconf context to check for changes.
+ *
+ * @param[inout] csn    The smbconf csn to be filled.
+ *
+ * @param[in] service   The service name to check or NULL.
+ *
+ * @param[in] param     The param to check or NULL.
+ *
+ * @return              True if it has been changed, false if not.
+ */
 bool smbconf_changed(struct smbconf_ctx *ctx, struct smbconf_csn *csn,
 		     const char *service, const char *param);
-WERROR smbconf_drop(struct smbconf_ctx *ctx);
-WERROR smbconf_get_config(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Drop the whole configuration (restarting empty).
+ *
+ * @param[in] ctx       The smbconf context to drop the config.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_drop(struct smbconf_ctx *ctx);
+
+/**
+ * @brief Get the whole configuration as lists of strings with counts.
+ *
+ * @param[in] ctx       The smbconf context to get the lists from.
+ *
+ * @param[in] mem_ctx   The memory context to use.
+ *
+ * @param[out] num_shares A pointer to store the number of shares.
+ *
+ * @param[out] services  A pointer to store the services.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ *
+ * @see smbconf_service
+ */
+sbcErr smbconf_get_config(struct smbconf_ctx *ctx,
 			  TALLOC_CTX *mem_ctx,
 			  uint32_t *num_shares,
 			  struct smbconf_service ***services);
-WERROR smbconf_get_share_names(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Get the list of share names defined in the configuration.
+ *
+ * @param[in] ctx       The smbconf context to use.
+ *
+ * @param[in] mem_ctx   The memory context to use.
+ *
+ * @param[out] num_shares A pointer to store the number of shares.
+ *
+ * @param[out] share_names A pointer to store the share names.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_get_share_names(struct smbconf_ctx *ctx,
 			       TALLOC_CTX *mem_ctx,
 			       uint32_t *num_shares,
 			       char ***share_names);
+
+/**
+ * @brief Check if a share/service of a given name exists.
+ *
+ * @param[in] ctx       The smbconf context to use.
+ *
+ * @param[in] servicename The service name to check if it exists.
+ *
+ * @return              True if it exists, false if not.
+ */
 bool smbconf_share_exists(struct smbconf_ctx *ctx, const char *servicename);
-WERROR smbconf_create_share(struct smbconf_ctx *ctx, const char *servicename);
-WERROR smbconf_get_share(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Add a service if it does not already exist.
+ *
+ * @param[in] ctx       The smbconf context to use.
+ *
+ * @param[in] servicename The name of the service to add.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_create_share(struct smbconf_ctx *ctx, const char *servicename);
+
+/**
+ * @brief Get a definition of a share (service) from configuration.
+ *
+ * @param[in] ctx       The smbconf context to use.
+ *
+ * @param[in] mem_ctx   A memory context to allocate the result.
+ *
+ * @param[in] servicename The service name to get the information from.
+ *
+ * @param[out] service  A pointer to store the service information about the
+ *                      share.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ *
+ * @see smbconf_service
+ */
+sbcErr smbconf_get_share(struct smbconf_ctx *ctx,
 			 TALLOC_CTX *mem_ctx,
 			 const char *servicename,
 			 struct smbconf_service **service);
-WERROR smbconf_delete_share(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Delete a service from configuration.
+ *
+ * @param[in] ctx       The smbconf context to use.
+ *
+ * @param[in] servicename The service name to delete.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_delete_share(struct smbconf_ctx *ctx,
 			    const char *servicename);
-WERROR smbconf_set_parameter(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Set a configuration parameter to the value provided.
+ *
+ * @param[in] ctx       The smbconf context to use.
+ *
+ * @param[in] service   The service name to set the parameter.
+ *
+ * @param[in] param     The name of the parameter to set.
+ *
+ * @param[in] valstr    The value to set.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_set_parameter(struct smbconf_ctx *ctx,
 			     const char *service,
 			     const char *param,
 			     const char *valstr);
-WERROR smbconf_set_global_parameter(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Set a global configuration parameter to the value provided.
+ *
+ * This adds a parameter in the [global] service. It also creates [global] if it
+ * doesn't exist.
+ *
+ * @param[in] ctx       The smbconf context to use.
+ *
+ * @param[in] param     The name of the parameter to set.
+ *
+ * @param[in] val       The value to set.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_set_global_parameter(struct smbconf_ctx *ctx,
 				    const char *param, const char *val);
-WERROR smbconf_get_parameter(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Get the value of a configuration parameter as a string.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in]  mem_ctx  The memory context to allocate the string on.
+ *
+ * @param[in]  service  The name of the service where to find the parameter.
+ *
+ * @param[in]  param    The parameter to get.
+ *
+ * @param[out] valstr   A pointer to store the value as a string.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_get_parameter(struct smbconf_ctx *ctx,
 			     TALLOC_CTX *mem_ctx,
 			     const char *service,
 			     const char *param,
 			     char **valstr);
-WERROR smbconf_get_global_parameter(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Get the value of a global configuration parameter as a string.
+ *
+ * It also creates [global] if it doesn't exist.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in]  mem_ctx  The memory context to allocate the string on.
+ *
+ * @param[in]  param    The parameter to get.
+ *
+ * @param[out] valstr   A pointer to store the value as a string.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_get_global_parameter(struct smbconf_ctx *ctx,
 				    TALLOC_CTX *mem_ctx,
 				    const char *param,
 				    char **valstr);
-WERROR smbconf_delete_parameter(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Delete a parameter from the configuration.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in] service   The service where the parameter can be found.
+ *
+ * @param[in] param     The name of the parameter to delete.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_delete_parameter(struct smbconf_ctx *ctx,
 				const char *service, const char *param);
-WERROR smbconf_delete_global_parameter(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Delete a global parameter from the configuration.
+ *
+ * It also creates [global] if it doesn't exist.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in] param     The name of the parameter to delete.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_delete_global_parameter(struct smbconf_ctx *ctx,
 				       const char *param);
-WERROR smbconf_get_includes(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Get the list of names of included files.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in]  mem_ctx  The memory context to allocate the names.
+ *
+ * @param[in]  service  The service name to get the include files.
+ *
+ * @param[out] num_includes A pointer to store the number of included files.
+ *
+ * @param[out] includes A pointer to store the paths of the included files.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_get_includes(struct smbconf_ctx *ctx,
 			    TALLOC_CTX *mem_ctx,
 			    const char *service,
 			    uint32_t *num_includes, char ***includes);
-WERROR smbconf_get_global_includes(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Get the list of globally included files.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in]  mem_ctx  The memory context to allocate the names.
+ *
+ * @param[out] num_includes A pointer to store the number of included files.
+ *
+ * @param[out] includes A pointer to store the paths of the included files.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_get_global_includes(struct smbconf_ctx *ctx,
 				   TALLOC_CTX *mem_ctx,
 				   uint32_t *num_includes, char ***includes);
-WERROR smbconf_set_includes(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Set a list of config files to include on the given service.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in]  service  The service to add includes.
+ *
+ * @param[in]  num_includes The number of includes to set.
+ *
+ * @param[in]  includes A list of paths to include.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_set_includes(struct smbconf_ctx *ctx,
 			    const char *service,
 			    uint32_t num_includes, const char **includes);
-WERROR smbconf_set_global_includes(struct smbconf_ctx *ctx,
+
+/**
+ * @brief Set a list of config files to include globally.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in]  num_includes The number of includes to set.
+ *
+ * @param[in]  includes A list of paths to include.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_set_global_includes(struct smbconf_ctx *ctx,
 				   uint32_t num_includes,
 				   const char **includes);
-WERROR smbconf_delete_includes(struct smbconf_ctx *ctx, const char *service);
-WERROR smbconf_delete_global_includes(struct smbconf_ctx *ctx);
 
-WERROR smbconf_transaction_start(struct smbconf_ctx *ctx);
-WERROR smbconf_transaction_commit(struct smbconf_ctx *ctx);
-WERROR smbconf_transaction_cancel(struct smbconf_ctx *ctx);
+/**
+ * @brief Delete include parameter on the given service.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @param[in]  service  The name of the service to delete the includes from.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_delete_includes(struct smbconf_ctx *ctx, const char *service);
+
+/**
+ * @brief Delete include parameter from the global service.
+ *
+ * @param[in]  ctx      The smbconf context to use.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_delete_global_includes(struct smbconf_ctx *ctx);
+
+/**
+ * @brief Start a transaction on the configuration backend.
+ *
+ * This is to speed up writes to the registry based backend.
+ *
+ * @param[in] ctx       The smbconf context to start the transaction.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ */
+sbcErr smbconf_transaction_start(struct smbconf_ctx *ctx);
+
+/**
+ * @brief Commit a transaction on the configuration backend.
+ *
+ * This is to speed up writes to the registry based backend.
+ *
+ * @param[in] ctx       The smbconf context to commit the transaction.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ *
+ * @see smbconf_transaction_start()
+ */
+sbcErr smbconf_transaction_commit(struct smbconf_ctx *ctx);
+
+/**
+ * @brief Cancel a transaction on the configuration backend.
+ *
+ * @param[in] ctx       The smbconf context to cancel the transaction.
+ *
+ * @return              SBC_ERR_OK on success, a corresponding sbcErr if an
+ *                      error occurred.
+ *
+ * @see smbconf_transaction_start()
+ */
+sbcErr smbconf_transaction_cancel(struct smbconf_ctx *ctx);
+
+/* @} ******************************************************************/
 
 #endif /*  _LIBSMBCONF_H_  */
diff --git a/lib/smbconf/smbconf_private.h b/lib/smbconf/smbconf_private.h
index e6998ad639..e768c30b91 100644
--- a/lib/smbconf/smbconf_private.h
+++ b/lib/smbconf/smbconf_private.h
@@ -27,50 +27,50 @@
 #include "lib/smbconf/smbconf.h"
 
 struct smbconf_ops {
-	WERROR (*init)(struct smbconf_ctx *ctx, const char *path);
+	sbcErr (*init)(struct smbconf_ctx *ctx, const char *path);
 	int (*shutdown)(struct smbconf_ctx *ctx);
 	bool (*requires_messaging)(struct smbconf_ctx *ctx);
 	bool (*is_writeable)(struct smbconf_ctx *ctx);
-	WERROR (*open_conf)(struct smbconf_ctx *ctx);
+	sbcErr (*open_conf)(struct smbconf_ctx *ctx);
 	int (*close_conf)(struct smbconf_ctx *ctx);
 	void (*get_csn)(struct smbconf_ctx *ctx, struct smbconf_csn *csn,
 			const char *service, const char *param);
-	WERROR (*drop)(struct smbconf_ctx *ctx);
-	WERROR (*get_share_names)(struct smbconf_ctx *ctx,
+	sbcErr (*drop)(struct smbconf_ctx *ctx);
+	sbcErr (*get_share_names)(struct smbconf_ctx *ctx,
 				  TALLOC_CTX *mem_ctx,
 				  uint32_t *num_shares,
 				  char ***share_names);
 	bool (*share_exists)(struct smbconf_ctx *ctx, const char *service);
-	WERROR (*create_share)(struct smbconf_ctx *ctx, const char *service);
-	WERROR (*get_share)(struct smbconf_ctx *ctx,
+	sbcErr (*create_share)(struct smbconf_ctx *ctx, const char *service);
+	sbcErr (*get_share)(struct smbconf_ctx *ctx,
 			    TALLOC_CTX *mem_ctx,
 			    const char *servicename,
 			    struct smbconf_service **service);
-	WERROR (*delete_share)(struct smbconf_ctx *ctx,
+	sbcErr (*delete_share)(struct smbconf_ctx *ctx,
 				    const char *servicename);
-	WERROR (*set_parameter)(struct smbconf_ctx *ctx,
+	sbcErr (*set_parameter)(struct smbconf_ctx *ctx,
 				const char *service,
 				const char *param,
 				const char *valstr);
-	WERROR (*get_parameter)(struct smbconf_ctx *ctx,
+	sbcErr (*get_parameter)(struct smbconf_ctx *ctx,
 				TALLOC_CTX *mem_ctx,
 				const char *service,
 				const char *param,
 				char **valstr);
-	WERROR (*delete_parameter)(struct smbconf_ctx *ctx,
+	sbcErr (*delete_parameter)(struct smbconf_ctx *ctx,
 				   const char *service, const char *param);
-	WERROR (*get_includes)(struct smbconf_ctx *ctx,
+	sbcErr (*get_includes)(struct smbconf_ctx *ctx,
 			       TALLOC_CTX *mem_ctx,
 			       const char *service,
 			       uint32_t *num_includes, char ***includes);
-	WERROR (*set_includes)(struct smbconf_ctx *ctx,
+	sbcErr (*set_includes)(struct smbconf_ctx *ctx,
 			       const char *service,
 			       uint32_t num_includes, const char **includes);
-	WERROR (*delete_includes)(struct smbconf_ctx *ctx,
+	sbcErr (*delete_includes)(struct smbconf_ctx *ctx,
 				  const char *service);
-	WERROR (*transaction_start)(struct smbconf_ctx *ctx);
-	WERROR (*transaction_commit)(struct smbconf_ctx *ctx);
-	WERROR (*transaction_cancel)(struct smbconf_ctx *ctx);
+	sbcErr (*transaction_start)(struct smbconf_ctx *ctx);
+	sbcErr (*transaction_commit)(struct smbconf_ctx *ctx);
+	sbcErr (*transaction_cancel)(struct smbconf_ctx *ctx);
 };
 
 struct smbconf_ctx {
@@ -79,10 +79,10 @@ struct smbconf_ctx {
 	void *data; /* private data for use in backends */
 };
 
-WERROR smbconf_init_internal(TALLOC_CTX *mem_ctx, struct smbconf_ctx **conf_ctx,
+sbcErr smbconf_init_internal(TALLOC_CTX *mem_ctx, struct smbconf_ctx **conf_ctx,
 			     const char *path, struct smbconf_ops *ops);
 
-WERROR smbconf_add_string_to_array(TALLOC_CTX *mem_ctx,
+sbcErr smbconf_add_string_to_array(TALLOC_CTX *mem_ctx,
 				   char ***array,
 				   uint32_t count,
 				   const char *string);
diff --git a/lib/smbconf/smbconf_txt.c b/lib/smbconf/smbconf_txt.c
index 2114841b81..5c4bd27b9d 100644
--- a/lib/smbconf/smbconf_txt.c
+++ b/lib/smbconf/smbconf_txt.c
@@ -60,7 +60,7 @@ static struct txt_private_data *pd(struct smbconf_ctx *ctx)
 
 static bool smbconf_txt_do_section(const char *section, void *private_data)
 {
-	WERROR werr;
+	sbcErr err;
 	uint32_t idx;
 	struct txt_private_data *tpd = (struct txt_private_data *)private_data;
 	struct txt_cache *cache = tpd->cache;
@@ -72,9 +72,9 @@ static bool smbconf_txt_do_section(const char *section, void *private_data)
 		return true;
 	}
 
-	werr = smbconf_add_string_to_array(cache, &(cache->share_names),
-					   cache->num_shares, section);
-	if (!W_ERROR_IS_OK(werr)) {
+	err = smbconf_add_string_to_array(cache, &(cache->share_names),
+					  cache->num_shares, section);
+	if (!SBC_ERROR_IS_OK(err)) {
 		return false;
 	}
 	cache->current_share = cache->num_shares;
@@ -114,7 +114,7 @@ static bool smbconf_txt_do_parameter(const char *param_name,
 				     const char *param_value,
 				     void *private_data)
 {
-	WERROR werr;
+	sbcErr err;
 	char **param_names, **param_values;
 	uint32_t num_params;
 	uint32_t idx;
@@ -146,17 +146,17 @@ static bool smbconf_txt_do_parameter(const char *param_name,
 		}
 		return true;
 	}
-	werr = smbconf_add_string_to_array(cache,
+	err = smbconf_add_string_to_array(cache,
 				&(cache->param_names[cache->current_share]),
 				num_params, param_name);
-	if (!W_ERROR_IS_OK(werr)) {
+	if (!SBC_ERROR_IS_OK(err)) {
 		return false;
 	}
-	werr = smbconf_add_string_to_array(cache,
+	err = smbconf_add_string_to_array(cache,
 				&(cache->param_values[cache->current_share]),
 				num_params, param_value);
 	cache->num_params[cache->current_share]++;
-	return W_ERROR_IS_OK(werr);
+	return SBC_ERROR_IS_OK(err);
 }
 
 static void smbconf_txt_flush_cache(struct smbconf_ctx *ctx)
@@ -165,7 +165,7 @@ static void smbconf_txt_flush_cache(struct smbconf_ctx *ctx)
 	pd(ctx)->cache = NULL;
 }
 
-static WERROR smbconf_txt_init_cache(struct smbconf_ctx *ctx)
+static sbcErr smbconf_txt_init_cache(struct smbconf_ctx *ctx)
 {
 	if (pd(ctx)->cache != NULL) {
 		smbconf_txt_flush_cache(ctx);
@@ -174,40 +174,40 @@ static WERROR smbconf_txt_init_cache(struct smbconf_ctx *ctx)
 	pd(ctx)->cache = talloc_zero(pd(ctx), struct txt_cache);
 
 	if (pd(ctx)->cache == NULL) {
-		return WERR_NOMEM;
+		return SBC_ERR_NOMEM;
 	}
 
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
-static WERROR smbconf_txt_load_file(struct smbconf_ctx *ctx)
+static sbcErr smbconf_txt_load_file(struct smbconf_ctx *ctx)
 {
-	WERROR werr;
+	sbcErr err;
 	uint64_t new_csn;
 
 	if (!file_exist(ctx->path)) {
-		return WERR_BADFILE;
+		return SBC_ERR_BADFILE;
 	}
 
 	new_csn = (uint64_t)file_modtime(ctx->path);
 	if (new_csn == pd(ctx)->csn) {
-		return WERR_OK;
+		return SBC_ERR_OK;
 	}
 
-	werr = smbconf_txt_init_cache(ctx);
-	if (!W_ERROR_IS_OK(werr)) {
-		return werr;
+	err = smbconf_txt_init_cache(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
 
 	if (!pm_process(ctx->path, smbconf_txt_do_section,
 			smbconf_txt_do_parameter, pd(ctx)))
 	{
-		return WERR_CAN_NOT_COMPLETE;
+		return SBC_ERR_CAN_NOT_COMPLETE;
 	}
 
 	pd(ctx)->csn = new_csn;
 
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
 
@@ -220,24 +220,24 @@ static WERROR smbconf_txt_load_file(struct smbconf_ctx *ctx)
 /**
  * initialize the text based smbconf backend
  */
-static WERROR smbconf_txt_init(struct smbconf_ctx *ctx, const char *path)
+static sbcErr smbconf_txt_init(struct smbconf_ctx *ctx, const char *path)
 {
 	if (path == NULL) {
-		return WERR_BADFILE;
+		return SBC_ERR_BADFILE;
 	}
 	ctx->path = talloc_strdup(ctx, path);
 	if (ctx->path == NULL) {
-		return WERR_NOMEM;
+		return SBC_ERR_NOMEM;
 	}
 
 	ctx->data = talloc_zero(ctx, struct txt_private_data);
 	if (ctx->data == NULL) {
-		return WERR_NOMEM;
+		return SBC_ERR_NOMEM;
 	}
 
 	pd(ctx)->verbatim = true;
 
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
 static int smbconf_txt_shutdown(struct smbconf_ctx *ctx)
@@ -256,7 +256,7 @@ static bool smbconf_txt_is_writeable(struct smbconf_ctx *ctx)
 	return false;
 }
 
-static WERROR smbconf_txt_open(struct smbconf_ctx *ctx)
+static sbcErr smbconf_txt_open(struct smbconf_ctx *ctx)
 {
 	return smbconf_txt_load_file(ctx);
 }
@@ -285,15 +285,15 @@ static void smbconf_txt_get_csn(struct smbconf_ctx *ctx,
 /**
  * Drop the whole configuration (restarting empty)
  */
-static WERROR smbconf_txt_drop(struct smbconf_ctx *ctx)
+static sbcErr smbconf_txt_drop(struct smbconf_ctx *ctx)
 {
-	return WERR_NOT_SUPPORTED;
+	return SBC_ERR_NOT_SUPPORTED;
 }
 
 /**
  * get the list of share names defined in the configuration.
  */
-static WERROR smbconf_txt_get_share_names(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_get_share_names(struct smbconf_ctx *ctx,
 					  TALLOC_CTX *mem_ctx,
 					  uint32_t *num_shares,
 					  char ***share_names)
@@ -301,17 +301,16 @@ static WERROR smbconf_txt_get_share_names(struct smbconf_ctx *ctx,
 	uint32_t count;
 	uint32_t added_count = 0;
 	TALLOC_CTX *tmp_ctx = NULL;
-	WERROR werr = WERR_OK;
+	sbcErr err = SBC_ERR_OK;
 	char **tmp_share_names = NULL;
 
 	if ((num_shares == NULL) || (share_names == NULL)) {
-		werr = WERR_INVALID_PARAM;
-		goto done;
+		return SBC_ERR_INVALID_PARAM;
 	}
 
-	werr = smbconf_txt_load_file(ctx);
-	if (!W_ERROR_IS_OK(werr)) {
-		return werr;
+	err = smbconf_txt_load_file(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
 
 	tmp_ctx = talloc_stackframe();
@@ -320,18 +319,18 @@ static WERROR smbconf_txt_get_share_names(struct smbconf_ctx *ctx,
 	 * possibly after NULL section */
 
 	if (smbconf_share_exists(ctx, NULL)) {
-		werr = smbconf_add_string_to_array(tmp_ctx, &tmp_share_names,
-						   0, NULL);
-		if (!W_ERROR_IS_OK(werr)) {
+		err = smbconf_add_string_to_array(tmp_ctx, &tmp_share_names,
+						  0, NULL);
+		if (!SBC_ERROR_IS_OK(err)) {
 			goto done;
 		}
 		added_count++;
 	}
 
 	if (smbconf_share_exists(ctx, GLOBAL_NAME)) {
-		werr = smbconf_add_string_to_array(tmp_ctx, &tmp_share_names,
+		err = smbconf_add_string_to_array(tmp_ctx, &tmp_share_names,
 						   added_count, GLOBAL_NAME);
-		if (!W_ERROR_IS_OK(werr)) {
+		if (!SBC_ERROR_IS_OK(err)) {
 			goto done;
 		}
 		added_count++;
@@ -344,10 +343,10 @@ static WERROR smbconf_txt_get_share_names(struct smbconf_ctx *ctx,
 			continue;
 		}
 
-		werr = smbconf_add_string_to_array(tmp_ctx, &tmp_share_names,
+		err = smbconf_add_string_to_array(tmp_ctx, &tmp_share_names,
 					added_count,
 					pd(ctx)->cache->share_names[count]);
-		if (!W_ERROR_IS_OK(werr)) {
+		if (!SBC_ERROR_IS_OK(err)) {
 			goto done;
 		}
 		added_count++;
@@ -362,7 +361,7 @@ static WERROR smbconf_txt_get_share_names(struct smbconf_ctx *ctx,
 
 done:
 	talloc_free(tmp_ctx);
-	return werr;
+	return err;
 }
 
 /**
@@ -371,10 +370,10 @@ done:
 static bool smbconf_txt_share_exists(struct smbconf_ctx *ctx,
 				     const char *servicename)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_txt_load_file(ctx);
-	if (!W_ERROR_IS_OK(werr)) {
+	err = smbconf_txt_load_file(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
 		return false;
 	}
 
@@ -386,29 +385,29 @@ static bool smbconf_txt_share_exists(struct smbconf_ctx *ctx,
 /**
  * Add a service if it does not already exist
  */
-static WERROR smbconf_txt_create_share(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_create_share(struct smbconf_ctx *ctx,
 				       const char *servicename)
 {
-	return WERR_NOT_SUPPORTED;
+	return SBC_ERR_NOT_SUPPORTED;
 }
 
 /**
  * get a definition of a share (service) from configuration.
  */
-static WERROR smbconf_txt_get_share(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_get_share(struct smbconf_ctx *ctx,
 				    TALLOC_CTX *mem_ctx,
 				    const char *servicename,
 				    struct smbconf_service **service)
 {
-	WERROR werr;
+	sbcErr err;
 	uint32_t sidx, count;
 	bool found;
 	TALLOC_CTX *tmp_ctx = NULL;
 	struct smbconf_service *tmp_service = NULL;
 
-	werr = smbconf_txt_load_file(ctx);
-	if (!W_ERROR_IS_OK(werr)) {
-		return werr;
+	err = smbconf_txt_load_file(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
 
 	found = smbconf_find_in_array(servicename,
@@ -416,38 +415,38 @@ static WERROR smbconf_txt_get_share(struct smbconf_ctx *ctx,
 				      pd(ctx)->cache->num_shares,
 				      &sidx);
 	if (!found) {
-		return WERR_NO_SUCH_SERVICE;
+		return SBC_ERR_NO_SUCH_SERVICE;
 	}
 
 	tmp_ctx = talloc_stackframe();
 
 	tmp_service = talloc_zero(tmp_ctx, struct smbconf_service);
 	if (tmp_service == NULL) {
-		werr = WERR_NOMEM;
+		err = SBC_ERR_NOMEM;
 		goto done;
 	}
 
 	if (servicename != NULL) {
 		tmp_service->name = talloc_strdup(tmp_service, servicename);
 		if (tmp_service->name == NULL) {
-			werr = WERR_NOMEM;
+			err = SBC_ERR_NOMEM;
 			goto done;
 		}
 	}
 
 	for (count = 0; count < pd(ctx)->cache->num_params[sidx]; count++) {
-		werr = smbconf_add_string_to_array(tmp_service,
+		err = smbconf_add_string_to_array(tmp_service,
 				&(tmp_service->param_names),
 				count,
 				pd(ctx)->cache->param_names[sidx][count]);
-		if (!W_ERROR_IS_OK(werr)) {
+		if (!SBC_ERROR_IS_OK(err)) {
 			goto done;
 		}
-		werr = smbconf_add_string_to_array(tmp_service,
+		err = smbconf_add_string_to_array(tmp_service,
 				&(tmp_service->param_values),
 				count,
 				pd(ctx)->cache->param_values[sidx][count]);
-		if (!W_ERROR_IS_OK(werr)) {
+		if (!SBC_ERROR_IS_OK(err)) {
 			goto done;
 		}
 	}
@@ -457,45 +456,45 @@ static WERROR smbconf_txt_get_share(struct smbconf_ctx *ctx,
 
 done:
 	talloc_free(tmp_ctx);
-	return werr;
+	return err;
 }
 
 /**
  * delete a service from configuration
  */
-static WERROR smbconf_txt_delete_share(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_delete_share(struct smbconf_ctx *ctx,
 				       const char *servicename)
 {
-	return WERR_NOT_SUPPORTED;
+	return SBC_ERR_NOT_SUPPORTED;
 }
 
 /**
  * set a configuration parameter to the value provided.
  */
-static WERROR smbconf_txt_set_parameter(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_set_parameter(struct smbconf_ctx *ctx,
 					const char *service,
 					const char *param,
 					const char *valstr)
 {
-	return WERR_NOT_SUPPORTED;
+	return SBC_ERR_NOT_SUPPORTED;
 }
 
 /**
  * get the value of a configuration parameter as a string
  */
-static WERROR smbconf_txt_get_parameter(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_get_parameter(struct smbconf_ctx *ctx,
 					TALLOC_CTX *mem_ctx,
 					const char *service,
 					const char *param,
 					char **valstr)
 {
-	WERROR werr;
+	sbcErr err;
 	bool found;
 	uint32_t share_index, param_index;
 
-	werr = smbconf_txt_load_file(ctx);
-	if (!W_ERROR_IS_OK(werr)) {
-		return werr;
+	err = smbconf_txt_load_file(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
 
 	found = smbconf_find_in_array(service,
@@ -503,7 +502,7 @@ static WERROR smbconf_txt_get_parameter(struct smbconf_ctx *ctx,
 				      pd(ctx)->cache->num_shares,
 				      &share_index);
 	if (!found) {
-		return WERR_NO_SUCH_SERVICE;
+		return SBC_ERR_NO_SUCH_SERVICE;
 	}
 
 	found = smbconf_reverse_find_in_array(param,
@@ -511,45 +510,45 @@ static WERROR smbconf_txt_get_parameter(struct smbconf_ctx *ctx,
 				pd(ctx)->cache->num_params[share_index],
 				&param_index);
 	if (!found) {
-		return WERR_INVALID_PARAM;
+		return SBC_ERR_INVALID_PARAM;
 	}
 
 	*valstr = talloc_strdup(mem_ctx,
 			pd(ctx)->cache->param_values[share_index][param_index]);
 
 	if (*valstr == NULL) {
-		return WERR_NOMEM;
+		return SBC_ERR_NOMEM;
 	}
 
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
 /**
  * delete a parameter from configuration
  */
-static WERROR smbconf_txt_delete_parameter(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_delete_parameter(struct smbconf_ctx *ctx,
 					   const char *service,
 					   const char *param)
 {
-	return WERR_NOT_SUPPORTED;
+	return SBC_ERR_NOT_SUPPORTED;
 }
 
-static WERROR smbconf_txt_get_includes(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_get_includes(struct smbconf_ctx *ctx,
 				       TALLOC_CTX *mem_ctx,
 				       const char *service,
 				       uint32_t *num_includes,
 				       char ***includes)
 {
-	WERROR werr;
+	sbcErr err;
 	bool found;
 	uint32_t sidx, count;
 	TALLOC_CTX *tmp_ctx = NULL;
 	uint32_t tmp_num_includes = 0;
 	char **tmp_includes = NULL;
 
-	werr = smbconf_txt_load_file(ctx);
-	if (!W_ERROR_IS_OK(werr)) {
-		return werr;
+	err = smbconf_txt_load_file(ctx);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
 
 	found = smbconf_find_in_array(service,
@@ -557,7 +556,7 @@ static WERROR smbconf_txt_get_includes(struct smbconf_ctx *ctx,
 				      pd(ctx)->cache->num_shares,
 				      &sidx);
 	if (!found) {
-		return WERR_NO_SUCH_SERVICE;
+		return SBC_ERR_NO_SUCH_SERVICE;
 	}
 
 	tmp_ctx = talloc_stackframe();
@@ -566,11 +565,11 @@ static WERROR smbconf_txt_get_includes(struct smbconf_ctx *ctx,
 		if (strequal(pd(ctx)->cache->param_names[sidx][count],
 			     "include"))
 		{
-			werr = smbconf_add_string_to_array(tmp_ctx,
+			err = smbconf_add_string_to_array(tmp_ctx,
 				&tmp_includes,
 				tmp_num_includes,
 				pd(ctx)->cache->param_values[sidx][count]);
-			if (!W_ERROR_IS_OK(werr)) {
+			if (!SBC_ERROR_IS_OK(err)) {
 				goto done;
 			}
 			tmp_num_includes++;
@@ -581,47 +580,47 @@ static WERROR smbconf_txt_get_includes(struct smbconf_ctx *ctx,
 	if (*num_includes > 0) {
 		*includes = talloc_move(mem_ctx, &tmp_includes);
 		if (*includes == NULL) {
-			werr = WERR_NOMEM;
+			err = SBC_ERR_NOMEM;
 			goto done;
 		}
 	} else {
 		*includes = NULL;
 	}
 
-	werr = WERR_OK;
+	err = SBC_ERR_OK;
 
 done:
 	talloc_free(tmp_ctx);
-	return werr;
+	return err;
 }
 
-static WERROR smbconf_txt_set_includes(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_set_includes(struct smbconf_ctx *ctx,
 				       const char *service,
 				       uint32_t num_includes,
 				       const char **includes)
 {
-	return WERR_NOT_SUPPORTED;
+	return SBC_ERR_NOT_SUPPORTED;
 }
 
-static WERROR smbconf_txt_delete_includes(struct smbconf_ctx *ctx,
+static sbcErr smbconf_txt_delete_includes(struct smbconf_ctx *ctx,
 					  const char *service)
 {
-	return WERR_NOT_SUPPORTED;
+	return SBC_ERR_NOT_SUPPORTED;
 }
 
-static WERROR smbconf_txt_transaction_start(struct smbconf_ctx *ctx)
+static sbcErr smbconf_txt_transaction_start(struct smbconf_ctx *ctx)
 {
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
-static WERROR smbconf_txt_transaction_commit(struct smbconf_ctx *ctx)
+static sbcErr smbconf_txt_transaction_commit(struct smbconf_ctx *ctx)
 {
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
-static WERROR smbconf_txt_transaction_cancel(struct smbconf_ctx *ctx)
+static sbcErr smbconf_txt_transaction_cancel(struct smbconf_ctx *ctx)
 {
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
 static struct smbconf_ops smbconf_ops_txt = {
@@ -654,15 +653,15 @@ static struct smbconf_ops smbconf_ops_txt = {
  * initialize the smbconf text backend
  * the only function that is exported from this module
  */
-WERROR smbconf_init_txt(TALLOC_CTX *mem_ctx,
+sbcErr smbconf_init_txt(TALLOC_CTX *mem_ctx,
 			struct smbconf_ctx **conf_ctx,
 			const char *path)
 {
-	WERROR werr;
+	sbcErr err;
 
-	werr = smbconf_init_internal(mem_ctx, conf_ctx, path, &smbconf_ops_txt);
-	if (!W_ERROR_IS_OK(werr)) {
-		return werr;
+	err = smbconf_init_internal(mem_ctx, conf_ctx, path, &smbconf_ops_txt);
+	if (!SBC_ERROR_IS_OK(err)) {
+		return err;
 	}
 
 	return smbconf_txt_load_file(*conf_ctx);
diff --git a/lib/smbconf/smbconf_txt.h b/lib/smbconf/smbconf_txt.h
index 688bbc9d48..72d6207521 100644
--- a/lib/smbconf/smbconf_txt.h
+++ b/lib/smbconf/smbconf_txt.h
@@ -26,7 +26,7 @@ struct smbconf_ctx;
  * initialization functions for the text/file backend modules
  */
 
-WERROR smbconf_init_txt(TALLOC_CTX *mem_ctx,
+sbcErr smbconf_init_txt(TALLOC_CTX *mem_ctx,
 			struct smbconf_ctx **conf_ctx,
 			const char *path);
 
diff --git a/lib/smbconf/smbconf_util.c b/lib/smbconf/smbconf_util.c
index b309a3454b..86a95988f1 100644
--- a/lib/smbconf/smbconf_util.c
+++ b/lib/smbconf/smbconf_util.c
@@ -39,43 +39,43 @@ static int smbconf_destroy_ctx(struct smbconf_ctx *ctx)
  * After the work with the configuration is completed, smbconf_shutdown()
  * should be called.
  */
-WERROR smbconf_init_internal(TALLOC_CTX *mem_ctx, struct smbconf_ctx **conf_ctx,
+sbcErr smbconf_init_internal(TALLOC_CTX *mem_ctx, struct smbconf_ctx **conf_ctx,
 			     const char *path, struct smbconf_ops *ops)
 {
-	WERROR werr = WERR_OK;
+	sbcErr err = SBC_ERR_OK;
 	struct smbconf_ctx *ctx;
 
 	if (conf_ctx == NULL) {
-		return WERR_INVALID_PARAM;
+		return SBC_ERR_INVALID_PARAM;
 	}
 
 	ctx = talloc_zero(mem_ctx, struct smbconf_ctx);
 	if (ctx == NULL) {
-		return WERR_NOMEM;
+		return SBC_ERR_NOMEM;
 	}
 
 	ctx->ops = ops;
 
-	werr = ctx->ops->init(ctx, path);
-	if (!W_ERROR_IS_OK(werr)) {
+	err = ctx->ops->init(ctx, path);
+	if (!SBC_ERROR_IS_OK(err)) {
 		goto fail;
 	}
 
 	talloc_set_destructor(ctx, smbconf_destroy_ctx);
 
 	*conf_ctx = ctx;
-	return werr;
+	return err;
 
 fail:
 	talloc_free(ctx);
-	return werr;
+	return err;
 }
 
 
 /**
  * add a string to a talloced array of strings.
  */
-WERROR smbconf_add_string_to_array(TALLOC_CTX *mem_ctx,
+sbcErr smbconf_add_string_to_array(TALLOC_CTX *mem_ctx,
 				   char ***array,
 				   uint32_t count,
 				   const char *string)
@@ -83,12 +83,12 @@ WERROR smbconf_add_string_to_array(TALLOC_CTX *mem_ctx,
 	char **new_array = NULL;
 
 	if (array == NULL) {
-		return WERR_INVALID_PARAM;
+		return SBC_ERR_INVALID_PARAM;
 	}
 
 	new_array = talloc_realloc(mem_ctx, *array, char *, count + 1);
 	if (new_array == NULL) {
-		return WERR_NOMEM;
+		return SBC_ERR_NOMEM;
 	}
 
 	if (string == NULL) {
@@ -97,13 +97,13 @@ WERROR smbconf_add_string_to_array(TALLOC_CTX *mem_ctx,
 		new_array[count] = talloc_strdup(new_array, string);
 		if (new_array[count] == NULL) {
 			talloc_free(new_array);
-			return WERR_NOMEM;
+			return SBC_ERR_NOMEM;
 		}
 	}
 
 	*array = new_array;
 
-	return WERR_OK;
+	return SBC_ERR_OK;
 }
 
 bool smbconf_find_in_array(const char *string, char **list,
diff --git a/lib/socket/interfaces.c b/lib/socket/interfaces.c
new file mode 100644
index 0000000000..618714d1a7
--- /dev/null
+++ b/lib/socket/interfaces.c
@@ -0,0 +1,303 @@
+/*
+   Unix SMB/CIFS implementation.
+   return a list of network interfaces
+   Copyright (C) Andrew Tridgell 1998
+   Copyright (C) Jeremy Allison 2007
+   Copyright (C) Jelmer Vernooij 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "includes.h"
+#include "system/network.h"
+#include "interfaces.h"
+#include "lib/util/tsort.h"
+
+/****************************************************************************
+ Create a netmask: *pss_out is a copy of *pss_in whose address has the top
+ masklen bits set to 1, the rest 0. False if masklen/family is unsupported.
+****************************************************************************/
+
+bool make_netmask(struct sockaddr_storage *pss_out,
+		  const struct sockaddr_storage *pss_in, unsigned long masklen)
+{
+	*pss_out = *pss_in;
+	/* Now apply masklen bits of mask. */
+#if defined(HAVE_IPV6)
+	if (pss_in->ss_family == AF_INET6) {
+		char *p = (char *)&((struct sockaddr_in6 *)pss_out)->sin6_addr;
+		unsigned int i;
+		if (masklen > 128) {
+			return false;
+		}
+		for (i = 0; masklen >= 8; masklen -= 8, i++) {
+			*p++ = 0xff;
+		}
+		/* Partial byte; a /128 has none, so stay inside sin6_addr. */
+		if (i < sizeof(struct in6_addr)) {
+			*p++ = (char)(0xff & ~(0xff >> masklen));
+			i++;
+		}
+		memset(p, 0, sizeof(struct in6_addr) - i);
+		return true;
+	}
+#endif
+	if (pss_in->ss_family == AF_INET) {
+		if (masklen > 32) {
+			return false;
+		}
+		/* 64-bit shift: shifting a 32-bit long by 32 is undefined. */
+		((struct sockaddr_in *)pss_out)->sin_addr.s_addr =
+			htonl((uint32_t)~((uint64_t)0xFFFFFFFF >> masklen));
+		return true;
+	}
+	return false;
+}
+
+/****************************************************************************
+ Create a struct sockaddr_storage set to the broadcast or network address from
+ an incoming sockaddr_storage.
+****************************************************************************/
+
+static void make_bcast_or_net(struct sockaddr_storage *pss_out,
+			const struct sockaddr_storage *pss_in,
+			const struct sockaddr_storage *nmask,
+			bool make_bcast_p)
+{
+	unsigned int i = 0, len = 0;
+	const char *pmask = NULL;	/* read-only walk over the netmask bytes */
+	char *p = NULL;			/* write cursor into pss_out's address */
+	*pss_out = *pss_in;
+
+	/* bcast: set host (zero-mask) bits to 1; net: clear them. */
+#if defined(HAVE_IPV6)
+	if (pss_in->ss_family == AF_INET6) {
+		p = (char *)&((struct sockaddr_in6 *)pss_out)->sin6_addr;
+		pmask = (const char *)&((const struct sockaddr_in6 *)nmask)->sin6_addr;
+		len = sizeof(struct in6_addr);
+	}
+#endif
+	if (pss_in->ss_family == AF_INET) {
+		p = (char *)&((struct sockaddr_in *)pss_out)->sin_addr;
+		pmask = (const char *)&((const struct sockaddr_in *)nmask)->sin_addr;
+		len = sizeof(struct in_addr);
+	}
+
+	for (i = 0; i < len; i++, p++, pmask++) {
+		if (make_bcast_p) {
+			*p = (*p & *pmask) | (*pmask ^ 0xff);
+		} else {
+			/* make_net: keep network bits, clear host bits */
+			*p = (*p & *pmask);
+		}
+	}
+}
+
+void make_bcast(struct sockaddr_storage *pss_out,
+			const struct sockaddr_storage *pss_in,
+			const struct sockaddr_storage *nmask)
+{
+	make_bcast_or_net(pss_out, pss_in, nmask, true); /* true: set host bits */
+}
+
+void make_net(struct sockaddr_storage *pss_out,
+			const struct sockaddr_storage *pss_in,
+			const struct sockaddr_storage *nmask)
+{
+	make_bcast_or_net(pss_out, pss_in, nmask, false); /* false: clear host bits */
+}
+
+
+/****************************************************************************
+ Try the "standard" getifaddrs/freeifaddrs interfaces.
+ Also gets IPv6 interfaces.
+
+ Stores in *pifaces a talloc array (child of mem_ctx) holding every entry
+ that is IFF_UP, has an address and a netmask, and is broadcast, loopback or
+ point-to-point with a peer. Returns the entry count, or -1 on failure.
+****************************************************************************/
+
+static int _get_interfaces(TALLOC_CTX *mem_ctx, struct iface_struct **pifaces)
+{
+	struct iface_struct *ifaces;
+	struct ifaddrs *iflist = NULL;
+	struct ifaddrs *ifptr = NULL;
+	int count = 0;
+	int total = 0;
+	size_t copy_size;
+
+	if (getifaddrs(&iflist) < 0) {
+		return -1;
+	}
+
+	/* First pass: count candidates so the array is large enough. */
+	for (ifptr = iflist; ifptr != NULL; ifptr = ifptr->ifa_next) {
+		if (!ifptr->ifa_addr || !ifptr->ifa_netmask ||
+		    !(ifptr->ifa_flags & IFF_UP)) {
+			continue;
+		}
+		count += 1;
+	}
+
+	ifaces = talloc_array(mem_ctx, struct iface_struct, count);
+	if (ifaces == NULL) {
+		/* Release the system list first; it may clobber errno. */
+		freeifaddrs(iflist);
+		errno = ENOMEM;
+		return -1;
+	}
+
+	/* Second pass: copy out every usable interface. */
+	for (ifptr = iflist; ifptr != NULL; ifptr = ifptr->ifa_next) {
+
+		if (!ifptr->ifa_addr || !ifptr->ifa_netmask) {
+			continue;
+		}
+
+		/* Check the interface is up. */
+		if (!(ifptr->ifa_flags & IFF_UP)) {
+			continue;
+		}
+
+		memset(&ifaces[total], '\0', sizeof(ifaces[total]));
+
+		copy_size = sizeof(struct sockaddr_in);
+
+		ifaces[total].flags = ifptr->ifa_flags;
+
+#if defined(HAVE_IPV6)
+		if (ifptr->ifa_addr->sa_family == AF_INET6) {
+			copy_size = sizeof(struct sockaddr_in6);
+		}
+#endif
+
+		memcpy(&ifaces[total].ip, ifptr->ifa_addr, copy_size);
+		memcpy(&ifaces[total].netmask, ifptr->ifa_netmask, copy_size);
+
+		if (ifaces[total].flags & (IFF_BROADCAST|IFF_LOOPBACK)) {
+			make_bcast(&ifaces[total].bcast,
+				&ifaces[total].ip,
+				&ifaces[total].netmask);
+		} else if ((ifaces[total].flags & IFF_POINTOPOINT) &&
+			       ifptr->ifa_dstaddr) {
+			memcpy(&ifaces[total].bcast,
+				ifptr->ifa_dstaddr,
+				copy_size);
+		} else {
+			continue;
+		}
+
+		strlcpy(ifaces[total].name, ifptr->ifa_name,
+			sizeof(ifaces[total].name));
+		total++;
+	}
+
+	freeifaddrs(iflist);
+
+	*pifaces = ifaces;
+	return total;
+}
+
+/* Sort comparator for interface entries: IPv6 before IPv4, then ascending
+   by address, then by netmask. Returns <0, 0 or >0. */
+static int iface_comp(struct iface_struct *i1, struct iface_struct *i2)
+{
+	int r;
+
+#if defined(HAVE_IPV6)
+	/*
+	 * If we have IPv6 - sort these interfaces lower
+	 * than any IPv4 ones.
+	 */
+	if (i1->ip.ss_family == AF_INET6 && i2->ip.ss_family == AF_INET) {
+		return -1;
+	}
+	if (i1->ip.ss_family == AF_INET && i2->ip.ss_family == AF_INET6) {
+		return 1;
+	}
+
+	if (i1->ip.ss_family == AF_INET6) {
+		struct sockaddr_in6 *s1 = (struct sockaddr_in6 *)&i1->ip;
+		struct sockaddr_in6 *s2 = (struct sockaddr_in6 *)&i2->ip;
+
+		r = memcmp(&s1->sin6_addr, &s2->sin6_addr,
+				sizeof(struct in6_addr));
+		if (r) {
+			return r;
+		}
+
+		s1 = (struct sockaddr_in6 *)&i1->netmask;
+		s2 = (struct sockaddr_in6 *)&i2->netmask;
+
+		r = memcmp(&s1->sin6_addr, &s2->sin6_addr,
+				sizeof(struct in6_addr));
+		if (r) {
+			return r;
+		}
+	}
+#endif
+
+	/* AIX uses __ss_family instead of ss_family inside of
+	   sockaddr_storage. Instead of trying to figure out which field to
+	   use, we can just cast it to a sockaddr.
+	 */
+	if (((struct sockaddr *)&i1->ip)->sa_family == AF_INET) {
+		struct sockaddr_in *s1 = (struct sockaddr_in *)&i1->ip;
+		struct sockaddr_in *s2 = (struct sockaddr_in *)&i2->ip;
+
+		/* s_addr is big-endian, so memcmp orders numerically; a
+		   subtraction squeezed into an int can flip sign. */
+		r = memcmp(&s1->sin_addr, &s2->sin_addr,
+				sizeof(struct in_addr));
+		if (r) {
+			return r;
+		}
+
+		s1 = (struct sockaddr_in *)&i1->netmask;
+		s2 = (struct sockaddr_in *)&i2->netmask;
+
+		return memcmp(&s1->sin_addr, &s2->sin_addr,
+				sizeof(struct in_addr));
+	}
+	return 0;
+}
+
+/* this wrapper is used to remove duplicates from the interface list generated
+   above */
+int get_interfaces(TALLOC_CTX *mem_ctx, struct iface_struct **pifaces)
+{
+	struct iface_struct *ifaces;
+	int total, src, kept;
+
+	total = _get_interfaces(mem_ctx, &ifaces);
+	if (total <= 0) {
+		return total;
+	}
+
+	/* sorting puts entries that compare equal next to each other */
+	TYPESAFE_QSORT(ifaces, total, iface_comp);
+
+	/* compact in place; of each run of equal entries the last one wins */
+	kept = 1;
+	for (src = 1; src < total; src++) {
+		if (iface_comp(&ifaces[kept-1], &ifaces[src]) != 0) {
+			kept++;
+		}
+		ifaces[kept-1] = ifaces[src];
+	}
+
+	*pifaces = ifaces;
+	return kept;
+}
diff --git a/lib/socket/interfaces.h b/lib/socket/interfaces.h
new file mode 100644
index 0000000000..b4e113dcc8
--- /dev/null
+++ b/lib/socket/interfaces.h
@@ -0,0 +1,44 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   structures and prototypes for lib/socket/interfaces.c
+
+   Copyright (C) Andrew Tridgell 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "system/network.h"
+
+struct iface_struct {
+	char name[16];			/* interface name, truncated to fit */
+	int flags;			/* IFF_* flags as reported by getifaddrs() */
+	struct sockaddr_storage ip;	/* interface address */
+	struct sockaddr_storage netmask;	/* netmask belonging to ip */
+	struct sockaddr_storage bcast;	/* broadcast, or point-to-point peer */
+};
+
+struct interface;
+
+bool make_netmask(struct sockaddr_storage *pss_out,
+		  const struct sockaddr_storage *pss_in,
+		  unsigned long masklen);
+void make_bcast(struct sockaddr_storage *pss_out,
+		const struct sockaddr_storage *pss_in,
+		const struct sockaddr_storage *nmask);
+void make_net(struct sockaddr_storage *pss_out,
+	      const struct sockaddr_storage *pss_in,
+	      const struct sockaddr_storage *nmask);
+
+int get_interfaces(TALLOC_CTX *mem_ctx, struct iface_struct **pifaces);
diff --git a/lib/socket/wscript_build b/lib/socket/wscript_build
new file mode 100644
index 0000000000..61bde129c5
--- /dev/null
+++ b/lib/socket/wscript_build
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+bld.SAMBA_LIBRARY('interfaces',
+    source='interfaces.c',
+    deps='samba-util',
+    private_library=True
+    )
diff --git a/lib/talloc/talloc.3.xml b/lib/talloc/talloc.3.xml
index a327922dbe..99e8bcdb2f 100644
--- a/lib/talloc/talloc.3.xml
+++ b/lib/talloc/talloc.3.xml
@@ -783,9 +783,9 @@ if (ptr) memcpy(ptr, p, strlen(p)+1);</programlisting>
     </para>
     <para>
       This program is free software; you can redistribute it and/or modify
-      it under the terms of the GNU General Public License as published by
-      the Free Software Foundation; either version 3 of the License, or (at
-      your option) any later version.
+      it under the terms of the GNU Lesser General Public License as
+      published by the Free Software Foundation; either version 3 of the
+      License, or (at your option) any later version.
     </para>
     <para>
       This program is distributed in the hope that it will be useful, but
diff --git a/lib/talloc/talloc.c b/lib/talloc/talloc.c
index 91452bfada..4700aa99e8 100644
--- a/lib/talloc/talloc.c
+++ b/lib/talloc/talloc.c
@@ -178,6 +178,32 @@ static struct {
 	TC_INVALIDATE_SHRINK_VALGRIND_CHUNK(_tc, _new_size); \
 } while (0)
 
+#define TC_UNDEFINE_SHRINK_FILL_CHUNK(_tc, _new_size) do { \
+	if (unlikely(talloc_fill.enabled)) { \
+		size_t _flen = (_tc)->size - (_new_size); \
+		char *_fptr = (char *)TC_PTR_FROM_CHUNK(_tc); \
+		_fptr += (_new_size); \
+		memset(_fptr, talloc_fill.fill_value, _flen); \
+	} \
+} while (0)
+
+#if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_UNDEFINED)
+/* Mark the unused bytes as undefined */
+#define TC_UNDEFINE_SHRINK_VALGRIND_CHUNK(_tc, _new_size) do { \
+	size_t _flen = (_tc)->size - (_new_size); \
+	char *_fptr = (char *)TC_PTR_FROM_CHUNK(_tc); \
+	_fptr += (_new_size); \
+	VALGRIND_MAKE_MEM_UNDEFINED(_fptr, _flen); \
+} while (0)
+#else
+#define TC_UNDEFINE_SHRINK_VALGRIND_CHUNK(_tc, _new_size) do { } while (0)
+#endif
+
+#define TC_UNDEFINE_SHRINK_CHUNK(_tc, _new_size) do { \
+	TC_UNDEFINE_SHRINK_FILL_CHUNK(_tc, _new_size); \
+	TC_UNDEFINE_SHRINK_VALGRIND_CHUNK(_tc, _new_size); \
+} while (0)
+
 #if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_UNDEFINED)
 /* Mark the new bytes as undefined */
 #define TC_UNDEFINE_GROW_VALGRIND_CHUNK(_tc, _new_size) do { \
@@ -683,6 +709,69 @@ _PUBLIC_ void *_talloc_reference_loc(const void *context, const void *ptr, const
 
 static void *_talloc_steal_internal(const void *new_ctx, const void *ptr);
 
+/* Free one chunk that lives inside a talloc pool and update the pool's
+   bookkeeping; 'location' is recorded as the freed chunk's name. */
+static inline void _talloc_free_poolmem(struct talloc_chunk *tc,
+					const char *location)
+{
+	struct talloc_chunk *pool;
+	void *next_tc;
+	unsigned int *pool_object_count;
+
+	pool = (struct talloc_chunk *)tc->pool;
+	next_tc = TC_POOLMEM_NEXT_CHUNK(tc);
+
+	tc->flags |= TALLOC_FLAG_FREE;
+
+	/* we mark the freed memory with where we called the free
+	 * from. This means on a double free error we can report where
+	 * the first free came from
+	 */
+	tc->name = location;
+
+	TC_INVALIDATE_FULL_CHUNK(tc);
+
+	pool_object_count = talloc_pool_objectcount(pool);
+
+	if (unlikely(*pool_object_count == 0)) {
+		talloc_abort("Pool object count zero!");
+		return;
+	}
+
+	*pool_object_count -= 1;
+
+	if (unlikely(*pool_object_count == 1 && !(pool->flags & TALLOC_FLAG_FREE))) {
+		/*
+		 * if there is just one object left in the pool
+		 * and pool->flags does not have TALLOC_FLAG_FREE,
+		 * it means this is the pool itself and the rest
+		 * is available for new objects again.
+		 */
+		pool->pool = TC_POOL_FIRST_CHUNK(pool);
+		TC_INVALIDATE_POOL(pool);
+	} else if (unlikely(*pool_object_count == 0)) {
+		/*
+		 * nothing is left in the pool: record the free location
+		 * in the pool header as well, then release the pool.
+		 */
+		pool->name = location;
+
+		TC_INVALIDATE_FULL_CHUNK(pool);
+		free(pool);
+	} else if (pool->pool == next_tc) {
+		/*
+		 * if pool->pool still points to end of
+		 * 'tc' (which is stored in the 'next_tc' variable),
+		 * we can reclaim the memory of 'tc'.
+		 */
+		pool->pool = tc;
+	}
+}
+
+static inline void _talloc_free_children_internal(struct talloc_chunk *tc,
+						  void *ptr,
+						  const char *location);
+
 /* 
    internal talloc_free call
 */
@@ -753,41 +842,7 @@ static inline int _talloc_free_internal(void *ptr, const char *location)
 
 	tc->flags |= TALLOC_FLAG_LOOP;
 
-	while (tc->child) {
-		/* we need to work out who will own an abandoned child
-		   if it cannot be freed. In priority order, the first
-		   choice is owner of any remaining reference to this
-		   pointer, the second choice is our parent, and the
-		   final choice is the null context. */
-		void *child = TC_PTR_FROM_CHUNK(tc->child);
-		const void *new_parent = null_context;
-		struct talloc_chunk *old_parent = NULL;
-		if (unlikely(tc->child->refs)) {
-			struct talloc_chunk *p = talloc_parent_chunk(tc->child->refs);
-			if (p) new_parent = TC_PTR_FROM_CHUNK(p);
-		}
-		/* finding the parent here is potentially quite
-		   expensive, but the alternative, which is to change
-		   talloc to always have a valid tc->parent pointer,
-		   makes realloc more expensive where there are a
-		   large number of children.
-
-		   The reason we need the parent pointer here is that
-		   if _talloc_free_internal() fails due to references
-		   or a failing destructor we need to re-parent, but
-		   the free call can invalidate the prev pointer.
-		*/
-		if (new_parent == null_context && (tc->child->refs || tc->child->destructor)) {
-			old_parent = talloc_parent_chunk(ptr);
-		}
-		if (unlikely(_talloc_free_internal(child, location) == -1)) {
-			if (new_parent == null_context) {
-				struct talloc_chunk *p = old_parent;
-				if (p) new_parent = TC_PTR_FROM_CHUNK(p);
-			}
-			_talloc_steal_internal(new_parent, child);
-		}
-	}
+	_talloc_free_children_internal(tc, ptr, location);
 
 	tc->flags |= TALLOC_FLAG_FREE;
 
@@ -797,21 +852,10 @@ static inline int _talloc_free_internal(void *ptr, const char *location)
 	 */	 
 	tc->name = location;
 
-	if (tc->flags & (TALLOC_FLAG_POOL|TALLOC_FLAG_POOLMEM)) {
-		struct talloc_chunk *pool;
-		void *next_tc = NULL;
+	if (tc->flags & TALLOC_FLAG_POOL) {
 		unsigned int *pool_object_count;
 
-		if (unlikely(tc->flags & TALLOC_FLAG_POOL)) {
-			pool = tc;
-		} else {
-			pool = (struct talloc_chunk *)tc->pool;
-			next_tc = TC_POOLMEM_NEXT_CHUNK(tc);
-
-			TC_INVALIDATE_FULL_CHUNK(tc);
-		}
-
-		pool_object_count = talloc_pool_objectcount(pool);
+		pool_object_count = talloc_pool_objectcount(tc);
 
 		if (unlikely(*pool_object_count == 0)) {
 			talloc_abort("Pool object count zero!");
@@ -820,26 +864,12 @@ static inline int _talloc_free_internal(void *ptr, const char *location)
 
 		*pool_object_count -= 1;
 
-		if (unlikely(*pool_object_count == 1)) {
-			/*
-			 * if there is just object left in the pool
-			 * it means this is the pool itself and
-			 * the rest is available for new objects
-			 * again.
-			 */
-			pool->pool = TC_POOL_FIRST_CHUNK(pool);
-			TC_INVALIDATE_POOL(pool);
-		} else if (unlikely(*pool_object_count == 0)) {
-			TC_INVALIDATE_FULL_CHUNK(pool);
-			free(pool);
-		} else if (pool->pool == next_tc) {
-			/*
-			 * if pool->pool still points to end of
-			 * 'tc' (which is stored in the 'next_tc' variable),
-			 * we can reclaim the memory of 'tc'.
-			 */
-			pool->pool = tc;
+		if (unlikely(*pool_object_count == 0)) {
+			TC_INVALIDATE_FULL_CHUNK(tc);
+			free(tc);
 		}
+	} else if (tc->flags & TALLOC_FLAG_POOLMEM) {
+		_talloc_free_poolmem(tc, location);
 	} else {
 		TC_INVALIDATE_FULL_CHUNK(tc);
 		free(tc);
@@ -1204,21 +1234,10 @@ _PUBLIC_ void *talloc_init(const char *fmt, ...)
 	return ptr;
 }
 
-/*
-  this is a replacement for the Samba3 talloc_destroy_pool functionality. It
-  should probably not be used in new code. It's in here to keep the talloc
-  code consistent across Samba 3 and 4.
-*/
-_PUBLIC_ void talloc_free_children(void *ptr)
+static inline void _talloc_free_children_internal(struct talloc_chunk *tc,
+						  void *ptr,
+						  const char *location)
 {
-	struct talloc_chunk *tc;
-
-	if (unlikely(ptr == NULL)) {
-		return;
-	}
-
-	tc = talloc_chunk_from_ptr(ptr);
-
 	while (tc->child) {
 		/* we need to work out who will own an abandoned child
 		   if it cannot be freed. In priority order, the first
@@ -1227,13 +1246,28 @@ _PUBLIC_ void talloc_free_children(void *ptr)
 		   final choice is the null context. */
 		void *child = TC_PTR_FROM_CHUNK(tc->child);
 		const void *new_parent = null_context;
+		struct talloc_chunk *old_parent = NULL;
 		if (unlikely(tc->child->refs)) {
 			struct talloc_chunk *p = talloc_parent_chunk(tc->child->refs);
 			if (p) new_parent = TC_PTR_FROM_CHUNK(p);
 		}
-		if (unlikely(talloc_free(child) == -1)) {
+		/* finding the parent here is potentially quite
+		   expensive, but the alternative, which is to change
+		   talloc to always have a valid tc->parent pointer,
+		   makes realloc more expensive where there are a
+		   large number of children.
+
+		   The reason we need the parent pointer here is that
+		   if _talloc_free_internal() fails due to references
+		   or a failing destructor we need to re-parent, but
+		   the free call can invalidate the prev pointer.
+		*/
+		if (new_parent == null_context && (tc->child->refs || tc->child->destructor)) {
+			old_parent = talloc_parent_chunk(ptr);
+		}
+		if (unlikely(_talloc_free_internal(child, location) == -1)) {
 			if (new_parent == null_context) {
-				struct talloc_chunk *p = talloc_parent_chunk(ptr);
+				struct talloc_chunk *p = old_parent;
 				if (p) new_parent = TC_PTR_FROM_CHUNK(p);
 			}
 			_talloc_steal_internal(new_parent, child);
@@ -1241,6 +1275,24 @@ _PUBLIC_ void talloc_free_children(void *ptr)
 	}
 }
 
+/*
+  Free all children of ptr; a NULL ptr is a no-op. This replaces the Samba3
+  talloc_destroy_pool functionality and exists to keep the talloc code
+  consistent across Samba 3 and 4. It should probably not be used in new code.
+*/
+_PUBLIC_ void talloc_free_children(void *ptr)
+{
+	struct talloc_chunk *tc;
+
+	if (unlikely(ptr == NULL)) {
+		return;
+	}
+
+	tc = talloc_chunk_from_ptr(ptr);
+	/* the frees are attributed to this wrapper's own __location__ */
+	_talloc_free_children_internal(tc, ptr, __location__);
+}
+
 /* 
    Allocate a bit of memory as a child of an existing pointer
 */
@@ -1365,7 +1417,16 @@ _PUBLIC_ void *_talloc_realloc(const void *context, void *ptr, size_t size, cons
 			}
 			return ptr;
 		} else if ((tc->size - size) < 1024) {
-			TC_INVALIDATE_SHRINK_CHUNK(tc, size);
+			/*
+			 * if we call TC_INVALIDATE_SHRINK_CHUNK() here
+			 * we would need to call TC_UNDEFINE_GROW_CHUNK()
+			 * after each realloc call, which slows down
+			 * testing a lot :-(.
+			 *
+			 * That is why we only mark memory as undefined here.
+			 */
+			TC_UNDEFINE_SHRINK_CHUNK(tc, size);
+
 			/* do not shrink if we have less than 1k to gain */
 			tc->size = size;
 			return ptr;
@@ -1410,8 +1471,13 @@ _PUBLIC_ void *_talloc_realloc(const void *context, void *ptr, size_t size, cons
 		size_t new_chunk_size = TC_ALIGN16(TC_HDR_SIZE + size);
 		size_t space_needed;
 		size_t space_left;
+		unsigned int chunk_count = *talloc_pool_objectcount(pool_tc);
+
+		if (!(pool_tc->flags & TALLOC_FLAG_FREE)) {
+			chunk_count -= 1;
+		}
 
-		if (*talloc_pool_objectcount(pool_tc) == 2) {
+		if (chunk_count == 1) {
 			/*
 			 * optimize for the case where 'tc' is the only
 			 * chunk in the pool.
@@ -1438,6 +1504,7 @@ _PUBLIC_ void *_talloc_realloc(const void *context, void *ptr, size_t size, cons
 				memmove(pool_tc->pool, tc, old_used);
 				new_ptr = pool_tc->pool;
 
+				tc = (struct talloc_chunk *)new_ptr;
 				TC_UNDEFINE_GROW_CHUNK(tc, size);
 
 				/*
@@ -1481,7 +1548,6 @@ _PUBLIC_ void *_talloc_realloc(const void *context, void *ptr, size_t size, cons
 		}
 
 		new_ptr = talloc_alloc_pool(tc, size + TC_HDR_SIZE);
-		*talloc_pool_objectcount(pool_tc) -= 1;
 
 		if (new_ptr == NULL) {
 			new_ptr = malloc(TC_HDR_SIZE+size);
@@ -1490,21 +1556,8 @@ _PUBLIC_ void *_talloc_realloc(const void *context, void *ptr, size_t size, cons
 
 		if (new_ptr) {
 			memcpy(new_ptr, tc, MIN(tc->size,size) + TC_HDR_SIZE);
-			TC_INVALIDATE_FULL_CHUNK(tc);
 
-			if (*talloc_pool_objectcount(pool_tc) == 1) {
-				/*
-				 * If the pool is empty now reclaim everything.
-				 */
-				pool_tc->pool = TC_POOL_FIRST_CHUNK(pool_tc);
-				TC_INVALIDATE_POOL(pool_tc);
-			} else if (next_tc == pool_tc->pool) {
-				/*
-				 * If it was reallocated and tc was the last
-				 * chunk, we can reclaim the memory of tc.
-				 */
-				pool_tc->pool = tc;
-			}
+			_talloc_free_poolmem(tc, __location__ "_talloc_realloc");
 		}
 	}
 	else {
diff --git a/lib/talloc/testsuite.c b/lib/talloc/testsuite.c
index ba583ab84e..90417c6ade 100644
--- a/lib/talloc/testsuite.c
+++ b/lib/talloc/testsuite.c
@@ -1128,23 +1128,31 @@ static bool test_pool(void)
 	pool = talloc_pool(NULL, 1024);
 
 	p1 = talloc_size(pool, 80);
+	memset(p1, 0x11, talloc_get_size(p1));
 	p2 = talloc_size(pool, 20);
+	memset(p2, 0x11, talloc_get_size(p2));
 	p3 = talloc_size(p1, 50);
+	memset(p3, 0x11, talloc_get_size(p3));
 	p4 = talloc_size(p3, 1000);
+	memset(p4, 0x11, talloc_get_size(p4));
 
 #if 1 /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
 	p2_2 = talloc_realloc_size(pool, p2, 20+1);
 	torture_assert("pool realloc 20+1", p2_2 == p2, "failed: pointer changed");
+	memset(p2, 0x11, talloc_get_size(p2));
 	p2_2 = talloc_realloc_size(pool, p2, 20-1);
 	torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+	memset(p2, 0x11, talloc_get_size(p2));
 	p2_2 = talloc_realloc_size(pool, p2, 20-1);
 	torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+	memset(p2, 0x11, talloc_get_size(p2));
 
 	talloc_free(p3);
 
 	/* this should reclaim the memory of p4 and p3 */
 	p2_2 = talloc_realloc_size(pool, p2, 400);
 	torture_assert("pool realloc 400", p2_2 == p2, "failed: pointer changed");
+	memset(p2, 0x11, talloc_get_size(p2));
 
 	talloc_free(p1);
 
@@ -1152,37 +1160,46 @@ static bool test_pool(void)
 	p2_2 = talloc_realloc_size(pool, p2, 800);
 	torture_assert("pool realloc 800", p2_2 == p1, "failed: pointer not changed");
 	p2 = p2_2;
+	memset(p2, 0x11, talloc_get_size(p2));
 
 	/* this should do a malloc */
 	p2_2 = talloc_realloc_size(pool, p2, 1800);
 	torture_assert("pool realloc 1800", p2_2 != p2, "failed: pointer not changed");
 	p2 = p2_2;
+	memset(p2, 0x11, talloc_get_size(p2));
 
 	/* this should reclaim the memory from the pool */
 	p3 = talloc_size(pool, 80);
 	torture_assert("pool alloc 80", p3 == p1, "failed: pointer changed");
+	memset(p3, 0x11, talloc_get_size(p3));
 
 	talloc_free(p2);
 	talloc_free(p3);
 
 	p1 = talloc_size(pool, 80);
+	memset(p1, 0x11, talloc_get_size(p1));
 	p2 = talloc_size(pool, 20);
+	memset(p2, 0x11, talloc_get_size(p2));
 
 	talloc_free(p1);
 
 	p2_2 = talloc_realloc_size(pool, p2, 20-1);
 	torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+	memset(p2, 0x11, talloc_get_size(p2));
 	p2_2 = talloc_realloc_size(pool, p2, 20-1);
 	torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+	memset(p2, 0x11, talloc_get_size(p2));
 
 	/* this should do a malloc */
 	p2_2 = talloc_realloc_size(pool, p2, 1800);
 	torture_assert("pool realloc 1800", p2_2 != p2, "failed: pointer not changed");
 	p2 = p2_2;
+	memset(p2, 0x11, talloc_get_size(p2));
 
 	/* this should reclaim the memory from the pool */
 	p3 = talloc_size(pool, 800);
 	torture_assert("pool alloc 800", p3 == p1, "failed: pointer changed");
+	memset(p3, 0x11, talloc_get_size(p3));
 
 #endif /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
 
@@ -1191,6 +1208,73 @@ static bool test_pool(void)
 	return true;
 }
 
+static bool test_pool_steal(void)
+{
+	void *root;
+	void *pool;
+	void *p1, *p2;
+	void *p1_2, *p2_2;
+	size_t hdr;
+	size_t ofs1, ofs2;
+	/* chunks stolen out of a pool must stay usable once the pool is freed */
+	root = talloc_new(NULL);
+	pool = talloc_pool(root, 1024);
+
+	p1 = talloc_size(pool, 4 * 16);
+	torture_assert("pool allocate 4 * 16", p1 != NULL, "failed ");
+	memset(p1, 0x11, talloc_get_size(p1));
+	p2 = talloc_size(pool, 4 * 16);
+	torture_assert("pool allocate 4 * 16", p2 > p1, "failed: !(p2 > p1) ");
+	memset(p2, 0x11, talloc_get_size(p2));
+	/* hdr: per-chunk overhead, i.e. the gap between p1's payload and p2 */
+	ofs1 = PTR_DIFF(p2, p1);
+	hdr = ofs1 - talloc_get_size(p1);
+	/* re-parent both chunks to root, then free the pool they came from */
+	talloc_steal(root, p1);
+	talloc_steal(root, p2);
+
+	talloc_free(pool);
+	/* fallback used by the free below if the #if block is compiled out */
+	p1_2 = p1;
+
+#if 1 /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+	p1_2 = talloc_realloc_size(root, p1, 5 * 16);
+	torture_assert("pool realloc 5 * 16", p1_2 > p2, "failed: pointer not changed");
+	memset(p1_2, 0x11, talloc_get_size(p1_2));
+	ofs1 = PTR_DIFF(p1_2, p2);
+	ofs2 = talloc_get_size(p2) + hdr;
+	/* the grown p1 is expected to sit directly behind p2 */
+	torture_assert("pool realloc ", ofs1 == ofs2, "failed: pointer offset unexpected");
+	/* shrinking p2 is expected to leave it where it is */
+	p2_2 = talloc_realloc_size(root, p2, 3 * 16);
+	torture_assert("pool realloc 3 * 16", p2_2 == p2, "failed: pointer changed");
+	memset(p2_2, 0x11, talloc_get_size(p2_2));
+#endif /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+
+	talloc_free(p1_2);
+
+	p2_2 = p2;
+
+#if 1 /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+	/* now we should reclaim the full pool */
+	p2_2 = talloc_realloc_size(root, p2, 8 * 16);
+	torture_assert("pool realloc 8 * 16", p2_2 == p1, "failed: pointer not expected");
+	p2 = p2_2;
+	memset(p2_2, 0x11, talloc_get_size(p2_2));
+
+	/* now we malloc and free the full pool space */
+	p2_2 = talloc_realloc_size(root, p2, 2 * 1024);
+	torture_assert("pool realloc 2 * 1024", p2_2 != p1, "failed: pointer not expected");
+	memset(p2_2, 0x11, talloc_get_size(p2_2));
+
+#endif /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+
+	talloc_free(p2_2);
+
+	talloc_free(root);
+
+	return true;
+}
 
 static bool test_free_ref_null_context(void)
 {
@@ -1290,6 +1374,8 @@ bool torture_local_talloc(struct torture_context *tctx)
 	test_reset();
 	ret &= test_pool();
 	test_reset();
+	ret &= test_pool_steal();
+	test_reset();
 	ret &= test_free_ref_null_context();
 	test_reset();
 	ret &= test_rusty();
diff --git a/lib/tdb/common/hash.c b/lib/tdb/common/hash.c
index 2472ed1ace..1eed7221d2 100644
--- a/lib/tdb/common/hash.c
+++ b/lib/tdb/common/hash.c
@@ -214,9 +214,7 @@ static uint32_t hashlittle( const void *key, size_t length )
   u.ptr = key;
   if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
     const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
-#ifdef VALGRIND
     const uint8_t  *k8;
-#endif
 
     /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
     while (length > 12)
@@ -230,36 +228,6 @@ static uint32_t hashlittle( const void *key, size_t length )
     }
 
     /*----------------------------- handle the last (probably partial) block */
-    /*
-     * "k[2]&0xffffff" actually reads beyond the end of the string, but
-     * then masks off the part it's not allowed to read.  Because the
-     * string is aligned, the masked-off tail is in the same word as the
-     * rest of the string.  Every machine with memory protection I've seen
-     * does it on word boundaries, so is OK with this.  But VALGRIND will
-     * still catch it and complain.  The masking trick does make the hash
-     * noticably faster for short strings (like English words).
-     */
-#ifndef VALGRIND
-
-    switch(length)
-    {
-    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
-    case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
-    case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
-    case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
-    case 8 : b+=k[1]; a+=k[0]; break;
-    case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
-    case 6 : b+=k[1]&0xffff; a+=k[0]; break;
-    case 5 : b+=k[1]&0xff; a+=k[0]; break;
-    case 4 : a+=k[0]; break;
-    case 3 : a+=k[0]&0xffffff; break;
-    case 2 : a+=k[0]&0xffff; break;
-    case 1 : a+=k[0]&0xff; break;
-    case 0 : return c;              /* zero length strings require no mixing */
-    }
-
-#else /* make valgrind happy */
-
     k8 = (const uint8_t *)k;
     switch(length)
     {
@@ -277,9 +245,6 @@ static uint32_t hashlittle( const void *key, size_t length )
     case 1 : a+=k8[0]; break;
     case 0 : return c;
     }
-
-#endif /* !valgrind */
-
   } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
     const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
     const uint8_t  *k8;
diff --git a/lib/tdb/pytdb.c b/lib/tdb/pytdb.c
index 0faba562de..3dd785e7be 100644
--- a/lib/tdb/pytdb.c
+++ b/lib/tdb/pytdb.c
@@ -558,6 +558,7 @@ static PyMethodDef tdb_methods[] = {
 	{ NULL }
 };
 
+void inittdb(void);
 void inittdb(void)
 {
 	PyObject *m;
diff --git a/lib/tdb/python/tests/simple.py b/lib/tdb/python/tests/simple.py
index f5484a0523..2877092fe3 100644
--- a/lib/tdb/python/tests/simple.py
+++ b/lib/tdb/python/tests/simple.py
@@ -20,8 +20,13 @@ class OpenTdbTests(TestCase):
 class CloseTdbTests(TestCase):
 
     def test_double_close(self):
-        self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
-                os.O_CREAT|os.O_RDWR)
+        # No hash size in tdb2.
+        if tdb.__version__.startswith("2"):
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
+        else:
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
         self.assertNotEqual(None, self.tdb)
 
         # ensure that double close does not crash python
@@ -42,8 +47,12 @@ class SimpleTdbTests(TestCase):
 
     def setUp(self):
         super(SimpleTdbTests, self).setUp()
-        self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
-                os.O_CREAT|os.O_RDWR)
+        if tdb.__version__.startswith("2"):
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
+        else:
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
         self.assertNotEqual(None, self.tdb)
 
     def tearDown(self):
@@ -56,7 +65,8 @@ class SimpleTdbTests(TestCase):
         self.tdb.lock_all()
 
     def test_max_dead(self):
-        self.tdb.max_dead = 20
+        if not tdb.__version__.startswith("2"):
+            self.tdb.max_dead = 20
 
     def test_unlockall(self):
         self.tdb.lock_all()
@@ -67,7 +77,8 @@ class SimpleTdbTests(TestCase):
         self.tdb.read_unlock_all()
 
     def test_reopen(self):
-        self.tdb.reopen()
+        if not tdb.__version__.startswith("2"):
+            self.tdb.reopen()
 
     def test_store(self):
         self.tdb.store("bar", "bla")
@@ -75,7 +86,8 @@ class SimpleTdbTests(TestCase):
 
     def test_getitem(self):
         self.tdb["bar"] = "foo"
-        self.tdb.reopen()
+        if not tdb.__version__.startswith("2"):
+            self.tdb.reopen()
         self.assertEquals("foo", self.tdb["bar"])
 
     def test_delete(self):
@@ -91,13 +103,16 @@ class SimpleTdbTests(TestCase):
         self.assertRaises(KeyError, lambda: self.tdb["bla"])
 
     def test_hash_size(self):
-        self.tdb.hash_size
+        if not tdb.__version__.startswith("2"):
+            self.tdb.hash_size
 
     def test_map_size(self):
-        self.tdb.map_size
+        if not tdb.__version__.startswith("2"):
+            self.tdb.map_size
 
     def test_freelist_size(self):
-        self.tdb.freelist_size
+        if not tdb.__version__.startswith("2"):
+            self.tdb.freelist_size
 
     def test_name(self):
         self.tdb.filename
@@ -105,7 +120,9 @@ class SimpleTdbTests(TestCase):
     def test_iterator(self):
         self.tdb["bla"] = "1"
         self.tdb["brainslug"] = "2"
-        self.assertEquals(["bla", "brainslug"], list(self.tdb))
+        l = list(self.tdb)
+        l.sort()
+        self.assertEquals(["bla", "brainslug"], l)
 
     def test_transaction_cancel(self):
         self.tdb["bloe"] = "2"
@@ -143,17 +160,19 @@ class SimpleTdbTests(TestCase):
         self.assertEquals(0, len(list(self.tdb)))
 
     def test_repack(self):
-        self.tdb["foo"] = "abc"
-        self.tdb["bar"] = "def"
-        del self.tdb["foo"]
-        self.tdb.repack()
+        if not tdb.__version__.startswith("2"):
+            self.tdb["foo"] = "abc"
+            self.tdb["bar"] = "def"
+            del self.tdb["foo"]
+            self.tdb.repack()
 
     def test_seqnum(self):
-        self.tdb.enable_seqnum()
-        seq1 = self.tdb.seqnum
-        self.tdb.increment_seqnum_nonblock()
-        seq2 = self.tdb.seqnum
-        self.assertEquals(seq2-seq1, 1)
+        if not tdb.__version__.startswith("2"):
+            self.tdb.enable_seqnum()
+            seq1 = self.tdb.seqnum
+            self.tdb.increment_seqnum_nonblock()
+            seq2 = self.tdb.seqnum
+            self.assertEquals(seq2-seq1, 1)
 
     def test_len(self):
         self.assertEquals(0, len(list(self.tdb)))
@@ -161,8 +180,12 @@ class SimpleTdbTests(TestCase):
         self.assertEquals(1, len(list(self.tdb)))
 
     def test_add_flags(self):
-        self.tdb.add_flags(tdb.NOMMAP)
-        self.tdb.remove_flags(tdb.NOMMAP)
+        if tdb.__version__.startswith("2"):
+            self.tdb.add_flag(tdb.NOMMAP)
+            self.tdb.remove_flag(tdb.NOMMAP)
+        else:
+            self.tdb.add_flags(tdb.NOMMAP)
+            self.tdb.remove_flags(tdb.NOMMAP)
 
 
 class VersionTests(TestCase):
diff --git a/lib/tdb/tools/tdbrestore.c b/lib/tdb/tools/tdbrestore.c
index 95ee360647..1daac63db1 100644
--- a/lib/tdb/tools/tdbrestore.c
+++ b/lib/tdb/tools/tdbrestore.c
@@ -170,7 +170,7 @@ static int read_rec(FILE *f, TDB_CONTEXT *tdb, int *eof)
 	    || (swallow(f, "}\n", NULL) == -1)) {
 		goto fail;
 	}
-	if (tdb_store(tdb, key, data, TDB_INSERT) == -1) {
+	if (tdb_store(tdb, key, data, TDB_INSERT) != 0) {
 		fprintf(stderr, "TDB error: %s\n", tdb_errorstr(tdb));
 		goto fail;
 	}
diff --git a/lib/tdb/tools/tdbtool.c b/lib/tdb/tools/tdbtool.c
index cd17f79e32..99d4841cf3 100644
--- a/lib/tdb/tools/tdbtool.c
+++ b/lib/tdb/tools/tdbtool.c
@@ -257,7 +257,7 @@ static void insert_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
 	dbuf.dptr = (unsigned char *)data;
 	dbuf.dsize = datalen;
 
-	if (tdb_store(tdb, key, dbuf, TDB_INSERT) == -1) {
+	if (tdb_store(tdb, key, dbuf, TDB_INSERT) != 0) {
 		terror("insert failed");
 	}
 }
@@ -284,7 +284,7 @@ static void store_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
 	printf("Storing key:\n");
 	print_rec(tdb, key, dbuf, NULL);
 
-	if (tdb_store(tdb, key, dbuf, TDB_REPLACE) == -1) {
+	if (tdb_store(tdb, key, dbuf, TDB_REPLACE) != 0) {
 		terror("store failed");
 	}
 }
@@ -363,7 +363,7 @@ static void move_rec(char *keyname, size_t keylen, char* tdbname)
 		return;
 	}
 	
-	if ( tdb_store( dst_tdb, key, dbuf, TDB_REPLACE ) == -1 ) {
+	if (tdb_store( dst_tdb, key, dbuf, TDB_REPLACE ) != 0) {
 		terror("failed to move record");
 	}
 	else
diff --git a/lib/tdb2/LICENSE b/lib/tdb2/LICENSE
new file mode 100644
index 0000000000..cca7fc278f
--- /dev/null
+++ b/lib/tdb2/LICENSE
@@ -0,0 +1,165 @@
+		   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/lib/tdb2/TODO b/lib/tdb2/TODO
new file mode 100644
index 0000000000..0a9374f016
--- /dev/null
+++ b/lib/tdb2/TODO
@@ -0,0 +1,4 @@
+- tdb2restore, tdb2dump, tdb2backup
+- tdb2tool man page
+- Integrate ccan testsuite
+- Integrate tdb2 testsuite
diff --git a/lib/tdb2/_info b/lib/tdb2/_info
new file mode 100644
index 0000000000..7213d67a22
--- /dev/null
+++ b/lib/tdb2/_info
@@ -0,0 +1,91 @@
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database
+ *
+ * The tdb2 module provides an efficient keyword data mapping (usually
+ * within a file).  It supports transactions, so the contents of the
+ * database is reliable even across crashes.
+ *
+ * Example:
+ *	#include <ccan/tdb2/tdb2.h>
+ *	#include <ccan/str/str.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *
+ *	static void usage(const char *argv0)
+ *	{
+ *		errx(1, "Usage: %s fetch <dbfile> <key>\n"
+ *		     "OR %s store <dbfile> <key> <data>", argv0, argv0);
+ *	}
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		struct tdb_context *tdb;
+ *		TDB_DATA key, value;
+ *		enum TDB_ERROR error;
+ *
+ *		if (argc < 4)
+ *			usage(argv[0]);
+ *
+ *		tdb = tdb_open(argv[2], TDB_DEFAULT, O_CREAT|O_RDWR,0600, NULL);
+ *		if (!tdb)
+ *			err(1, "Opening %s", argv[2]);
+ *
+ *		key.dptr = (void *)argv[3];
+ *		key.dsize = strlen(argv[3]);
+ *
+ *		if (streq(argv[1], "fetch")) {
+ *			if (argc != 4)
+ *				usage(argv[0]);
+ *			error = tdb_fetch(tdb, key, &value);
+ *			if (error)
+ *				errx(1, "fetch %s: %s",
+ *				     argv[3], tdb_errorstr(error));
+ *			printf("%.*s\n", (int)value.dsize, (char *)value.dptr);
+ *			free(value.dptr);
+ *		} else if (streq(argv[1], "store")) {
+ *			if (argc != 5)
+ *				usage(argv[0]);
+ *			value.dptr = (void *)argv[4];
+ *			value.dsize = strlen(argv[4]);
+ *			error = tdb_store(tdb, key, value, 0);
+ *			if (error)
+ *				errx(1, "store %s: %s",
+ *				     argv[3], tdb_errorstr(error));
+ *		} else
+ *			usage(argv[0]);
+ *
+ *		return 0;
+ *	}
+ *
+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Author: Rusty Russell
+ *
+ * License: LGPLv3 (or later)
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/asprintf\n");
+		printf("ccan/hash\n");
+		printf("ccan/likely\n");
+		printf("ccan/asearch\n");
+		printf("ccan/compiler\n");
+		printf("ccan/build_assert\n");
+		printf("ccan/ilog\n");
+		printf("ccan/failtest\n");
+		printf("ccan/tally\n");
+		printf("ccan/typesafe_cb\n");
+		printf("ccan/cast\n");
+		printf("ccan/endian\n");
+		return 0;
+	}
+
+	return 1;
+}
diff --git a/lib/tdb2/check.c b/lib/tdb2/check.c
new file mode 100644
index 0000000000..52fb188764
--- /dev/null
+++ b/lib/tdb2/check.c
@@ -0,0 +1,835 @@
+ /*
+   Trivial Database 2: consistency checking (tdb_check)
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/asearch/asearch.h>
+
+/* Append off to the (ordered) array of offsets; false if out of memory. */
+static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
+{
+	tdb_off_t *grown = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
+	if (!grown)
+		return false;
+	*arr = grown;
+	grown[(*num)++] = off;
+	return true;
+}
+
+static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
+				   uint64_t *features)
+{
+	uint64_t hash_test;
+	struct tdb_header hdr;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	/* magic food should not be converted, so convert back. */
+	tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
+
+	hash_test = TDB_HASH_MAGIC;
+	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
+	if (hdr.hash_test != hash_test) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "check: hash test %llu should be %llu",
+				  (long long)hdr.hash_test,
+				  (long long)hash_test);
+	}
+
+	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "check: bad magic '%.*s'",
+				  (unsigned)sizeof(hdr.magic_food),
+				  hdr.magic_food);
+	}
+
+	/* Features which are used must be a subset of features offered. */
+	if (hdr.features_used & ~hdr.features_offered) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "check: features used (0x%llx) which"
+				  " are not offered (0x%llx)",
+				  (long long)hdr.features_used,
+				  (long long)hdr.features_offered);
+	}
+
+	*features = hdr.features_offered;
+	*recovery = hdr.recovery;
+	if (*recovery) {
+		if (*recovery < sizeof(hdr)
+		    || *recovery > tdb->file->map_size) {
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "tdb_check:"
+					  " invalid recovery offset %zu",
+					  (size_t)*recovery);
+		}
+	}
+
+	/* Don't check reserved: they *can* be used later. */
+	return TDB_SUCCESS;
+}
+
+static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
+				      tdb_off_t off, unsigned int group_bits,
+				      uint64_t hprefix,
+				      unsigned hprefix_bits,
+				      tdb_off_t used[],
+				      size_t num_used,
+				      size_t *num_found,
+				      enum TDB_ERROR (*check)(TDB_DATA,
+							      TDB_DATA, void *),
+				      void *data);
+
+static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
+				       tdb_off_t off,
+				       uint64_t hash,
+				       tdb_off_t used[],
+				       size_t num_used,
+				       size_t *num_found,
+				       enum TDB_ERROR (*check)(TDB_DATA,
+							       TDB_DATA,
+							       void *),
+				       void *data)
+{
+	struct tdb_used_record rec;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash chain magic %llu",
+				  (long long)rec_magic(&rec));
+	}
+
+	if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check:"
+				  " Bad hash chain length %llu vs %zu",
+				  (long long)rec_data_length(&rec),
+				  sizeof(struct tdb_chain));
+	}
+	if (rec_key_length(&rec) != 0) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash chain key length %llu",
+				  (long long)rec_key_length(&rec));
+	}
+	if (rec_hash(&rec) != 0) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash chain hash value %llu",
+				  (long long)rec_hash(&rec));
+	}
+
+	off += sizeof(rec);
+	ecode = check_hash_tree(tdb, off, 0, hash, 64,
+				used, num_used, num_found, check, data);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
+	if (TDB_OFF_IS_ERR(off)) {
+		return off;
+	}
+	if (off == 0)
+		return TDB_SUCCESS;
+	(*num_found)++;
+	return check_hash_chain(tdb, off, hash, used, num_used, num_found,
+				check, data);
+}
+
+static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
+					tdb_off_t off,
+					uint64_t hprefix,
+					unsigned hprefix_bits,
+					tdb_off_t used[],
+					size_t num_used,
+					size_t *num_found,
+					enum TDB_ERROR (*check)(TDB_DATA,
+								TDB_DATA,
+								void *),
+					void *data)
+{
+	struct tdb_used_record rec;
+	enum TDB_ERROR ecode;
+
+	if (hprefix_bits >= 64)
+		return check_hash_chain(tdb, off, hprefix, used, num_used,
+					num_found, check, data);
+
+	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash table magic %llu",
+				  (long long)rec_magic(&rec));
+	}
+	if (rec_data_length(&rec)
+	    != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check:"
+				  " Bad hash table length %llu vs %llu",
+				  (long long)rec_data_length(&rec),
+				  (long long)sizeof(tdb_off_t)
+				  << TDB_SUBLEVEL_HASH_BITS);
+	}
+	if (rec_key_length(&rec) != 0) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash table key length %llu",
+				  (long long)rec_key_length(&rec));
+	}
+	if (rec_hash(&rec) != 0) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash table hash value %llu",
+				  (long long)rec_hash(&rec));
+	}
+
+	off += sizeof(rec);
+	return check_hash_tree(tdb, off,
+			       TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
+			       hprefix, hprefix_bits,
+			       used, num_used, num_found, check, data);
+}
+
+static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
+{
+	/* Compare explicitly: the difference *a - *b can overflow an int. */
+	if (*a > *b)
+		return 1;
+	return *a < *b ? -1 : 0;
+}
+
+/* Extract the next num (0..64) bits of h, MSB first; advances *used by num. */
+static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
+{
+	*used += num;	/* 64-bit mask: (1U << num) is undefined once num >= 32 */
+	return num ? (h >> (64 - *used)) & (~(uint64_t)0 >> (64 - num)) : 0;
+}
+
+static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
+				      tdb_off_t off, unsigned int group_bits,
+				      uint64_t hprefix,
+				      unsigned hprefix_bits,
+				      tdb_off_t used[],
+				      size_t num_used,
+				      size_t *num_found,
+				      enum TDB_ERROR (*check)(TDB_DATA,
+							      TDB_DATA, void *),
+				      void *data)
+{
+	unsigned int g, b;
+	const tdb_off_t *hash;
+	struct tdb_used_record rec;
+	enum TDB_ERROR ecode;
+
+	hash = tdb_access_read(tdb, off,
+			       sizeof(tdb_off_t)
+			       << (group_bits + TDB_HASH_GROUP_BITS),
+			       true);
+	if (TDB_PTR_IS_ERR(hash)) {
+		return TDB_PTR_ERR(hash);
+	}
+
+	for (g = 0; g < (1 << group_bits); g++) {
+		const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
+		for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
+			unsigned int bucket, i, used_bits;
+			uint64_t h;
+			tdb_off_t *p;
+			if (group[b] == 0)
+				continue;
+
+			off = group[b] & TDB_OFF_MASK;
+			p = asearch(&off, used, num_used, off_cmp);
+			if (!p) {
+				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_LOG_ERROR,
+						   "tdb_check: Invalid offset"
+						   " %llu in hash",
+						   (long long)off);
+				goto fail;
+			}
+			/* Mark it invalid. */
+			*p ^= 1;
+			(*num_found)++;
+
+			if (hprefix_bits == 64) {
+				/* Chained entries are unordered. */
+				if (is_subhash(group[b])) {
+					ecode = TDB_ERR_CORRUPT;
+					tdb_logerr(tdb, ecode,
+						   TDB_LOG_ERROR,
+						   "tdb_check: Invalid chain"
+						   " entry subhash");
+					goto fail;
+				}
+				h = hash_record(tdb, off);
+				if (h != hprefix) {
+					ecode = TDB_ERR_CORRUPT;
+					tdb_logerr(tdb, ecode,
+						   TDB_LOG_ERROR,
+						   "check: bad hash chain"
+						   " placement"
+						   " 0x%llx vs 0x%llx",
+						   (long long)h,
+						   (long long)hprefix);
+					goto fail;
+				}
+				ecode = tdb_read_convert(tdb, off, &rec,
+							 sizeof(rec));
+				if (ecode != TDB_SUCCESS) {
+					goto fail;
+				}
+				goto check;
+			}
+
+			if (is_subhash(group[b])) {
+				uint64_t subprefix;
+				subprefix = (hprefix
+				     << (group_bits + TDB_HASH_GROUP_BITS))
+					+ g * (1 << TDB_HASH_GROUP_BITS) + b;
+
+				ecode = check_hash_record(tdb,
+					       group[b] & TDB_OFF_MASK,
+					       subprefix,
+					       hprefix_bits
+						       + group_bits
+						       + TDB_HASH_GROUP_BITS,
+					       used, num_used, num_found,
+					       check, data);
+				if (ecode != TDB_SUCCESS) {
+					goto fail;
+				}
+				continue;
+			}
+			/* A normal entry */
+
+			/* Does it belong here at all? */
+			h = hash_record(tdb, off);
+			used_bits = 0;
+			if (get_bits(h, hprefix_bits, &used_bits) != hprefix
+			    && hprefix_bits) {
+				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_LOG_ERROR,
+						   "check: bad hash placement"
+						   " 0x%llx vs 0x%llx",
+						   (long long)h,
+						   (long long)hprefix);
+				goto fail;
+			}
+
+			/* Does it belong in this group? */
+			if (get_bits(h, group_bits, &used_bits) != g) {
+				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_LOG_ERROR,
+						   "check: bad group %llu"
+						   " vs %u",
+						   (long long)h, g);
+				goto fail;
+			}
+
+			/* Are bucket bits correct? */
+			bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
+			if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
+			    != bucket) {
+				used_bits -= TDB_HASH_GROUP_BITS;
+				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_LOG_ERROR,
+						   "check: bad bucket %u vs %u",
+						   (unsigned)get_bits(h,
+							TDB_HASH_GROUP_BITS,
+							&used_bits),
+						   bucket);
+				goto fail;
+			}
+
+			/* There must not be any zero entries between
+			 * the bucket it belongs in and this one! */
+			for (i = bucket;
+			     i != b;
+			     i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
+				if (group[i] == 0) {
+					ecode = TDB_ERR_CORRUPT;
+					tdb_logerr(tdb, ecode,
+						   TDB_LOG_ERROR,
+						   "check: bad group placement"
+						   " %u vs %u",
+						   b, bucket);
+					goto fail;
+				}
+			}
+
+			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+			if (ecode != TDB_SUCCESS) {
+				goto fail;
+			}
+
+			/* Bottom bits must match header. */
+			if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
+				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_LOG_ERROR,
+						   "tdb_check: Bad hash magic"
+						   " at offset %llu"
+						   " (0x%llx vs 0x%llx)",
+						   (long long)off,
+						   (long long)h,
+						   (long long)rec_hash(&rec));
+				goto fail;
+			}
+
+		check:
+			if (check) {
+				TDB_DATA k, d;
+				const unsigned char *kptr;
+
+				kptr = tdb_access_read(tdb,
+						       off + sizeof(rec),
+						       rec_key_length(&rec)
+						       + rec_data_length(&rec),
+						       false);
+				if (TDB_PTR_IS_ERR(kptr)) {
+					ecode = TDB_PTR_ERR(kptr);
+					goto fail;
+				}
+
+				k = tdb_mkdata(kptr, rec_key_length(&rec));
+				d = tdb_mkdata(kptr + k.dsize,
+					       rec_data_length(&rec));
+				ecode = check(k, d, data);
+				tdb_access_release(tdb, kptr);
+				if (ecode != TDB_SUCCESS) {
+					goto fail;
+				}
+			}
+		}
+	}
+	tdb_access_release(tdb, hash);
+	return TDB_SUCCESS;
+
+fail:
+	tdb_access_release(tdb, hash);
+	return ecode;
+}
+
+/* Check the top-level hash tree, then that it accounted for every used
+ * record.  The callback type matches check_hash_tree() and tdb_check_(). */
+static enum TDB_ERROR check_hash(struct tdb_context *tdb, tdb_off_t used[],
+				 size_t num_used, size_t num_ftables,
+				 enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA,
+							 void *),
+				 void *data)
+{
+	/* Free tables also show up as used. */
+	size_t num_found = num_ftables;
+	enum TDB_ERROR ecode;
+
+	ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
+				TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
+				0, 0, used, num_used, &num_found,
+				check, data);
+	if (ecode == TDB_SUCCESS && num_found != num_used) {
+		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				   "tdb_check: Not all entries"
+				   " are in hash");
+	}
+	return ecode;
+}
+
+/* Check a free record: magic, ftable, extent, bucket, and prev if nonzero. */
+static enum TDB_ERROR check_free(struct tdb_context *tdb,
+				 tdb_off_t off,
+				 const struct tdb_free_record *frec,
+				 tdb_off_t prev, unsigned int ftable,
+				 unsigned int bucket)
+{
+	enum TDB_ERROR ecode;
+
+	if (frec_magic(frec) != TDB_FREE_MAGIC) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu bad magic 0x%llx",
+				  (long long)off,
+				  (long long)frec->magic_and_prev);
+	}
+	if (frec_ftable(frec) != ftable) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu bad freetable %u",
+				  (long long)off, frec_ftable(frec));
+	}
+
+	ecode = tdb->methods->oob(tdb, off
+				  + frec_len(frec)
+				  + sizeof(struct tdb_used_record),
+				  false);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	if (size_to_bucket(frec_len(frec)) != bucket) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu in wrong bucket"
+				  " (%u vs %u)",
+				  (long long)off,
+				  bucket, size_to_bucket(frec_len(frec)));
+	}
+	if (prev && prev != frec_prev(frec)) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu bad prev"
+				  " (%llu vs %llu)",
+				  (long long)off,
+				  (long long)prev, (long long)frec_prev(frec));
+	}
+	return TDB_SUCCESS;
+}
+
+static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
+				       tdb_off_t ftable_off,
+				       unsigned ftable_num,
+				       tdb_off_t fr[],
+				       size_t num_free,
+				       size_t *num_found)
+{
+	struct tdb_freetable ft;
+	tdb_off_t h;
+	unsigned int i;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
+	    || rec_key_length(&ft.hdr) != 0
+	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
+	    || rec_hash(&ft.hdr) != 0) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Invalid header on free table");
+	}
+
+	for (i = 0; i < TDB_FREE_BUCKETS; i++) {
+		tdb_off_t off, prev = 0, *p, first = 0;
+		struct tdb_free_record f;
+
+		h = bucket_off(ftable_off, i);
+		for (off = tdb_read_off(tdb, h); off; off = f.next) {
+			if (TDB_OFF_IS_ERR(off)) {
+				return off;
+			}
+			if (!first) {
+				off &= TDB_OFF_MASK;
+				first = off;
+			}
+			ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+			ecode = check_free(tdb, off, &f, prev, ftable_num, i);
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+
+			/* FIXME: Check hash bits */
+			p = asearch(&off, fr, num_free, off_cmp);
+			if (!p) {
+				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						  TDB_LOG_ERROR,
+						  "tdb_check: Invalid offset"
+						  " %llu in free table",
+						  (long long)off);
+			}
+			/* Mark it invalid. */
+			*p ^= 1;
+			(*num_found)++;
+			prev = off;
+		}
+
+		if (first) {
+			/* Now we can check first back pointer. */
+			ecode = tdb_read_convert(tdb, first, &f, sizeof(f));
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+			ecode = check_free(tdb, first, &f, prev, ftable_num, i);
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+		}
+	}
+	return TDB_SUCCESS;
+}
+
+/* Slow but rare.  NOTE(review): tread() reads 'off', not off+len; confirm. */
+tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
+{
+	size_t len;
+	enum TDB_ERROR ecode;
+
+	for (len = 0; off + len < tdb->file->map_size; len++) {
+		char c;
+		ecode = tdb->methods->tread(tdb, off, &c, 1);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+		if (c != 0 && c != 0x43)
+			break;
+	}
+	return len;
+}
+
+static enum TDB_ERROR check_linear(struct tdb_context *tdb,
+				   tdb_off_t **used, size_t *num_used,
+				   tdb_off_t **fr, size_t *num_free,
+				   uint64_t features, tdb_off_t recovery)
+{
+	tdb_off_t off;
+	tdb_len_t len;
+	enum TDB_ERROR ecode;
+	bool found_recovery = false;
+
+	for (off = sizeof(struct tdb_header);
+	     off < tdb->file->map_size;
+	     off += len) {
+		union {
+			struct tdb_used_record u;
+			struct tdb_free_record f;
+			struct tdb_recovery_record r;
+		} rec;
+		/* r is larger: only get that if we need to. */
+		ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+
+		/* If we crash after ftruncate, we can get zeroes or fill. */
+		if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
+		    || rec.r.magic ==  0x4343434343434343ULL) {
+			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+			if (recovery == off) {
+				found_recovery = true;
+				len = sizeof(rec.r) + rec.r.max_len;
+			} else {
+				len = dead_space(tdb, off);
+				if (TDB_OFF_IS_ERR(len)) {
+					return len;
+				}
+				if (len < sizeof(rec.r)) {
+					return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+							  TDB_LOG_ERROR,
+							  "tdb_check: invalid"
+							  " dead space at %zu",
+							  (size_t)off);
+				}
+
+				tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+					   "Dead space at %zu-%zu (of %zu)",
+					   (size_t)off, (size_t)(off + len),
+					   (size_t)tdb->file->map_size);
+			}
+		} else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
+			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+			if (recovery != off) {
+				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						  TDB_LOG_ERROR,
+						  "tdb_check: unexpected"
+						  " recovery record at offset"
+						  " %zu",
+						  (size_t)off);
+			}
+			if (rec.r.len > rec.r.max_len) {
+				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						  TDB_LOG_ERROR,
+						  "tdb_check: invalid recovery"
+						  " length %zu",
+						  (size_t)rec.r.len);
+			}
+			if (rec.r.eof > tdb->file->map_size) {
+				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						  TDB_LOG_ERROR,
+						  "tdb_check: invalid old EOF"
+						  " %zu", (size_t)rec.r.eof);
+			}
+			found_recovery = true;
+			len = sizeof(rec.r) + rec.r.max_len;
+		} else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
+			len = sizeof(rec.u) + frec_len(&rec.f);
+			if (off + len > tdb->file->map_size) {
+				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						  TDB_LOG_ERROR,
+						  "tdb_check: free overlength"
+						  " %llu at offset %llu",
+						  (long long)len,
+						  (long long)off);
+			}
+			/* This record should be in free lists. */
+			if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
+			    && !append(fr, num_free, off)) {
+				return tdb_logerr(tdb, TDB_ERR_OOM,
+						  TDB_LOG_ERROR,
+						  "tdb_check: tracking %zu'th"
+						  " free record.", *num_free);
+			}
+		} else if (rec_magic(&rec.u) == TDB_USED_MAGIC
+			   || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
+			   || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
+			   || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
+			uint64_t klen, dlen, extra;
+
+			/* This record is used! */
+			if (!append(used, num_used, off)) {
+				return tdb_logerr(tdb, TDB_ERR_OOM,
+						  TDB_LOG_ERROR,
+						  "tdb_check: tracking %zu'th"
+						  " used record.", *num_used);
+			}
+
+			klen = rec_key_length(&rec.u);
+			dlen = rec_data_length(&rec.u);
+			extra = rec_extra_padding(&rec.u);
+
+			len = sizeof(rec.u) + klen + dlen + extra;
+			if (off + len > tdb->file->map_size) {
+				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						  TDB_LOG_ERROR,
+						  "tdb_check: used overlength"
+						  " %llu at offset %llu",
+						  (long long)len,
+						  (long long)off);
+			}
+
+			if (len < sizeof(rec.f)) {
+				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						  TDB_LOG_ERROR,
+						  "tdb_check: too short record"
+						  " %llu at %llu",
+						  (long long)len,
+						  (long long)off);
+			}
+
+			/* Check that records have correct 0 at end (but may
+			 * not in future). */
+			if (extra && !features) {
+				const char *p;
+				char c;
+				p = tdb_access_read(tdb, off + sizeof(rec.u)
+						    + klen + dlen, 1, false);
+				if (TDB_PTR_IS_ERR(p))
+					return TDB_PTR_ERR(p);
+				c = *p;
+				tdb_access_release(tdb, p);
+
+				if (c != '\0') {
+					return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+							  TDB_LOG_ERROR,
+							  "tdb_check:"
+							  " non-zero extra"
+							  " at %llu",
+							  (long long)off);
+				}
+			}
+		} else {
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					  TDB_LOG_ERROR,
+					  "tdb_check: Bad magic 0x%llx"
+					  " at offset %zu",
+					  (long long)rec_magic(&rec.u),
+					  (size_t)off);
+		}
+	}
+
+	/* We must have found recovery area if there was one. */
+	if (recovery != 0 && !found_recovery) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: expected a recovery area at %zu",
+				  (size_t)recovery);
+	}
+
+	return TDB_SUCCESS;
+}
+
+enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
+			  enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
+			  void *data)
+{
+	tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
+	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
+	uint64_t features;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+	if (ecode != TDB_SUCCESS) {
+		return tdb->last_error = ecode;
+	}
+
+	ecode = tdb_lock_expand(tdb, F_RDLCK);
+	if (ecode != TDB_SUCCESS) {
+		tdb_allrecord_unlock(tdb, F_RDLCK);
+		return tdb->last_error = ecode;
+	}
+
+	ecode = check_header(tdb, &recovery, &features);
+	if (ecode != TDB_SUCCESS)
+		goto out;
+
+	/* First we do a linear scan, checking all records. */
+	ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features,
+			     recovery);
+	if (ecode != TDB_SUCCESS)
+		goto out;
+
+	for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
+		if (TDB_OFF_IS_ERR(ft)) {
+			ecode = ft;
+			goto out;
+		}
+		ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
+					 &num_found);
+		if (ecode != TDB_SUCCESS)
+			goto out;
+		num_ftables++;
+	}
+
+	/* FIXME: Check key uniqueness? */
+	ecode = check_hash(tdb, used, num_used, num_ftables, check, data);
+	if (ecode != TDB_SUCCESS)
+		goto out;
+
+	if (num_found != num_free) {
+		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				   "tdb_check: Not all entries are in"
+				   " free table");
+	}
+
+out:
+	tdb_allrecord_unlock(tdb, F_RDLCK);
+	tdb_unlock_expand(tdb, F_RDLCK);
+	free(fr);
+	free(used);
+	return tdb->last_error = ecode;
+}
diff --git a/lib/tdb2/doc/TDB1_porting.txt b/lib/tdb2/doc/TDB1_porting.txt
new file mode 100644
index 0000000000..90ba249738
--- /dev/null
+++ b/lib/tdb2/doc/TDB1_porting.txt
@@ -0,0 +1,44 @@
+Interface differences between TDB1 and TDB2.
+
+- tdb2 uses 'struct tdb_data', tdb1 uses 'struct TDB_DATA'.  Use the
+  TDB_DATA typedef if you want portability between the two.
+
+- tdb2 functions return 0 on success, and a negative error on failure,
+  whereas tdb1 functions returned 0 on success, and -1 on failure.
+  tdb1 then used tdb_error() to determine the error; this is also
+  supported in tdb2 to ease backwards compatibility, though the other
+  form is preferred.
+
+- tdb2's tdb_fetch() returns an error, tdb1's returned the data directly
+  (or tdb_null, and you were supposed to check tdb_error() to find out why).
+
+- tdb2's tdb_nextkey() frees the old key's dptr, in tdb1 you needed to do
+  this manually.
+
+- tdb1's tdb_open/tdb_open_ex took an explicit hash size.  tdb2's hash table
+  resizes as required.
+
+- tdb2 uses a linked list of attribute structures to implement logging and
+  alternate hashes.  tdb1 used tdb_open_ex, which was not extensible.
+
+- tdb2 does locking on read-only databases (ie. O_RDONLY passed to tdb_open).
+  tdb1 did not: use the TDB_NOLOCK flag if you want to suppress locking.
+
+- tdb2's log function is simpler than tdb1's log function.  The string is
+  already formatted, and it takes an enum tdb_log_level (not a
+  tdb_debug_level), which has only three values: TDB_LOG_ERROR,
+  TDB_LOG_USE_ERROR and TDB_LOG_WARNING.
+
+- tdb2 provides tdb_deq() for comparing two struct tdb_data.
+
+- tdb2's tdb_name() returns a copy of the name even for TDB_INTERNAL dbs.
+
+- tdb2 does not need tdb_reopen() or tdb_reopen_all().  If you call
+  fork() during certain operations, the child should close the
+  tdb, or complete the operations before continuing to use the tdb:
+
+	tdb_transaction_start(): child must tdb_transaction_cancel()
+	tdb_lockall(): child must call tdb_unlockall()
+	tdb_lockall_read(): child must call tdb_unlockall_read()
+	tdb_chainlock(): child must call tdb_chainunlock()
+	tdb_parse() callback: child must return from tdb_parse()
diff --git a/lib/tdb2/doc/design-1.3.txt b/lib/tdb2/doc/design-1.3.txt
new file mode 100644
index 0000000000..f81ecf7885
--- /dev/null
+++ b/lib/tdb2/doc/design-1.3.txt
@@ -0,0 +1,1049 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+27-April-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases
+heading towards the 4G limit, that must change. This required
+breakage provides an opportunity to revisit TDB's other design
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code has led
+to some organic growth of the API, as well as some compromises on
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+|   1999    |      13        |              1195              |
++-----------+----------------+--------------------------------+
+|   2000    |      24        |              1725              |
++-----------+----------------+--------------------------------+
+|   2001    |      32        |              2228              |
++-----------+----------------+--------------------------------+
+|   2002    |      35        |              2481              |
++-----------+----------------+--------------------------------+
+|   2003    |      35        |              2552              |
++-----------+----------------+--------------------------------+
+|   2004    |      40        |              2584              |
++-----------+----------------+--------------------------------+
+|   2005    |      38        |              2647              |
++-----------+----------------+--------------------------------+
+|   2006    |      52        |              3754              |
++-----------+----------------+--------------------------------+
+|   2007    |      66        |              4398              |
++-----------+----------------+--------------------------------+
+|   2008    |      71        |              4768              |
++-----------+----------------+--------------------------------+
+|   2009    |      73        |              5715              |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+    TDB_ATTRIBUTE_LOG = 0,
+
+    TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+    enum tdb_attribute attr;
+
+    union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
+*/
+
+    tdb_log_func log_fn;
+
+    void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
+*/
+
+    tdb_hash_func hash_fn;
+
+    void *hash_private;
+
+};
+
+union tdb_attribute {
+
+    struct tdb_attribute_base base;
+
+    struct tdb_attribute_log log;
+
+    struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands
+the size of the union.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
+and it was thought that it was important to guarantee that all
+records which exist at the start and end of the traversal would
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not
+work anyway for records which are altered (in particular, those
+which are expanded may be effectively deleted and re-added behind
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes
+occur during your traversal, otherwise you will see some subset.
+You can prevent changes by using a transaction or the locking
+API.
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not
+allowing them. Various paths in the Samba codebase assume that
+transactions will nest, and in a sense they can: the operation is
+only committed to disk when the outer transaction is committed.
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer
+  transaction commit to fail, and will not undo any operations
+  since the inner transaction began. This problem is soluble with
+  some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer
+  transaction. This is desirable in the way which Samba's
+  database initialization code uses transactions, but could be a
+  surprise to any users expecting a successful transaction commit
+  to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(),
+with the default currently that nested transactions are allowed.
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise”
+behavior of disallowing nested transactions should become the
+default. Additionally, it seems the outer transaction is the only
+code which knows whether inner transactions should be allowed, so
+a flag to indicate this could be added to tdb_transaction_start.
+However, this behavior can be simulated with a wrapper which uses
+tdb_add_flags() and tdb_remove_flags(), so the API should not be
+expanded for this relatively-obscure case.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash
+function to use, but does not check that all other processes
+accessing this tdb are using the same hash function. The result
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]),
+two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
+  In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, though there may be some
+benefit to adding a call to detect when a tdb_context is shared,
+to allow others to create such an API.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in
+[TDB-Files-Cannot]).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a
+great deal of churn; we are better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+  Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+  lock. Initially, this is always false. If the child has it, return 0.
+  Otherwise, try to obtain it in non-blocking mode. If that
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gave atomicity guarantees;
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct
+  tdb_context'; the definition isn't visible to the API user
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA;
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct
+  tdb_data'.
+
+  enum TDB_ERROR Similarly, this would normally be enum
+  tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like
+myself, but if it were done the existing ones should be kept.
+There is little point forcing a purely cosmetic change upon tdb
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
+  Private Pointer
+
+For API compatibility reasons, the logging function needs to call
+tdb_get_logging_private() to retrieve the pointer registered by
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to
+break the API/ABI.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after
+[tdb_log_func-Doesnt-Take] is resolved), tdb_parse_record,
+tdb_traverse, tdb_traverse_read and tdb_check all take void * and
+must internally convert it to the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions
+which give a warning when used on gcc and the types of the
+callback and its private argument differ. Unsupported compilers
+will not give a warning, which is no worse than now. In addition,
+the callbacks become clearer, as they need not use void * for
+their parameter.
+
+See CCAN's typesafe_cb module at
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
+  tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
+file should be cleared if the caller discovers it is the only
+process with the TDB open. However, if any caller does not
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or
+otherwise closes the tdb) before the child calls tdb_reopen_all()
+to establish the lock used to indicate the TDB is opened by
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
+  Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
+never conflict in normal tdb usage, they do add substantial
+overhead for most fcntl lock implementations when the kernel
+scans to detect if a lock conflict exists. This is often a single
+linked list, making the time to acquire and release a fcntl lock
+O(N) where N is the number of processes with the TDB open, not
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients
+sitting idle, and thus they have weaned themselves off the
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this
+optimization: if the parent process will outlive the child, the
+child does not need the ACTIVE_LOCK. This is a workaround for
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers
+tend to know when they are initializing for the first time and
+can simply unlink the old tdb at that point.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create old-version files if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits), they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!)
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long), that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation. There are
+three details which become important:
+
+1. On encountering a full bucket, we use the next bucket.
+
+2. Extra hash bits are stored with the offset, to reduce
+  comparisons.
+
+3. A marker entry is used on deleting an entry.
+
+The doubling of the table must be done under a transaction; we
+will not reduce it on deletion, so it will be an unusual case. It
+will either be placed at the head (other entries will be moved
+out of the way so we can expand), or we could have a pointer in the
+header to the current hashtable location, but that pointer would
+have to be read frequently to check for hashtable moves.
+
+The locking for this is slightly more complex than the chained
+case; we currently have one lock per bucket, and that means we
+would need to expand the lock if we overflow to the next bucket.
+The frequency of such collisions will affect our locking
+heuristics: we can always lock more buckets than we need.
+
+One possible optimization is to only re-check the hash size on an
+insert or a lookup miss.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+  return it. The slack will be turned into a new free record if
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones
+    (similar to steps 4 and below, but the lock is only obtained
+    once).
+
+  (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is
+  free; if so, enlarge the current block and remove that block
+  from the free list. This was disabled, as removal from the free
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+  reason, each block has a 32-bit tailer which indicates its
+  length. If it is free, expand it to cover our new block and
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+There are various benefits in using per-size free lists (see
+[sub:TDB-Becomes-Fragmented]) but it's not clear this would reduce
+contention in the common case where all processes are
+allocating/freeing the same size. Thus we almost certainly need to
+divide in other ways: the most obvious is to divide the file into
+zones, and use a free list (or set of free lists) for each. This
+approximates address ordering.
+
+Note that this means we need to split the free lists when we
+expand the file; this is probably acceptable when we double the
+hash table size, since that is such an expensive operation
+already. In the case of increasing the file size, there is an
+optimization we can use: if we use M in the formula above as the
+file size rounded up to the next power of 2, we only need
+reshuffle free lists when the file size crosses a power of 2
+boundary, and reshuffling the free lists is trivial: we simply
+merge every consecutive pair of free lists.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct zone.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone (we didn't have a lock, sizes could have
+  changed): relock if necessary.
+
+4. Place the freed entry in the list for that zone.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a zone: either the zone we last freed into, or one based on a
+  “random” number.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone: relock if necessary.
+
+4. If the top entry is large enough, remove it from the list and
+  return it.
+
+5. Otherwise, coalesce entries in the list.
+
+  (a)
+
+  (b)
+
+  (c)
+
+  (d)
+
+6. If there was no entry large enough, unlock the list and try
+  the next zone.
+
+7.
+
+8.
+
+9. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see [sub:Records-Incur-A].
+
+
+
+I anticipate that the number of entries in each free zone would
+be small, but it might be worth using one free entry to hold
+pointers to the others for cache efficiency.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an “expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as a best-fit address-order allocator. The other end
+of the spectrum would be to use a bump allocator (very fast and
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic;   /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+  and data_len (it accounts for wasted or overallocated length in
+  the record). Since the record length is always a multiple of 8,
+  we can conveniently fit it in 32 bits (representing up to 35
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+  restrict 'data_len' to 32 bits, but instead we can combine the
+  two into one 64-bit field and use a 5 bit value which
+  indicates at what bit to divide the two. Keys are unlikely to
+  scale as fast as data, so I'm assuming a maximum key size of 32
+  bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+  this is diminishing returns after a handful of bits (at 10
+  bits, it reduces 99.9% of false memcmp). As an aside, as the
+  lower bits are already incorporated in the hash table
+  resolution, the upper bits should be used here.
+
+5. 'magic' does not need to be enlarged: it currently reflects
+  one of 5 values (used, free, dead, recovery, and
+  unused_recovery). It is useful for quick sanity checking
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+  the right can find the header to check if this block is free).
+  This can be replaced by a single 'free' bit in the header of
+  the following block (and the tailer only exists in free
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t magic : 16,
+
+                 prev_is_free: 1,
+
+                 key_data_divide: 5,
+
+                 top_hash: 10;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint32_t free_magic;
+
+        uint64_t total_length;
+
+        ...
+
+        uint64_t tailer;
+
+};
+
+
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively expensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+
+
+
+
+
+
+
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+  checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+  header (identified using the transaction counter) with the
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.9 TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database”.
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free, but use
+of presence in metadata elsewhere), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause all the transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+We could solve a small part of the problem by providing read-only
+transactions. These would allow one write transaction to begin,
+but it could not commit until all r/o transactions are done. This
+would require a new RO_TRANSACTION_LOCK, which would be upgraded
+on commit.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock. There is a method of auto-cleanup under Linux, but
+it's not supported by other operating systems. We could
+reintroduce a clear-if-first-style lock and sweep for dead
+futexes on open, but that wouldn't help the normal case of one
+concurrent opener dying. Increasingly elaborate repair schemes
+could be considered, but they require an ABI change (everyone
+must use them) anyway, so there's no need to do this at the same
+time as everything else.
diff --git a/lib/tdb2/doc/design.lyx b/lib/tdb2/doc/design.lyx
new file mode 100644
index 0000000000..0a1d6a14bc
--- /dev/null
+++ b/lib/tdb2/doc/design.lyx
@@ -0,0 +1,2689 @@
+#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author ""
+\author ""
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+17-March-2011
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code have led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+    enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+    union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete; the nesting flag has been removed.
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Unknown flags cause tdb_open() to fail as well, so they can be detected
+ at runtime.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn, but fortunately most functions return 0 on success and -1
+ on error: we can change these to return 0 on success and a negative error
+ code on error, and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+ Alternatively, a hooking mechanism similar to that proposed for
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete; API has been changed but thread safety has not been implemented.
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If so, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain a
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Record Headers Are Not Expandable
+\end_layout
+
+\begin_layout Standard
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc hierarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect the old version, and even create them
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits); they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or table of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct free list.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the list (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a free table; usually the previous one.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list. If there was no entry large enough,
+ unlock the list and try the next largest list.
+\end_layout
+
+\begin_layout Enumerate
+If no list has an entry which meets our needs, try the next free table.
+\end_layout
+
+\begin_layout Enumerate
+If no zone satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Standard
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic;   /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+                union {
+\end_layout
+
+\begin_layout LyX-Code
+                        char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        struct {
+\end_layout
+
+\begin_layout LyX-Code
+                                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        }
+\end_layout
+
+\begin_layout LyX-Code
+                        uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+                }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t used_magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 11;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_magic: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                   prev : 56;
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                 total_length : 56;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free, but use of presence in metadata elsewhere),
+ and a pointer to the hash table, we could create an entirely new commit
+ without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause all
+ transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (i.e.
+ syncless) which would mean that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers is available in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\end_body
+\end_document
diff --git a/lib/tdb2/doc/design.lyx,v b/lib/tdb2/doc/design.lyx,v
new file mode 100644
index 0000000000..13e6387f7f
--- /dev/null
+++ b/lib/tdb2/doc/design.lyx,v
@@ -0,0 +1,4679 @@
+head	1.13;
+access;
+symbols;
+locks; strict;
+comment	@# @;
+
+
+1.13
+date	2011.03.01.11.46.54;	author rusty;	state Exp;
+branches;
+next	1.12;
+
+1.12
+date	2010.12.01.12.20.49;	author rusty;	state Exp;
+branches;
+next	1.11;
+
+1.11
+date	2010.12.01.11.55.20;	author rusty;	state Exp;
+branches;
+next	1.10;
+
+1.10
+date	2010.09.14.00.33.57;	author rusty;	state Exp;
+branches;
+next	1.9;
+
+1.9
+date	2010.09.09.07.25.12;	author rusty;	state Exp;
+branches;
+next	1.8;
+
+1.8
+date	2010.09.02.02.29.05;	author rusty;	state Exp;
+branches;
+next	1.7;
+
+1.7
+date	2010.09.01.10.58.12;	author rusty;	state Exp;
+branches;
+next	1.6;
+
+1.6
+date	2010.08.02.00.21.43;	author rusty;	state Exp;
+branches;
+next	1.5;
+
+1.5
+date	2010.08.02.00.21.16;	author rusty;	state Exp;
+branches;
+next	1.4;
+
+1.4
+date	2010.05.10.13.09.11;	author rusty;	state Exp;
+branches;
+next	1.3;
+
+1.3
+date	2010.05.10.11.58.37;	author rusty;	state Exp;
+branches;
+next	1.2;
+
+1.2
+date	2010.05.10.05.35.13;	author rusty;	state Exp;
+branches;
+next	1.1;
+
+1.1
+date	2010.05.04.02.29.16;	author rusty;	state Exp;
+branches;
+next	;
+
+
+desc
+@First draft
+@
+
+
+1.13
+log
+@Thread-safe API
+@
+text
+@#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author "Rusty Russell,,,"
+\author ""
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+1-December-2010
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code has led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+    enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+    union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979572
+Incomplete; nesting flag is still defined as per tdb1.
+\change_inserted 0 1298979584
+Complete; the nesting flag has been removed.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+ TDB_VOLATILE still defined, but implementation should fail on unknown flags
+ to be future-proof.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn
+\change_inserted 0 1298979557
+, but fortunately most functions return 0 on success and -1 on error: we
+ can change these to return 0 on success and a negative error code on error,
+ and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979438
+; we are better to guarantee that the tdb_errcode is per-thread so the current
+ programming model can be maintained.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979438
+This requires dynamic per-thread allocations, which is awkward with POSIX
+ threads (pthread_key_create space is limited and we cannot simply allocate
+ a key for every TDB).
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+ Alternatively, a hooking mechanism similar to that proposed for
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete
+\change_inserted 0 1298979681
+; API has been changed but thread safety has not been implemented.
+\change_deleted 0 1298979669
+.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If the child does hold it, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gives atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979699
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
+\change_inserted 0 1298979700
+Complete.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain a
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+Record Headers Are Not Expandable
+\end_layout
+
+\begin_layout Standard
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc hierarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979837
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+\change_inserted 0 1298979837
+Complete.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect old-version files, and even create them
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits); they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or table of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct free list.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the list (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a free table; usually the previous one.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list.
+ If there was no entry large enough, unlock the list and try the next largest
+ list.
+\end_layout
+
+\begin_layout Enumerate
+If no list has an entry which meets our needs, try the next free table.
+\end_layout
+
+\begin_layout Enumerate
+If no free table satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Standard
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as a best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic;   /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+                union {
+\end_layout
+
+\begin_layout LyX-Code
+                        char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        struct {
+\end_layout
+
+\begin_layout LyX-Code
+                                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        }
+\end_layout
+
+\begin_layout LyX-Code
+                        uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+                }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t used_magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 11;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_magic: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                   prev : 56;
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                 total_length : 56;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291206079
+
+\change_unchanged
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free; instead, presence in metadata elsewhere must be used),
+ and a pointer to the hash table, we could create an entirely new commit
+ without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause
+ all transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (ie.
+ syncless) which meant that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers were available in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\end_body
+\end_document
+@
+
+
+1.12
+log
+@Add status, some fixes, linked freelists.
+@
+text
+@d53 1
+a53 7
+
+\change_deleted 0 1291204535
+14-September
+\change_inserted 0 1291204533
+1-December
+\change_unchanged
+-2010
+a580 2
+\change_inserted 0 1291204563
+
+a583 2
+
+\change_inserted 0 1291204572
+a587 2
+
+\change_inserted 0 1291204573
+a588 2
+\change_unchanged
+
+a629 2
+\change_inserted 0 1291204588
+
+a632 2
+
+\change_inserted 0 1291204588
+a636 2
+
+\change_inserted 0 1291204631
+a639 2
+\change_unchanged
+
+a693 2
+\change_inserted 0 1291204639
+
+a696 2
+
+\change_inserted 0 1291204640
+d702 1
+a702 1
+\change_inserted 0 1291204665
+d704 2
+a728 2
+\change_inserted 0 1291204671
+
+a731 2
+
+\change_inserted 0 1291204671
+a735 2
+
+\change_inserted 0 1291204673
+a736 2
+\change_unchanged
+
+a780 2
+\change_inserted 0 1291204731
+
+a783 2
+
+\change_inserted 0 1291204732
+a787 2
+
+\change_inserted 0 1291204779
+a790 2
+\change_unchanged
+
+a842 2
+\change_inserted 0 1291204830
+
+a845 2
+
+\change_inserted 0 1291204831
+a849 2
+
+\change_inserted 0 1291204834
+a850 2
+\change_unchanged
+
+d879 9
+a887 2
+ deal of churn; we are better to guarantee that the tdb_errcode is per-thread
+ so the current programming model can be maintained.
+d891 9
+d903 2
+a922 2
+\change_inserted 0 1291204847
+
+a925 2
+
+\change_inserted 0 1291204847
+d930 5
+a934 3
+
+\change_inserted 0 1291204852
+Incomplete.
+a1051 2
+\change_inserted 0 1291204881
+
+a1054 2
+
+\change_inserted 0 1291204881
+a1058 2
+
+\change_inserted 0 1291204885
+a1059 2
+\change_unchanged
+
+a1140 2
+\change_inserted 0 1291204898
+
+a1143 2
+
+\change_inserted 0 1291204898
+a1147 2
+
+\change_inserted 0 1291204901
+a1148 2
+\change_unchanged
+
+a1224 2
+\change_inserted 0 1291204908
+
+a1227 2
+
+\change_inserted 0 1291204908
+a1231 2
+
+\change_inserted 0 1291204908
+a1232 2
+\change_unchanged
+
+a1271 2
+\change_inserted 0 1291204917
+
+a1274 2
+
+\change_inserted 0 1291204917
+a1278 2
+
+\change_inserted 0 1291204920
+a1279 2
+\change_unchanged
+
+a1316 2
+\change_inserted 0 1291204927
+
+a1319 2
+
+\change_inserted 0 1291204928
+d1325 1
+a1325 1
+\change_inserted 0 1291204942
+d1327 2
+a1381 2
+\change_inserted 0 1291205003
+
+a1384 2
+
+\change_inserted 0 1291205004
+a1388 2
+
+\change_inserted 0 1291205007
+a1411 2
+\change_inserted 0 1291205019
+
+a1414 2
+
+\change_inserted 0 1291205019
+a1418 2
+
+\change_inserted 0 1291205023
+a1419 2
+\change_unchanged
+
+a1465 2
+\change_inserted 0 1291205029
+
+a1468 2
+
+\change_inserted 0 1291205029
+a1472 2
+
+\change_inserted 0 1291206020
+a1473 2
+\change_unchanged
+
+a1528 2
+\change_inserted 0 1291205043
+
+a1531 2
+
+\change_inserted 0 1291205043
+d1537 1
+a1537 1
+\change_inserted 0 1291205057
+d1539 2
+a1589 2
+\change_inserted 0 1291205062
+
+a1592 2
+
+\change_inserted 0 1291205062
+a1596 2
+
+\change_inserted 0 1291205062
+a1597 2
+\change_unchanged
+
+a1626 2
+\change_inserted 0 1291205072
+
+a1629 2
+
+\change_inserted 0 1291205073
+a1633 2
+
+\change_inserted 0 1291205073
+a1634 2
+\change_unchanged
+
+a1674 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a1699 2
+\change_inserted 0 1291205079
+
+a1702 2
+
+\change_inserted 0 1291205080
+a1706 2
+
+\change_inserted 0 1291205080
+a1707 2
+\change_unchanged
+
+a1833 2
+\change_inserted 0 1291205090
+
+d1869 2
+a1870 7
+ is to divide the file into zones, and using a free list (or
+\change_inserted 0 1291205498
+table
+\change_deleted 0 1291205497
+set
+\change_unchanged
+ of free lists) for each.
+a1871 2
+\change_inserted 0 1291205203
+
+a1874 2
+
+\change_inserted 0 1291205358
+a1890 21
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205198
+Note that this means we need to split the free lists when we expand the
+ file; this is probably acceptable when we double the hash table size, since
+ that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary,
+\emph on
+and
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+\change_unchanged
+
+d1899 1
+a1899 7
+Identify the correct
+\change_inserted 0 1291205366
+free list
+\change_deleted 0 1291205364
+zone
+\change_unchanged
+.
+d1907 2
+a1908 7
+Re-check the
+\change_inserted 0 1291205372
+list
+\change_deleted 0 1291205371
+zone
+\change_unchanged
+ (we didn't have a lock, sizes could have changed): relock if necessary.
+d1912 1
+a1912 5
+Place the freed entry in the list
+\change_deleted 0 1291205382
+ for that zone
+\change_unchanged
+.
+d1921 1
+a1921 15
+Pick a
+\change_deleted 0 1291205403
+zone either the zone we last freed into, or based on a
+\begin_inset Quotes eld
+\end_inset
+
+random
+\begin_inset Quotes erd
+\end_inset
+
+ number.
+\change_inserted 0 1291205411
+free table; usually the previous one.
+\change_unchanged
+
+a1925 10
+\change_deleted 0 1291205432
+
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1291205428
+Re-check the zone: relock if necessary.
+\change_unchanged
+
+d1934 1
+a1934 7
+ unlock the list and try the next
+\change_inserted 0 1291205455
+largest list
+\change_deleted 0 1291205452
+zone.
+\change_inserted 0 1291205457
+
+a1937 2
+
+\change_inserted 0 1291205476
+a1938 2
+\change_unchanged
+
+a1966 2
+\change_inserted 0 1291205542
+
+a1969 2
+
+\change_inserted 0 1291205591
+a1971 70
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205539
+I anticipate that the number of entries in each free zone would be small,
+ but it might be worth using one free entry to hold pointers to the others
+ for cache efficiency.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205534
+\begin_inset CommandInset label
+LatexCommand label
+name "freelist-in-zone"
+
+\end_inset
+
+If we want to avoid locking complexity (enlarging the free lists when we
+ enlarge the file) we could place the array of free lists at the beginning
+ of each zone.
+ This means existing array lists never move, but means that a record cannot
+ be larger than a zone.
+ That in turn implies that zones should be variable sized (say, power of
+ 2), which makes the question
+\begin_inset Quotes eld
+\end_inset
+
+what zone is this record in?
+\begin_inset Quotes erd
+\end_inset
+
+ much harder (and
+\begin_inset Quotes eld
+\end_inset
+
+pick a random zone
+\begin_inset Quotes erd
+\end_inset
+
+, but that's less common).
+ It could be done with as few as 4 bits from the record header.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Using
+\begin_inset Formula $2^{16+N*3}$
+\end_inset
+
+means 0 gives a minimal 65536-byte zone, 15 gives the maximal
+\begin_inset Formula $2^{61}$
+\end_inset
+
+ byte zone.
+ Zones range in factor of 8 steps.
+ Given the zone size for the zone the current record is in, we can determine
+ the start of the zone.
+\end_layout
+
+\end_inset
+
+
+\change_inserted 0 1291205139
+
+d2218 1
+a2218 5
+        uint32_t
+\change_inserted 0 1291205758
+used_
+\change_unchanged
+magic : 16,
+a2222 4
+\change_deleted 0 1291205693
+                 prev_is_free: 1,
+\change_unchanged
+
+d2230 1
+a2230 7
+                 top_hash: 1
+\change_inserted 0 1291205704
+1
+\change_deleted 0 1291205704
+0
+\change_unchanged
+;
+d2254 1
+a2254 9
+        uint
+\change_inserted 0 1291205725
+64
+\change_deleted 0 1291205723
+32
+\change_unchanged
+_t
+\change_inserted 0 1291205753
+free_magic: 8,
+a2257 2
+
+\change_inserted 0 1291205746
+a2262 24
+\change_deleted 0 1291205749
+free_magic;
+\change_unchanged
+
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t
+\change_inserted 0 1291205786
+free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+
+\change_inserted 0 1291205788
+
+\change_unchanged
+total_length
+\change_inserted 0 1291205792
+ : 56
+\change_deleted 0 1291205790
+;
+\change_unchanged
+
+d2266 1
+a2266 7
+        uint64_t
+\change_deleted 0 1291205801
+prev,
+\change_unchanged
+next;
+\change_deleted 0 1291205811
+
+d2270 1
+a2270 3
+
+\change_deleted 0 1291205811
+        ...
+d2274 1
+a2274 5
+
+\change_deleted 0 1291205808
+        uint64_t tailer
+\change_unchanged
+;
+d2283 5
+a2287 16
+\change_deleted 0 1291205827
+We might want to take some bits from the used record's top_hash (and the
+ free record which has 32 bits of padding to spare anyway) if we use variable
+ sized zones.
+ See
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "freelist-in-zone"
+
+\end_inset
+
+.
+
+\change_inserted 0 1291205885
+ Note that by limiting valid offsets to 56 bits, we can pack everything
+ we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
+a2290 2
+
+\change_inserted 0 1291205886
+a2294 2
+
+\change_inserted 0 1291205886
+a2295 2
+\change_unchanged
+
+a2385 2
+\change_inserted 0 1291205894
+
+a2388 2
+
+\change_inserted 0 1291205894
+a2392 2
+
+\change_inserted 0 1291205902
+a2393 2
+\change_unchanged
+
+a2415 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a2445 2
+\change_inserted 0 1291205910
+
+a2448 2
+
+\change_inserted 0 1291205910
+a2452 2
+
+\change_inserted 0 1291205914
+a2453 2
+\change_unchanged
+
+a2485 2
+\change_inserted 0 1291205919
+
+a2488 2
+
+\change_inserted 0 1291205919
+a2492 2
+
+\change_inserted 0 1291205922
+a2493 2
+\change_unchanged
+
+a2533 2
+\change_inserted 0 1291205929
+
+a2536 2
+
+\change_inserted 0 1291205929
+a2540 2
+
+\change_inserted 0 1291205929
+a2541 2
+\change_unchanged
+
+a2578 2
+\change_inserted 0 1291205932
+
+a2581 2
+
+\change_inserted 0 1291205933
+a2585 2
+
+\change_inserted 0 1291205933
+a2586 2
+\change_unchanged
+
+a2724 2
+\change_inserted 0 1291205944
+
+a2727 2
+
+\change_inserted 0 1291205945
+a2731 2
+
+\change_inserted 0 1291205948
+a2732 2
+\change_unchanged
+
+@
+
+
+1.11
+log
+@Merge changes
+@
+text
+@d53 7
+a59 1
+14-September-2010
+d587 16
+d644 18
+d716 16
+d753 16
+d813 18
+d883 16
+d953 16
+d1084 16
+d1181 16
+d1273 16
+d1328 16
+d1381 16
+d1447 19
+a1465 2
+ if older code (which doesn't understand the feature) writes to the database.Reco
+rd Headers Are Not Expandible
+d1484 16
+d1546 16
+d1617 16
+d1680 16
+d1725 16
+d1810 16
+d1951 8
+a1958 3
+Proposed SolutionThe first step is to remove all the current heuristics,
+ as they obviously interact, then examine them once the lock contention
+ is addressed.
+d1989 7
+a1995 2
+ is to divide the file into zones, and using a free list (or set of free
+ lists) for each.
+d1997 2
+d2002 25
+d2039 2
+d2049 7
+a2055 1
+Identify the correct zone.
+d2063 7
+a2069 2
+Re-check the zone (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+d2073 5
+a2077 1
+Place the freed entry in the list for that zone.
+d2086 3
+a2088 1
+Pick a zone either the zone we last freed into, or based on a
+d2097 4
+d2105 2
+d2110 2
+d2113 2
+d2123 15
+a2137 1
+ unlock the list and try the next zone.
+d2166 11
+d2180 2
+d2185 2
+d2190 2
+d2223 1
+a2223 1
+status open
+d2243 2
+d2491 5
+a2495 1
+        uint32_t magic : 16,
+d2499 2
+d2502 2
+d2511 7
+a2517 1
+                 top_hash: 10;
+d2541 29
+a2569 1
+        uint32_t free_magic;
+d2573 11
+a2583 1
+        uint64_t total_length;
+d2587 7
+a2593 1
+        uint64_t prev, next;
+d2597 2
+d2603 5
+a2607 1
+        uint64_t tailer;
+d2615 2
+d2628 18
+d2736 16
+d2808 16
+d2856 16
+d2912 16
+d2965 16
+d3119 16
+@
+
+
+1.10
+log
+@Tracing attribute, talloc support.
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
+d53 1
+a53 7
+
+\change_deleted 0 1283307542
+26-July
+\change_inserted 0 1284423485
+14-September
+\change_unchanged
+-2010
+a472 2
+\change_inserted 0 1284422789
+
+a479 2
+\change_unchanged
+
+a838 2
+
+\change_inserted 0 1284016998
+a846 2
+\change_unchanged
+
+a1194 2
+\change_inserted 0 1284015637
+
+a1197 2
+
+\change_inserted 0 1284015716
+a1201 2
+
+\change_inserted 0 1284015906
+a1210 2
+
+\change_inserted 0 1284015637
+a1214 2
+
+\change_inserted 0 1284016114
+a1227 2
+
+\change_inserted 0 1284016149
+a1232 2
+
+\change_inserted 0 1284016639
+a1237 2
+
+\change_inserted 0 1284016821
+a1243 2
+
+\change_inserted 0 1284016803
+d1245 2
+a1246 9
+ if older code (which doesn't understand the feature) writes to the database.
+\change_deleted 0 1284016101
+
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284015634
+Record Headers Are Not Expandible
+a1249 2
+
+\change_inserted 0 1284015634
+a1254 2
+
+\change_inserted 0 1284015634
+a1258 2
+
+\change_inserted 0 1284422552
+a1267 2
+
+\change_inserted 0 1284422568
+a1271 2
+
+\change_inserted 0 1284422646
+a1276 2
+
+\change_inserted 0 1284422656
+a1280 2
+
+\change_inserted 0 1284423065
+a1305 2
+
+\change_inserted 0 1284423042
+a1310 2
+\change_unchanged
+
+a1457 2
+
+\change_inserted 0 1283336713
+a1463 2
+
+\change_unchanged
+d1482 2
+d1485 1
+a1485 51
+\change_deleted 0 1283307675
+There are three details which become important:
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+On encountering a full bucket, we use the next bucket.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+Extra hash bits are stored with the offset, to reduce comparisons.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+A marker entry is used on deleting an entry.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The doubling of the table must be done under a transaction; we will not
+ reduce it on deletion, so it will be an unusual case.
+ It will either be placed at the head (other entries will be moved out the
+ way so we can expand).
+ We could have a pointer in the header to the current hashtable location,
+ but that pointer would have to be read frequently to check for hashtable
+ moves.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The locking for this is slightly more complex than the chained case; we
+ currently have one lock per bucket, and that means we would need to expand
+ the lock if we overflow to the next bucket.
+ The frequency of such collisions will effect our locking heuristics: we
+ can always lock more buckets than we need.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+One possible optimization is to only re-check the hash size on an insert
+ or a lookup miss.
+
+\change_inserted 0 1283307770
+a1492 2
+
+\change_inserted 0 1283336187
+a1500 2
+
+\change_inserted 0 1283336586
+a1510 2
+\change_unchanged
+
+d1636 3
+a1638 8
+Proposed Solution
+\change_deleted 0 1283336858
+
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+a1647 2
+\change_inserted 0 1283336910
+
+a1650 2
+
+\change_inserted 0 1283337052
+a1655 2
+\change_unchanged
+
+a1776 2
+\change_inserted 0 1283309850
+
+a1779 2
+
+\change_inserted 0 1283337216
+a1813 2
+
+\change_inserted 0 1284424151
+a1825 2
+\change_unchanged
+
+a1830 2
+\change_unchanged
+
+a2031 2
+
+\change_inserted 0 1283336739
+a2040 2
+\change_unchanged
+
+a2117 2
+\change_inserted 0 1283337133
+
+a2120 2
+
+\change_inserted 0 1283337139
+a2121 2
+\change_unchanged
+
+a2136 2
+
+\change_inserted 0 1283337235
+a2147 2
+\change_unchanged
+
+d2251 1
+a2251 7
+Proposed Solution
+\change_deleted 0 1284423472
+
+\end_layout
+
+\begin_layout Standard
+None.
+d2261 1
+a2261 1
+\change_inserted 0 1284423891
+d2263 1
+a2263 4
+\change_deleted 0 1284423891
+.
+
+\change_inserted 0 1284423901
+a2271 2
+\change_unchanged
+
+a2293 2
+\change_inserted 0 1284423495
+
+a2312 2
+
+\change_inserted 0 1284424201
+d2321 1
+a2321 3
+
+\change_unchanged
+We could solve a small part of the problem by providing read-only transactions.
+a2505 2
+\change_inserted 0 1284423555
+
+a2508 2
+
+\change_inserted 0 1284423617
+a2512 2
+
+\change_inserted 0 1284423719
+a2519 2
+
+\change_inserted 0 1284423864
+a2530 2
+
+\change_inserted 0 1284423850
+a2540 2
+\change_unchanged
+
+@
+
+
+1.9
+log
+@Extension mechanism.
+@
+text
+@d56 2
+a57 2
+\change_inserted 0 1284016854
+9-September
+d479 11
+d1303 1
+a1303 1
+\change_inserted 0 1284016847
+d1310 56
+d1945 1
+a1945 1
+\change_inserted 0 1283310945
+d1956 2
+d2402 2
+d2416 4
+d2421 12
+d2455 2
+d2476 12
+d2673 47
+@
+
+
+1.8
+log
+@Remove bogus footnote
+@
+text
+@d56 2
+a57 2
+\change_inserted 0 1283307544
+1-September
+d838 12
+d1198 103
+@
+
+
+1.7
+log
+@Moving hash table does not work.
+@
+text
+@a1436 12
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+
+\change_inserted 0 1283336450
+If we make the hash offsets zone-relative, then this only restricts the
+ zone size, not the overall database size.
+\end_layout
+
+\end_inset
+
+@
+
+
+1.6
+log
+@Commit changes
+@
+text
+@d38 1
+a38 1
+\author ""
+d53 7
+a59 1
+26-July-2010
+d1333 10
+d1361 3
+a1363 1
+ There are three details which become important:
+d1367 2
+d1373 2
+d1379 2
+d1385 2
+d1397 2
+d1407 2
+d1411 45
+d1582 2
+d1598 14
+d1733 62
+d1996 13
+d2086 10
+d2110 15
+a2124 1
+\begin_layout LyX-Code
+@
+
+
+1.5
+log
+@Soft transaction commit
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,,"
+a52 4
+
+\change_deleted 0 1280141199
+10-May-2010
+\change_inserted 0 1280141202
+a53 2
+\change_unchanged
+
+a2028 2
+
+\change_inserted 0 1280140902
+a2034 2
+
+\change_unchanged
+a2212 2
+\change_inserted 0 1280140661
+
+a2215 2
+
+\change_inserted 0 1280140703
+a2219 2
+
+\change_inserted 0 1280708312
+a2226 2
+
+\change_inserted 0 1280708400
+a2239 2
+
+\change_inserted 0 1280140836
+a2243 2
+
+\change_inserted 0 1280708255
+a2247 2
+
+\change_inserted 0 1280708374
+a2252 2
+
+\change_inserted 0 1280141181
+a2274 2
+
+\change_inserted 0 1280141345
+@
+
+
+1.4
+log
+@Merge changes
+@
+text
+@d38 1
+a38 1
+\author ""
+d53 2
+d56 4
+d2035 10
+d2223 84
+@
+
+
+1.3
+log
+@Transaction and freelist rethink.
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,,"
+d53 1
+a53 1
+27-April-2010
+d662 1
+a662 5
+ behavior of disallowing
+\change_inserted 0 1272940179
+nested
+\change_unchanged
+transactions should become the default.
+a1210 2
+\change_inserted 0 1272944650
+
+a1214 2
+
+\change_inserted 0 1272944763
+a1218 2
+\change_unchanged
+
+a1223 2
+\change_unchanged
+
+a1301 2
+
+\change_inserted 0 1273478114
+a1310 2
+\change_unchanged
+
+d1515 1
+a1515 11
+The free list
+\change_deleted 0 1273469807
+should
+\change_inserted 0 1273469810
+must
+\change_unchanged
+ be split
+\change_deleted 0 1273469815
+into multiple lists
+\change_unchanged
+to reduce contention.
+a1520 2
+\change_inserted 0 1273470006
+
+a1523 2
+
+\change_inserted 0 1273492055
+a1539 2
+
+\change_inserted 0 1273483888
+a1551 2
+\change_unchanged
+
+a1554 8
+
+\change_deleted 0 1272942055
+There are various ways to organize these lisys, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each free list covers some fixed fraction of the file).
+
+\change_inserted 0 1273484187
+d1556 1
+a1556 7
+
+\change_deleted 0 1273484194
+The algorithm for f
+\change_inserted 0 1273484194
+F
+\change_unchanged
+reeing is simple:
+d1560 1
+a1560 7
+Identify the correct
+\change_deleted 0 1273482856
+free list
+\change_inserted 0 1273482857
+zone
+\change_unchanged
+.
+d1564 1
+a1564 7
+Lock the
+\change_inserted 0 1273482895
+corresponding
+\change_unchanged
+list
+\change_inserted 0 1273482863
+.
+a1567 2
+
+\change_inserted 0 1273482909
+d1573 1
+a1573 13
+
+\change_deleted 0 1273482885
+, and p
+\change_inserted 0 1273482888
+P
+\change_unchanged
+lace the freed entry
+\change_deleted 0 1273492415
+at the head
+\change_inserted 0 1273492415
+in the list for that zone
+\change_unchanged
+.
+d1577 2
+a1578 7
+Allocation is a little more complicated, as we
+\change_deleted 0 1273483240
+merge entries as we walk the list:
+\change_inserted 0 1273484250
+perform delayed coalescing at this point:
+\change_unchanged
+
+d1582 1
+a1582 19
+Pick a
+\change_deleted 0 1273482955
+free list;
+\change_inserted 0 1273482957
+zone
+\change_unchanged
+ either the
+\change_deleted 0 1273482962
+list
+\change_inserted 0 1273482962
+zone
+\change_unchanged
+ we last freed
+\change_deleted 0 1273482966
+o
+\change_inserted 0 1273482966
+i
+\change_unchanged
+nto, or based on a
+d1594 1
+a1594 9
+Lock th
+\change_inserted 0 1273482980
+e corresponding
+\change_deleted 0 1273482973
+at
+\change_unchanged
+ list.
+\change_inserted 0 1273482982
+
+a1597 2
+
+\change_inserted 0 1273483084
+a1598 53
+\change_unchanged
+
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is
+\change_deleted 0 1273492155
+well-sized,
+\change_inserted 0 1273492159
+-large enough,
+\change_unchanged
+remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise,
+\change_inserted 0 1273492206
+coalesce entries in the list.
+\change_deleted 0 1273492200
+examine the entry to the right of it in the file.
+ If it is free:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If that entry is in a different list, lock that list too.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If we had to place a new lock, re-check that the entry is free.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+Remove that entry from its free list and expand this entry to cover it.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273485554
+Goto step 3.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+
+\change_inserted 0 1273485311
+If there was no entry large enough, unlock the list and try the next zone.
+d1602 1
+a1602 5
+
+\change_deleted 0 1273483646
+Repeat step 3 with each entry in the list.
+\change_unchanged
+
+d1606 2
+a1607 5
+
+\change_deleted 0 1273483668
+Unlock the list and repeat step 2 with the next list.
+\change_unchanged
+
+d1611 1
+a1611 7
+If no
+\change_deleted 0 1273483671
+list
+\change_inserted 0 1273483671
+zone
+\change_unchanged
+ satisfies, expand the file.
+d1615 2
+a1616 9
+This optimizes rapid insert/delete of free list entries
+\change_inserted 0 1273485794
+ by not coalescing them all the time.
+\change_deleted 0 1273483685
+, and allows us to get rid of the tailer altogether
+\change_unchanged
+.
+
+\change_inserted 0 1273492299
+a1638 39
+
+\change_deleted 0 1273476840
+The question of
+\begin_inset Quotes eld
+\end_inset
+
+well-sized
+\begin_inset Quotes erd
+\end_inset
+
+ free entries is more difficult: the 25% overhead works in practice for
+ ldb because indexes tend to expand by one record at a time.
+ This can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+ Whether the
+\begin_inset Quotes eld
+\end_inset
+
+increasing slack
+\begin_inset Quotes erd
+\end_inset
+
+ algorithm should be implemented or first-fit used is still unknown: we
+ will determine this once these other ideas are implemented.
+\change_inserted 0 1273483750
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273492450
+a1644 2
+
+\change_inserted 0 1273470441
+a1654 2
+
+\change_inserted 0 1273476556
+a1659 2
+
+\change_inserted 0 1273470423
+a1661 2
+\change_unchanged
+
+a1672 2
+
+\change_inserted 0 1273476847
+a1676 2
+
+\change_inserted 0 1273476886
+a1691 2
+
+\change_inserted 0 1273477233
+a1699 2
+
+\change_inserted 0 1273477534
+a1706 2
+
+\change_inserted 0 1273482700
+a1712 2
+
+\change_inserted 0 1273478079
+a1722 2
+
+\change_inserted 0 1273477839
+a1726 2
+
+\change_inserted 0 1273477925
+a1730 2
+
+\change_inserted 0 1273477925
+a1734 2
+
+\change_inserted 0 1273477925
+a1738 2
+
+\change_inserted 0 1273477925
+a1742 2
+
+\change_inserted 0 1273477925
+a1746 2
+
+\change_inserted 0 1273477925
+a1750 2
+
+\change_inserted 0 1273477925
+a1754 2
+
+\change_inserted 0 1273477925
+a1758 2
+
+\change_inserted 0 1273477925
+a1762 2
+
+\change_inserted 0 1273477925
+a1766 2
+
+\change_inserted 0 1273477925
+a1770 2
+
+\change_inserted 0 1273477925
+a1774 2
+
+\change_inserted 0 1273477925
+a1778 2
+
+\change_inserted 0 1273477925
+a1782 2
+
+\change_inserted 0 1273477925
+a1786 2
+
+\change_inserted 0 1273477925
+a1790 2
+
+\change_inserted 0 1273477925
+a1794 2
+
+\change_inserted 0 1273477925
+a1798 2
+
+\change_inserted 0 1273492522
+a1802 2
+
+\change_inserted 0 1273492530
+a1806 2
+
+\change_inserted 0 1273492546
+a1810 2
+
+\change_inserted 0 1273478239
+a1814 2
+
+\change_inserted 0 1273479960
+a1821 2
+
+\change_inserted 0 1273480265
+a1830 2
+
+\change_inserted 0 1273480354
+a1845 2
+
+\change_inserted 0 1273478968
+a1851 2
+
+\change_inserted 0 1273492604
+a1859 2
+
+\change_inserted 0 1273479572
+a1862 2
+\change_unchanged
+
+a1870 2
+
+\change_inserted 0 1273480282
+a1874 2
+
+\change_inserted 0 1273478931
+a1878 2
+
+\change_inserted 0 1273481549
+a1882 2
+
+\change_inserted 0 1273481557
+a1886 2
+
+\change_inserted 0 1273480307
+a1890 2
+
+\change_inserted 0 1273480335
+a1894 2
+
+\change_inserted 0 1273479897
+a1898 2
+
+\change_inserted 0 1273479653
+a1902 2
+
+\change_inserted 0 1273480371
+a1906 2
+
+\change_inserted 0 1273480464
+a1910 2
+
+\change_inserted 0 1273480399
+a1914 2
+
+\change_inserted 0 1273480425
+a1918 2
+
+\change_inserted 0 1273480453
+a1922 2
+
+\change_inserted 0 1273480455
+a1926 2
+
+\change_inserted 0 1273480450
+a1930 2
+
+\change_inserted 0 1273480452
+a1935 2
+\change_inserted 0 1273478830
+
+a1942 5
+
+\change_deleted 0 1273481604
+In theory, we could get away with 2: one after we write the new data, and
+ one to somehow atomically change over to it.
+\change_inserted 0 1273481632
+a1946 2
+
+\change_inserted 0 1273481724
+a1950 2
+
+\change_inserted 0 1273481713
+a1954 2
+
+\change_inserted 0 1273481717
+a1958 2
+
+\change_inserted 0 1273481730
+a1962 2
+
+\change_inserted 0 1273481736
+a1966 2
+
+\change_inserted 0 1273481744
+a1970 2
+
+\change_inserted 0 1273481748
+a1974 2
+
+\change_inserted 0 1273482185
+a1978 2
+
+\change_inserted 0 1273482259
+a1989 50
+
+\change_deleted 0 1273481848
+None.
+ Trying to rewrite the transaction code is a separate experiment, which
+ I encourage someone else to do.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481848
+But as a thought experiment:
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+Say there was a pointer in the header which said where the hash table and
+ free list tables were, and that no blocks were labeled with whether they
+ were free or not (it had to be derived from what list they were in).
+ We could create new hash table and free list in some free space, and populate
+ it as we want the post-committed state to look.
+ Then we sync, then we switch the offset in the header, then we sync again.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\change_inserted 0 1273481854
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273482102
+a1993 2
+
+\change_inserted 0 1273482061
+a1998 2
+
+\change_inserted 0 1273482063
+a2002 2
+
+\change_inserted 0 1273482072
+a2006 2
+
+\change_inserted 0 1273482139
+a2011 2
+
+\change_inserted 0 1273482364
+a2015 2
+
+\change_inserted 0 1273482163
+a2019 2
+
+\change_inserted 0 1273482493
+a2037 2
+
+\change_inserted 0 1273482536
+a2046 2
+\change_unchanged
+
+a2049 2
+
+\change_inserted 0 1273482641
+a2058 2
+
+\change_inserted 0 1273481827
+d2067 2
+a2068 11
+We could
+\change_inserted 0 1273481829
+then
+\change_unchanged
+implement snapshots using a similar method
+\change_deleted 0 1273481838
+ to the above, only
+\change_inserted 0 1273481840
+,
+\change_unchanged
+ using multiple different hash tables/free tables.
+@
+
+
+1.2
+log
+@After first feedback (Ronnie & Volker)
+@
+text
+@d1314 13
+d1531 11
+a1541 1
+The free list should be split into multiple lists to reduce contention.
+d1547 39
+d1596 7
+d1604 1
+a1604 1
+The algorithm for freeing is simple:
+d1608 7
+a1614 1
+Identify the correct free list.
+d1618 30
+a1647 1
+Lock the list, and place the freed entry at the head.
+d1651 7
+a1657 2
+Allocation is a little more complicated, as we merge entries as we walk
+ the list:
+d1661 19
+a1679 1
+Pick a free list; either the list we last freed onto, or based on a
+d1691 17
+a1707 1
+Lock that list.
+d1711 7
+a1717 1
+If the top entry is well-sized, remove it from the list and return it.
+d1721 5
+a1725 1
+Otherwise, examine the entry to the right of it in the file.
+d1731 2
+d1737 2
+d1743 2
+d1749 2
+d1756 8
+d1765 2
+d1770 2
+d1773 2
+d1778 7
+a1784 1
+If no list satisfies, expand the file.
+d1788 28
+a1815 2
+This optimizes rapid insert/delete of free list entries, and allows us to
+ get rid of the tailer altogether.
+d1819 2
+d1851 1
+a1851 1
+\change_inserted 0 1272941474
+d1857 303
+a2159 18
+\change_inserted 0 1272942759
+There are various ways to organize these lists, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each of the N free lists in a tdb file of size M covers a fixed fraction
+ M/N).
+ Note that this means we need to reshuffle the free lists when we expand
+ the file; this is probably acceptable when we double the hash table size,
+ since that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary,
+\emph on
+and
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+d2164 107
+d2276 2
+d2280 59
+d2346 2
+d2363 2
+d2366 2
+d2371 2
+d2382 2
+d2389 57
+d2458 13
+d2474 32
+a2505 2
+We could implement snapshots using a similar method to the above, only using
+ multiple different hash tables/free tables.
+@
+
+
+1.1
+log
+@Initial revision
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.4 created this file. For more info see http://www.lyx.org/
+d36 3
+a38 3
+\tracking_changes false
+\output_changes false
+\author ""
+d662 5
+a666 1
+ behavior of disallowing transactions should become the default.
+d1215 21
+d1527 2
+d1533 3
+a1535 1
+ The algorithm for freeing is simple:
+d1642 26
+@
diff --git a/lib/tdb2/doc/design.pdf b/lib/tdb2/doc/design.pdf
new file mode 100644
index 0000000000..558dc1f8c2
--- /dev/null
+++ b/lib/tdb2/doc/design.pdf
diff --git a/lib/tdb2/doc/design.txt b/lib/tdb2/doc/design.txt
new file mode 100644
index 0000000000..bd2ffde4db
--- /dev/null
+++ b/lib/tdb2/doc/design.txt
@@ -0,0 +1,1258 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+1-December-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases
+heading towards the 4G limit, that must change. This required
+breakage provides an opportunity to revisit TDB's other design
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code have led
+to some organic growth of the API, as well as some compromises on
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+|   1999    |      13        |              1195              |
++-----------+----------------+--------------------------------+
+|   2000    |      24        |              1725              |
++-----------+----------------+--------------------------------+
+|   2001    |      32        |              2228              |
++-----------+----------------+--------------------------------+
+|   2002    |      35        |              2481              |
++-----------+----------------+--------------------------------+
+|   2003    |      35        |              2552              |
++-----------+----------------+--------------------------------+
+|   2004    |      40        |              2584              |
++-----------+----------------+--------------------------------+
+|   2005    |      38        |              2647              |
++-----------+----------------+--------------------------------+
+|   2006    |      52        |              3754              |
++-----------+----------------+--------------------------------+
+|   2007    |      66        |              4398              |
++-----------+----------------+--------------------------------+
+|   2008    |      71        |              4768              |
++-----------+----------------+--------------------------------+
+|   2009    |      73        |              5715              |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution<attributes>
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+    TDB_ATTRIBUTE_LOG = 0,
+
+    TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+    enum tdb_attribute attr;
+
+    union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
+*/
+
+    tdb_log_func log_fn;
+
+    void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
+*/
+
+    tdb_hash_func hash_fn;
+
+    void *hash_private;
+
+};
+
+union tdb_attribute {
+
+    struct tdb_attribute_base base;
+
+    struct tdb_attribute_log log;
+
+    struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands
+the size of the union.
+
+2.1.2 Status
+
+Complete.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
+and it was thought that it was important to guarantee that all
+records which exist at the start and end of the traversal would
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not
+work anyway for records which are altered (in particular, those
+which are expanded may be effectively deleted and re-added behind
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes
+occur during your traversal, otherwise you will see some subset.
+You can prevent changes by using a transaction or the locking
+API.
+
+2.2.2 Status
+
+Complete. Delete-during-traverse will still delete every record,
+too (assuming no other changes).
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not
+allowing them. Various paths in the Samba codebase assume that
+transactions will nest, and in a sense they can: the operation is
+only committed to disk when the outer transaction is committed.
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer
+  transaction commit to fail, and will not undo any operations
+  since the inner transaction began. This problem is soluble with
+  some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer
+  transaction. This is desirable in the way which Samba's
+  database initialization code uses transactions, but could be a
+  surprise to any users expecting a successful transaction commit
+  to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(),
+with the default currently that nested transactions are allowed.
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise”
+behavior of disallowing nested transactions should become the
+default. Additionally, it seems the outer transaction is the only
+code which knows whether inner transactions should be allowed, so
+a flag to indicate this could be added to tdb_transaction_start.
+However, this behavior can be simulated with a wrapper which uses
+tdb_add_flags() and tdb_remove_flags(), so the API should not be
+expanded for this relatively-obscure case.
+
+2.3.2 Status
+
+Incomplete; nesting flag is still defined as per tdb1.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash
+function to use, but does not check that all other processes
+accessing this tdb are using the same hash function. The result
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.4.2 Status
+
+Complete.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+) two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.5.2 Status
+
+Incomplete. TDB_VOLATILE still defined, but implementation should
+fail on unknown flags to be future-proof.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
+  In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, although there may be some
+benefit to adding a call to detect when a tdb_context is shared,
+to allow others to create such an API.
+
+2.6.2 Status
+
+Incomplete.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a
+great deal of churn; we would do better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist. Alternatively, a hooking mechanism similar to that
+proposed for [Proposed-Solution-locking-hook] could be used to
+enable pthread locking at runtime.
+
+2.7.2 Status
+
+Incomplete.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+  Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+  lock. Initially, this is always false. If so, return 0.
+  Otherwise, try to obtain it in non-blocking mode. If that
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.8.2 Status
+
+Incomplete.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gave atomicity guarantees;
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.10.2 Status
+
+Incomplete.
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct
+  tdb_context'; the definition isn't visible to the API user
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA;
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct
+  tdb_data'.
+
+  enum TDB_ERROR Similarly, this would normally be enum
+  tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like
+myself, but if it were done the existing ones should be kept.
+There is little point forcing a purely cosmetic change upon tdb
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
+  Private Pointer
+
+For API compatibility reasons, the logging function needs to call
+tdb_get_logging_private() to retrieve the pointer registered by
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to
+break the API/ABI.
+
+2.12.2 Status
+
+Complete.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
+and tdb_check all take void * and must internally convert it to
+the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions
+which give a warning when used on gcc and the types of the
+callback and its private argument differ. Unsupported compilers
+will not give a warning, which is no worse than now. In addition,
+the callbacks become clearer, as they need not use void * for
+their parameter.
+
+See CCAN's typesafe_cb module at
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.13.2 Status
+
+Incomplete.
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
+  tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
+file should be cleared if the caller discovers it is the only
+process with the TDB open. However, if any caller does not
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or
+otherwise closes the tdb) before the child calls tdb_reopen_all()
+to establish the lock used to indicate the TDB is opened by
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+2.14.2 Status
+
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
+implemented.
+
+2.15 Extending The Header Is Difficult
+
+We have reserved (zeroed) words in the TDB header, which can be
+used for future features. If the future features are compulsory,
+the version number must be updated to prevent old code from
+accessing the database. But if the future feature is optional, we
+have no way of telling if older code is accessing the database or
+not.
+
+2.15.1 Proposed Solution
+
+The header should contain a “format variant” value (64-bit). This
+is divided into two 32-bit parts:
+
+1. The lower part reflects the format variant understood by code
+  accessing the database.
+
+2. The upper part reflects the format variant you must understand
+  to write to the database (otherwise you can only open for
+  reading).
+
+The latter field can only be written at creation time, the former
+should be written under the OPEN_LOCK when opening the database
+for writing, if the variant of the code is lower than the current
+lowest variant.
+
+This should allow backwards-compatible features to be added, and
+detection if older code (which doesn't understand the feature)
+writes to the database.
+
+2.15.2 Status
+
+Incomplete.
+
+2.16 Record Headers Are Not Expandable
+
+If we later want to add (say) checksums on keys and data, it
+would require another format change, which we'd like to avoid.
+
+2.16.1 Proposed Solution
+
+We often have extra padding at the tail of a record. If we ensure
+that the first byte (if any) of this padding is zero, we will
+have a way for future changes to detect code which doesn't
+understand a new format: the new code would write (say) a 1 at
+the tail, and thus if there is no tail or the first byte is 0, we
+would know the extension is not present on that record.
+
+2.16.2 Status
+
+Incomplete.
+
+2.17 TDB Does Not Use Talloc
+
+Many users of TDB (particularly Samba) use the talloc allocator,
+and thus have to wrap TDB in a talloc context to use it
+conveniently.
+
+2.17.1 Proposed Solution
+
+The allocation within TDB is not complicated enough to justify
+the use of talloc, and I am reluctant to force another
+(excellent) library on TDB users. Nonetheless a compromise is
+possible. An attribute (see [attributes]) can be added later to
+tdb_open() to provide an alternate allocation mechanism,
+specifically for talloc but usable by any other allocator (which
+would ignore the “context” argument).
+
+This would form a talloc hierarchy as expected, but the caller
+would still have to attach a destructor to the tdb context
+returned from tdb_open to close it. All TDB_DATA fields would be
+children of the tdb_context, and the caller would still have to
+manage them (using talloc_free() or talloc_steal()).
+
+2.17.2 Status
+
+Deferred.
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
+  Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
+never conflict in normal tdb usage, they do add substantial
+overhead for most fcntl lock implementations when the kernel
+scans to detect if a lock conflict exists. This is often a single
+linked list, making the time to acquire and release a fcntl lock
+O(N) where N is the number of processes with the TDB open, not
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients
+sitting idle, and thus they have weaned themselves off the
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this
+optimization: if the parent process will outlive the child, the
+child does not need the ACTIVE_LOCK. This is a workaround for
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers
+tend to know when they are initializing for the first time and
+can simply unlink the old tdb at that point.
+
+3.1.2 Status
+
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits), they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!)
+
+3.2.2 Status
+
+Complete.
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.3.2 Status
+
+Complete.
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long), that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 <sub:Hash-Size-Solution>Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation.
+Unfortunately, altering the hash table introduces serious locking
+complications: the entire hash table needs to be locked to
+enlarge the hash table, and others might be holding locks.
+Particularly insidious are insertions done under tdb_chainlock.
+
+Thus an expanding layered hash will be used: an array of hash
+groups, with each hash group exploding into pointers to lower
+hash groups once it fills, turning into a hash tree. This has
+implications for locking: we must lock the entire group in case
+we need to expand it, yet we don't know how deep the tree is at
+that point.
+
+Note that bits from the hash table entries should be stolen to
+hold more hash bits to reduce the penalty of collisions. We can
+use the otherwise-unused lower 3 bits. If we limit the size of
+the database to 64 exabytes, we can use the top 8 bits of the
+hash entry as well. These 11 bits would reduce false positives
+down to 1 in 2000 which is more than we need: we can use one of
+the bits to indicate that the extra hash bits are valid. This
+means we can choose not to re-hash all entries when we expand a
+hash group; simply use the next bits we need and mark them
+invalid.
+
+3.4.2 Status
+
+Complete.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+  return it. The slack will be turned into a new free record if
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones
+    (similar to steps 4 and below, but the lock is only obtained
+    once).
+
+  (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is
+  free; if so, enlarge the current block and remove that block
+  from the free list. This was disabled, as removal from the free
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+  reason, each block has a 32-bit tailer which indicates its
+  length. If it is free, expand it to cover our new block and
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+It seems tempting to try to reuse the hash implementation which
+we use for records here, but we have two ways of searching for
+free entries: for allocation we search by size (and possibly
+zone) which produces too many clashes for our hash table to
+handle well, and for coalescing we search by address. Thus an
+array of doubly-linked free lists seems preferable.
+
+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
+) but it's not clear this would reduce contention in the common
+case where all processes are allocating/freeing the same size.
+Thus we almost certainly need to divide in other ways: the most
+obvious is to divide the file into zones, and using a free list
+(or table of free lists) for each. This approximates address
+ordering.
+
+Unfortunately it is difficult to know what heuristics should be
+used to determine zone sizes, and our transaction code relies on
+being able to create a “recovery area” by simply appending to the
+file (difficult if it would need to create a new zone header).
+Thus we use a linked-list of free tables; currently we only ever
+create one, but if there is more than one we choose one at random
+to use. In future we may use heuristics to add new free tables on
+contention. We only expand the file when all free tables are
+exhausted.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct free list.
+
+2. Lock the corresponding list.
+
+3. Re-check the list (we didn't have a lock, sizes could have
+  changed): relock if necessary.
+
+4. Place the freed entry in the list.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a free table; usually the previous one.
+
+2. Lock the corresponding list.
+
+3. If the top entry is large enough, remove it from the list and
+  return it.
+
+4. Otherwise, coalesce entries in the list. If there was no entry
+  large enough, unlock the list and try the next largest list.
+
+5. If no list has an entry which meets our needs, try the next
+  free table.
+
+6. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see [sub:Records-Incur-A].
+
+Each free entry has the free table number in the header: less
+than 255. It also contains a doubly-linked list for easy
+deletion.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an “expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as best-fit address-order allocator. The other end
+of the spectrum would be to use a bump allocator (very fast and
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic;   /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+  and data_len (it accounts for wasted or overallocated length in
+  the record). Since the record length is always a multiple of 8,
+  we can conveniently fit it in 32 bits (representing up to 35
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+  restrict 'data_len' to 32 bits, but instead we can combine the
+  two into one 64-bit field and use a 5 bit value which
+  indicates at what bit to divide the two. Keys are unlikely to
+  scale as fast as data, so I'm assuming a maximum key size of 32
+  bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+  this is diminishing returns after a handful of bits (at 10
+  bits, it reduces 99.9% of false memcmp). As an aside, as the
+  lower bits are already incorporated in the hash table
+  resolution, the upper bits should be used here. Note that it's
+  not clear that these bits will be a win, given the extra bits
+  in the hash table itself (see [sub:Hash-Size-Solution]).
+
+5. 'magic' does not need to be enlarged: it currently reflects
+  one of 5 values (used, free, dead, recovery, and
+  unused_recovery). It is useful for quick sanity checking
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+  the right can find the header to check if this block is free).
+  This can be replaced by a single 'free' bit in the header of
+  the following block (and the tailer only exists in free
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t used_magic : 16,
+
+
+
+                 key_data_divide: 5,
+
+                 top_hash: 11;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint64_t free_magic: 8,
+
+                   prev : 56;
+
+
+
+        uint64_t free_table: 8,
+
+                 total_length : 56;
+
+        uint64_t next;
+
+};
+
+Note that by limiting valid offsets to 56 bits, we can pack
+everything we need into 3 64-bit words, meaning our minimum
+record size is 8 bytes.
+
+3.7.2 Status
+
+Complete.
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively inexpensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+  checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+  header (identified using the transaction counter) with the
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.8.2 Status
+
+Deferred.
+
+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution: None. At some point you say “use a real
+  database” (but see [replay-attribute]).
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free, but use
+of presence in metadata elsewhere), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.9.2 Status
+
+Deferred.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause all the transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+None (but see [replay-attribute]). We could solve a small part of
+the problem by providing read-only transactions. These would
+allow one write transaction to begin, but it could not commit
+until all r/o transactions are done. This would require a new
+RO_TRANSACTION_LOCK, which would be upgraded on commit.
+
+3.10.2 Status
+
+Deferred.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.11.2 Status
+
+Complete.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.12.2 Status
+
+Complete.
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock. There is a method of auto-cleanup under Linux, but
+it's not supported by other operating systems. We could
+reintroduce a clear-if-first-style lock and sweep for dead
+futexes on open, but that wouldn't help the normal case of one
+concurrent opener dying. Increasingly elaborate repair schemes
+could be considered, but they require an ABI change (everyone
+must use them) anyway, so there's no need to do this at the same
+time as everything else.
+
+3.14 Some Transactions Don't Require Durability
+
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
+normal (fast) usage, and occasionally empties the results into a
+transactional TDB. This kind of usage prioritizes performance
+over durability: as long as we are consistent, data can be lost.
+
+This would be more neatly implemented inside tdb: a “soft”
+transaction commit (ie. syncless) which meant that data may be
+reverted on a crash.
+
+3.14.1 Proposed Solution
+
+None.
+
+Unfortunately any transaction scheme which overwrites old data
+requires a sync before that overwrite to avoid the possibility of
+corruption.
+
+It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
+, where transactions are committed without overwriting existing
+data, and an array of top-level pointers were available in the
+header. If the transaction is “soft” then we would not need a
+sync at all: existing processes would pick up the new hash table
+and free list and work with that.
+
+At some later point, a sync would allow recovery of the old data
+into the free lists (perhaps when the array of top-level pointers
+filled). On crash, tdb_open() would examine the array of top
+levels, and apply the transactions until it encountered an
+invalid checksum.
+
+3.15 Tracing Is Fragile, Replay Is External
+
+The current TDB has compile-time-enabled tracing code, but it
+often breaks as it is not enabled by default. In a similar way,
+the ctdb code has an external wrapper which does replay tracing
+so it can coordinate cluster-wide transactions.
+
+3.15.1 <replay-attribute>Proposed Solution
+
+Tridge points out that an attribute can be later added to
+tdb_open (see [attributes]) to provide replay/trace hooks, which
+could become the basis for this and future parallel transactions
+and snapshot support.
+
+3.15.2 Status
+
+Deferred.
diff --git a/lib/tdb2/free.c b/lib/tdb2/free.c
new file mode 100644
index 0000000000..a770751dc0
--- /dev/null
+++ b/lib/tdb2/free.c
@@ -0,0 +1,968 @@
+ /*
+   Trivial Database 2: free list/block handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/ilog/ilog.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+static unsigned fls64(uint64_t val)
+{	/* Thin wrapper: the result is whatever ccan/ilog's ilog64() returns. */
+	return ilog64(val);
+}
+
+/* Map a record's data length (header excluded) to its free-list bucket. */
+unsigned int size_to_bucket(tdb_len_t data_len)
+{
+	tdb_len_t excess;
+	unsigned int idx;
+
+	/* Nothing smaller than the minimum record may be passed in. */
+	assert(data_len >= TDB_MIN_DATA_LEN);
+	excess = data_len - TDB_MIN_DATA_LEN;
+
+	/* Up to 64 bytes over the minimum: one bucket per 8 bytes
+	 * (0 -> bucket 0, 8 -> bucket 1, ... 64 -> bucket 8).
+	 * Beyond that the buckets grow by powers of two. */
+	if (excess > 64)
+		idx = fls64(excess) + 2;
+	else
+		idx = excess / 8;
+
+	/* Anything too large lands in the final bucket. */
+	return unlikely(idx >= TDB_FREE_BUCKETS) ? TDB_FREE_BUCKETS - 1 : idx;
+}
+
+tdb_off_t first_ftable(struct tdb_context *tdb)
+{	/* Read the offset stored in the file header's free_table field. */
+	return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
+}
+
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
+{	/* Read the `next` field of the free table stored at offset ftable. */
+	return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next));
+}
+
+enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb)
+{
+	/* Pick one free table at random: every table in the chain gets a
+	 * random draw and the table with the highest draw so far wins. */
+	unsigned int draw, best = 0, idx;
+	tdb_off_t cur = first_ftable(tdb);
+
+	/* Default to the first table (index 0). */
+	tdb->ftable_off = cur;
+	tdb->ftable = 0;
+
+	for (idx = 0; cur != 0; idx++) {
+		/* A failed read of the chain aborts the whole walk. */
+		if (TDB_OFF_IS_ERR(cur))
+			return cur;
+
+		draw = random();
+		if (draw >= best) {
+			best = draw;
+			tdb->ftable = idx;
+			tdb->ftable_off = cur;
+		}
+		cur = next_ftable(tdb, cur);
+	}
+	return TDB_SUCCESS;
+}
+
+/* File offset of slot `bucket` within the buckets area of a free table. */
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
+{
+	tdb_off_t first = ftable_off + offsetof(struct tdb_freetable, buckets);
+	return first + bucket * sizeof(tdb_off_t);
+}
+
+/* Bucket (>= bucket) to try next; >= TDB_FREE_BUCKETS if none; or -ve error. */
+static tdb_off_t find_free_head(struct tdb_context *tdb,
+				tdb_off_t ftable_off,
+				tdb_off_t bucket)
+{
+	/* Speculatively search for a non-zero bucket, from bucket 0's offset. */
+	return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
+				    bucket, TDB_FREE_BUCKETS);
+}
+
+static void check_list(struct tdb_context *tdb, tdb_off_t b_off)
+{
+#ifdef CCAN_TDB2_DEBUG
+	/* Debug builds only: walk the bucket's list, abort() on a bad link. */
+	struct tdb_free_record rec;
+	tdb_off_t cur, last = 0, head;
+
+	head = tdb_read_off(tdb, b_off) & TDB_OFF_MASK;
+	for (cur = head; cur != 0; last = cur, cur = rec.next) {
+		tdb_read_convert(tdb, cur, &rec, sizeof(rec));
+		if (frec_magic(&rec) != TDB_FREE_MAGIC)
+			abort();
+		if (last && frec_prev(&rec) != last)
+			abort();
+	}
+
+	/* The head's prev must name the last entry visited (the tail). */
+	if (head) {
+		tdb_read_convert(tdb, head, &rec, sizeof(rec));
+		if (frec_prev(&rec) != last)
+			abort();
+	}
+#endif
+}
+
+/* Unlink free record r (stored at r_off) from the list headed in bucket b_off. */
+static enum TDB_ERROR remove_from_list(struct tdb_context *tdb,
+				       tdb_off_t b_off, tdb_off_t r_off,
+				       const struct tdb_free_record *r)
+{
+	tdb_off_t off, prev_next, head;
+	enum TDB_ERROR ecode;
+
+	/* Only element (prev is itself)?  Zero out bucket, and we're done. */
+	if (frec_prev(r) == r_off)
+		return tdb_write_off(tdb, b_off, 0);
+
+	/* off = &r->prev->next */
+	off = frec_prev(r) + offsetof(struct tdb_free_record, next);
+
+	/* Get prev->next */
+	prev_next = tdb_read_off(tdb, off);
+	if (TDB_OFF_IS_ERR(prev_next))
+		return prev_next;
+
+	/* prev->next == 0: prev is the tail, so we were head; bucket -> next. */
+	if (prev_next == 0) {
+		/* We must preserve the bucket's upper (non-offset) bits. */
+		head = tdb_read_off(tdb, b_off);
+		if (TDB_OFF_IS_ERR(head))
+			return head;
+
+		if ((head & TDB_OFF_MASK) != r_off) {
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "remove_from_list:"
+					  " %llu head %llu on list %llu",
+					  (long long)r_off,
+					  (long long)head,
+					  (long long)b_off);
+		}
+		head = ((head & ~TDB_OFF_MASK) | r->next);
+		ecode = tdb_write_off(tdb, b_off, head);
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+	} else {
+		/* r->prev->next = r->next */
+		ecode = tdb_write_off(tdb, off, r->next);
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+	}
+
+	/* If we were the tail, the head's prev names us: off = &head->prev. */
+	if (r->next == 0) {
+		head = tdb_read_off(tdb, b_off);
+		if (TDB_OFF_IS_ERR(head))
+			return head;
+		head &= TDB_OFF_MASK;
+		off = head + offsetof(struct tdb_free_record, magic_and_prev);
+	} else {
+		/* off = &r->next->prev */
+		off = r->next + offsetof(struct tdb_free_record,
+					 magic_and_prev);
+	}
+
+#ifdef CCAN_TDB2_DEBUG
+	/* Sanity check: *off must currently point at r. */
+	if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "remove_from_list:"
+				  " %llu bad prev in list %llu",
+				  (long long)r_off, (long long)b_off);
+	}
+#endif
+	/* r->next->prev = r->prev (the whole magic_and_prev word is copied) */
+	return tdb_write_off(tdb, off, r->magic_and_prev);
+}
+
+/* Push a record onto the head of this free bucket.  If *coalesce is set on
+ * entry, it stays set only once the enqueue counter wraps (128 entries). */
+static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb,
+				      tdb_off_t b_off,
+				      tdb_off_t off,
+				      tdb_len_t len,
+				      bool *coalesce)
+{
+	struct tdb_free_record new;
+	enum TDB_ERROR ecode;
+	tdb_off_t prev, head;
+	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
+
+	head = tdb_read_off(tdb, b_off);
+	if (TDB_OFF_IS_ERR(head))
+		return head;
+
+	/* Fill in ftable_and_len here; next and magic_and_prev are set below. */
+	new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL))
+		| len;
+
+	/* new->next = head. */
+	new.next = (head & TDB_OFF_MASK);
+
+	/* First element?  Prev points to ourselves. */
+	if (!new.next) {
+		new.magic_and_prev = (magic | off);
+	} else {
+		/* new->prev = next->prev (the old head's prev, i.e. the tail) */
+		prev = tdb_read_off(tdb,
+				    new.next + offsetof(struct tdb_free_record,
+							magic_and_prev));
+		new.magic_and_prev = prev;
+		if (frec_magic(&new) != TDB_FREE_MAGIC) {
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "enqueue_in_free: %llu bad head"
+					  " prev %llu",
+					  (long long)new.next,
+					  (long long)prev);
+		}
+		/* next->prev = new. */
+		ecode = tdb_write_off(tdb, new.next
+				      + offsetof(struct tdb_free_record,
+						 magic_and_prev),
+				      off | magic);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+
+#ifdef CCAN_TDB2_DEBUG
+		prev = tdb_read_off(tdb, frec_prev(&new)
+				    + offsetof(struct tdb_free_record, next));
+		if (prev != 0) {
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "enqueue_in_free:"
+					  " %llu bad tail next ptr %llu",
+					  (long long)frec_prev(&new)
+					  + offsetof(struct tdb_free_record,
+						     next),
+					  (long long)prev);
+		}
+#endif
+	}
+
+	/* Update enqueue count, but don't set high bit: see TDB_OFF_IS_ERR */
+	if (*coalesce)
+		head += (1ULL << (64 - TDB_OFF_UPPER_STEAL));
+	head &= ~(TDB_OFF_MASK | (1ULL << 63));
+	head |= off;
+
+	ecode = tdb_write_off(tdb, b_off, head);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* It's time to coalesce if counter wrapped (upper bits back to 0). */
+	if (*coalesce)
+		*coalesce = ((head & ~TDB_OFF_MASK) == 0);
+
+	return tdb_write_convert(tdb, off, &new, sizeof(new));
+}
+
+static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
+{
+	tdb_off_t cur;
+	unsigned int remaining = ftable;
+
+	/* Fast path: the table this context is already using is cached. */
+	if (likely(tdb->ftable == ftable))
+		return tdb->ftable_off;
+
+	/* Otherwise follow the chain `ftable` hops from the first table,
+	 * giving up early (and returning the error) if a read fails. */
+	cur = first_ftable(tdb);
+	while (remaining-- > 0 && !TDB_OFF_IS_ERR(cur))
+		cur = next_ftable(tdb, cur);
+
+	return cur;
+}
+
+/* Note: we unlock the current bucket if we fail (-ve), or if we coalesce (+ve)
+ * and had to blatt the *protect record (which is then set to an error). */
+static tdb_len_t coalesce(struct tdb_context *tdb,
+			  tdb_off_t off, tdb_off_t b_off,
+			  tdb_len_t data_len,
+			  tdb_off_t *protect)
+{
+	tdb_off_t end;
+	struct tdb_free_record rec;
+	enum TDB_ERROR ecode;
+
+	tdb->stats.alloc_coalesce_tried++;
+	end = off + sizeof(struct tdb_used_record) + data_len; /* just past us */
+
+	while (end < tdb->file->map_size) {
+		const struct tdb_free_record *r;
+		tdb_off_t nb_off;
+		unsigned ftable, bucket;
+
+		r = tdb_access_read(tdb, end, sizeof(*r), true);
+		if (TDB_PTR_IS_ERR(r)) {
+			ecode = TDB_PTR_ERR(r);
+			goto err;
+		}
+
+		if (frec_magic(r) != TDB_FREE_MAGIC
+		    || frec_ftable(r) == TDB_FTABLE_NONE) {
+			tdb_access_release(tdb, r);
+			break;
+		}
+
+		ftable = frec_ftable(r);
+		bucket = size_to_bucket(frec_len(r));
+		nb_off = ftable_offset(tdb, ftable);
+		if (TDB_OFF_IS_ERR(nb_off)) {
+			tdb_access_release(tdb, r);
+			ecode = nb_off;
+			goto err;
+		}
+		nb_off = bucket_off(nb_off, bucket);
+		tdb_access_release(tdb, r);
+
+		/* We may be violating lock order here, so best effort (NOWAIT). */
+		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT)
+		    != TDB_SUCCESS) {
+			tdb->stats.alloc_coalesce_lockfail++;
+			break;
+		}
+
+		/* Now we have lock, re-read the neighbour and re-check it. */
+		ecode = tdb_read_convert(tdb, end, &rec, sizeof(rec));
+		if (ecode != TDB_SUCCESS) {
+			tdb_unlock_free_bucket(tdb, nb_off);
+			goto err;
+		}
+
+		if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
+			tdb->stats.alloc_coalesce_race++;
+			tdb_unlock_free_bucket(tdb, nb_off);
+			break;
+		}
+
+		if (unlikely(frec_ftable(&rec) != ftable)
+		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
+			tdb->stats.alloc_coalesce_race++;
+			tdb_unlock_free_bucket(tdb, nb_off);
+			break;
+		}
+
+		/* Did we just mess up a record you were hoping to use? */
+		if (end == *protect) {
+			tdb->stats.alloc_coalesce_iterate_clash++;
+			*protect = TDB_ERR_NOEXIST;
+		}
+
+		ecode = remove_from_list(tdb, nb_off, end, &rec);
+		check_list(tdb, nb_off);
+		if (ecode != TDB_SUCCESS) {
+			tdb_unlock_free_bucket(tdb, nb_off);
+			goto err;
+		}
+
+		end += sizeof(struct tdb_used_record) + frec_len(&rec);
+		tdb_unlock_free_bucket(tdb, nb_off);
+		tdb->stats.alloc_coalesce_num_merged++;
+	}
+
+	/* Didn't merge any adjacent free record?  Return 0, lock still held. */
+	if (end == off + sizeof(struct tdb_used_record) + data_len)
+		return 0;
+
+	/* Before we expand, check this isn't one you wanted protected? */
+	if (off == *protect) {
+		*protect = TDB_ERR_EXISTS;
+		tdb->stats.alloc_coalesce_iterate_clash++;
+	}
+
+	/* OK, expand initial record */
+	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+	if (ecode != TDB_SUCCESS) {
+		goto err;
+	}
+
+	if (frec_len(&rec) != data_len) {
+		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				   "coalesce: expected data len %zu not %zu",
+				   (size_t)data_len, (size_t)frec_len(&rec));
+		goto err;
+	}
+
+	ecode = remove_from_list(tdb, b_off, off, &rec);
+	check_list(tdb, b_off);
+	if (ecode != TDB_SUCCESS) {
+		goto err;
+	}
+
+	/* Try locking violation first.  We don't allow coalesce recursion! */
+	ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false);
+	if (ecode != TDB_SUCCESS) {
+		/* Need to drop lock.  Can't rely on anything stable. */
+		tdb->stats.alloc_coalesce_lockfail++;
+		*protect = TDB_ERR_CORRUPT;
+
+		/* We have to drop this to avoid deadlocks, so make sure record
+		 * doesn't get coalesced by someone else! */
+		rec.ftable_and_len = (TDB_FTABLE_NONE
+				      << (64 - TDB_OFF_UPPER_STEAL))
+			| (end - off - sizeof(struct tdb_used_record));
+		ecode = tdb_write_off(tdb,
+				      off + offsetof(struct tdb_free_record,
+						     ftable_and_len),
+				      rec.ftable_and_len);
+		if (ecode != TDB_SUCCESS) {
+			goto err;
+		}
+
+		tdb_unlock_free_bucket(tdb, b_off);
+
+		ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT,
+					false);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;	/* b_off was already unlocked above */
+		}
+	} else if (TDB_OFF_IS_ERR(*protect)) {
+		/* For simplicity, we always drop lock if they can't continue */
+		tdb_unlock_free_bucket(tdb, b_off);
+	}
+	tdb->stats.alloc_coalesce_succeeded++;
+
+	/* Return usable length (merged span minus one used-record header). */
+	return end - off - sizeof(struct tdb_used_record);
+
+err:
+	/* To unify error paths, we *always* unlock bucket on error. */
+	tdb_unlock_free_bucket(tdb, b_off);
+	return ecode;
+}
+
+/* List is locked on entry: it is unlocked again on every return path. */
+static enum TDB_ERROR coalesce_list(struct tdb_context *tdb,
+				    tdb_off_t ftable_off, /* not used here */
+				    tdb_off_t b_off,
+				    unsigned int limit)
+{
+	enum TDB_ERROR ecode;
+	tdb_off_t off;
+
+	off = tdb_read_off(tdb, b_off);
+	if (TDB_OFF_IS_ERR(off)) {
+		ecode = off;
+		goto unlock_err;
+	}
+	/* A little bit of paranoia: counter should be 0, mask it off anyway. */
+	off &= TDB_OFF_MASK;
+
+	while (off && limit--) {
+		struct tdb_free_record rec;
+		tdb_len_t coal;
+		tdb_off_t next;
+
+		ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		next = rec.next;	/* passed as *protect: set to error if clobbered */
+		coal = coalesce(tdb, off, b_off, frec_len(&rec), &next);
+		if (TDB_OFF_IS_ERR(coal)) {
+			/* This has already unlocked on error. */
+			return coal;
+		}
+		if (TDB_OFF_IS_ERR(next)) {
+			/* Coalescing had to unlock, so stop. */
+			return TDB_SUCCESS;
+		}
+		/* Keep going if we're doing well: bigger merges earn more tries. */
+		limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN);
+		off = next;
+	}
+
+	/* Now, move those elements to the tail of the list so we get something
+	 * else next time. */
+	if (off) {
+		struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec;
+		tdb_off_t oldhoff, oldtoff, newtoff;
+
+		/* The record we were up to is the new head. */
+		ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		/* Get the new tail (the record just before the new head). */
+		newtoff = frec_prev(&newhrec);
+		ecode = tdb_read_convert(tdb, newtoff, &newtrec,
+					 sizeof(newtrec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		/* Get the old head. */
+		oldhoff = tdb_read_off(tdb, b_off);
+		if (TDB_OFF_IS_ERR(oldhoff)) {
+			ecode = oldhoff;
+			goto unlock_err;
+		}
+
+		/* This could happen if they all coalesced away. */
+		if (oldhoff == off)
+			goto out;
+
+		ecode = tdb_read_convert(tdb, oldhoff, &oldhrec,
+					 sizeof(oldhrec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		/* Get the old tail. */
+		oldtoff = frec_prev(&oldhrec);
+		ecode = tdb_read_convert(tdb, oldtoff, &oldtrec,
+					 sizeof(oldtrec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		/* Old tail's next points to old head. */
+		oldtrec.next = oldhoff;
+
+		/* Old head's prev points to old tail. */
+		oldhrec.magic_and_prev
+			= (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL))
+			| oldtoff;
+
+		/* New tail's next is 0. */
+		newtrec.next = 0;
+
+		/* Write out the modified versions. */
+		ecode = tdb_write_convert(tdb, oldtoff, &oldtrec,
+					  sizeof(oldtrec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		ecode = tdb_write_convert(tdb, oldhoff, &oldhrec,
+					  sizeof(oldhrec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		ecode = tdb_write_convert(tdb, newtoff, &newtrec,
+					  sizeof(newtrec));
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+
+		/* And finally link in new head. */
+		ecode = tdb_write_off(tdb, b_off, off);
+		if (ecode != TDB_SUCCESS)
+			goto unlock_err;
+	}
+out:
+	tdb_unlock_free_bucket(tdb, b_off);
+	return TDB_SUCCESS;
+
+unlock_err:
+	tdb_unlock_free_bucket(tdb, b_off);
+	return ecode;
+}
+
+/* Add a free record to the current free table.  List must not be locked if coalesce is set. */
+enum TDB_ERROR add_free_record(struct tdb_context *tdb,
+			       tdb_off_t off, tdb_len_t len_with_header,
+			       enum tdb_lock_flags waitflag,
+			       bool coalesce)
+{
+	tdb_off_t b_off;
+	tdb_len_t len;
+	enum TDB_ERROR ecode;
+
+	assert(len_with_header >= sizeof(struct tdb_free_record));
+
+	len = len_with_header - sizeof(struct tdb_used_record); /* usable length */
+
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
+	ecode = tdb_lock_free_bucket(tdb, b_off, waitflag);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	ecode = enqueue_in_free(tdb, b_off, off, len, &coalesce); /* may clear coalesce */
+	check_list(tdb, b_off);
+
+	/* Coalescing unlocks free list; otherwise we unlock it ourselves. */
+	if (!ecode && coalesce)
+		ecode = coalesce_list(tdb, tdb->ftable_off, b_off, 2);
+	else
+		tdb_unlock_free_bucket(tdb, b_off);
+	return ecode;
+}
+
+static size_t adjust_size(size_t keylen, size_t datalen)
+{
+	/* Low bits that must be clear in a uint64_t-aligned length. */
+	const unsigned long long mask = sizeof(uint64_t) - 1ULL;
+	size_t total = keylen + datalen;
+	/* Clamp up to the minimum record size, then round up to alignment. */
+	if (total < TDB_MIN_DATA_LEN)
+		total = TDB_MIN_DATA_LEN;
+	return (total + mask) & ~mask;
+}
+
+/* Bytes worth splitting off as a new free record, or 0 if too few remain. */
+static size_t record_leftover(size_t keylen, size_t datalen,
+			      bool want_extra, size_t total_len)
+{
+	/* Growing records reserve an extra 50% of their data length. */
+	size_t reserve = want_extra ? datalen / 2 : 0;
+	ssize_t spare;
+
+	spare = total_len - adjust_size(keylen, datalen + reserve);
+	/* The remainder must be able to hold a free record (this also
+	 * rejects a negative remainder). */
+	if (spare >= (ssize_t)sizeof(struct tdb_free_record))
+		return spare;
+	return 0;
+}
+
+/* We need size bytes to put our key and data in.  Locks the bucket, takes
+ * the best-fitting free record (if any), writes its used header and returns
+ * its offset; returns 0 if nothing in this bucket fits, or a -ve error. */
+static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
+				tdb_off_t ftable_off,
+				tdb_off_t bucket,
+				size_t keylen, size_t datalen,
+				bool want_extra,
+				unsigned magic,
+				unsigned hashlow)
+{
+	tdb_off_t off, b_off,best_off;
+	struct tdb_free_record best = { 0 };
+	double multiplier;
+	size_t size = adjust_size(keylen, datalen);
+	enum TDB_ERROR ecode;
+
+	tdb->stats.allocs++;
+	b_off = bucket_off(ftable_off, bucket);
+
+	/* FIXME: Try non-blocking wait first, to measure contention. */
+	/* Lock this bucket. */
+	ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Start with the largest possible "best" so any fit beats it. */
+	best.ftable_and_len = -1ULL;
+	best_off = 0;
+
+	/* Get slack if we're after extra. */
+	if (want_extra)
+		multiplier = 1.5;
+	else
+		multiplier = 1.0;
+
+	/* Walk the list to see if any are large enough, getting less fussy
+	 * as we go. */
+	off = tdb_read_off(tdb, b_off);
+	if (TDB_OFF_IS_ERR(off)) {
+		ecode = off;
+		goto unlock_err;
+	}
+	off &= TDB_OFF_MASK;
+
+	while (off) {
+		const struct tdb_free_record *r;
+		tdb_off_t next;
+
+		r = tdb_access_read(tdb, off, sizeof(*r), true);
+		if (TDB_PTR_IS_ERR(r)) {
+			ecode = TDB_PTR_ERR(r);
+			goto unlock_err;
+		}
+
+		if (frec_magic(r) != TDB_FREE_MAGIC) {
+			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					   "lock_and_alloc:"
+					   " %llu non-free 0x%llx",
+					   (long long)off,
+					   (long long)r->magic_and_prev);
+			tdb_access_release(tdb, r);
+			goto unlock_err;
+		}
+
+		/* Smallest record seen so far that is still big enough. */
+		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
+			best_off = off;
+			best = *r;
+		}
+
+		/* Good enough for the current (growing) tolerance?  Stop. */
+		if (frec_len(&best) <= size * multiplier && best_off) {
+			tdb_access_release(tdb, r);
+			break;
+		}
+
+		multiplier *= 1.01;
+
+		next = r->next;
+		tdb_access_release(tdb, r);
+		off = next;
+	}
+
+	/* If we found anything at all, use it. */
+	if (best_off) {
+		struct tdb_used_record rec;
+		size_t leftover;
+
+		/* We're happy with this size: take it. */
+		ecode = remove_from_list(tdb, b_off, best_off, &best);
+		check_list(tdb, b_off);
+		if (ecode != TDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		leftover = record_leftover(keylen, datalen, want_extra,
+					   frec_len(&best));
+
+		assert(keylen + datalen + leftover <= frec_len(&best));
+		/* We need to mark non-free before we drop lock, otherwise
+		 * coalesce() could try to merge it! */
+		ecode = set_header(tdb, &rec, magic, keylen, datalen,
+				   frec_len(&best) - leftover, hashlow);
+		if (ecode != TDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		ecode = tdb_write_convert(tdb, best_off, &rec, sizeof(rec));
+		if (ecode != TDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		/* For futureproofing, we put a 0 in any unused space. */
+		if (rec_extra_padding(&rec)) {
+			ecode = tdb->methods->twrite(tdb, best_off + sizeof(rec)
+						     + keylen + datalen, "", 1);
+			if (ecode != TDB_SUCCESS) {
+				goto unlock_err;
+			}
+		}
+
+		/* Bucket of leftover will be <= current bucket, so nested
+		 * locking is allowed. */
+		if (leftover) {
+			tdb->stats.alloc_leftover++;
+			ecode = add_free_record(tdb,
+						best_off + sizeof(rec)
+						+ frec_len(&best) - leftover,
+						leftover, TDB_LOCK_WAIT, false);
+			if (ecode != TDB_SUCCESS) {
+				best_off = ecode;
+			}
+		}
+		tdb_unlock_free_bucket(tdb, b_off);
+
+		return best_off;
+	}
+
+	tdb_unlock_free_bucket(tdb, b_off);
+	return 0;
+
+unlock_err:
+	tdb_unlock_free_bucket(tdb, b_off);
+	return ecode;
+}
+
+/* Get a free block, current free table first then the rest; 0 if none, -ve on error. */
+static tdb_off_t get_free(struct tdb_context *tdb,
+			  size_t keylen, size_t datalen, bool want_extra,
+			  unsigned magic, unsigned hashlow)
+{
+	tdb_off_t off, ftable_off;
+	tdb_off_t start_b, b, ftable;
+	bool wrapped = false;
+
+	/* If they are growing, add 50% to get to higher bucket. */
+	if (want_extra)
+		start_b = size_to_bucket(adjust_size(keylen,
+						     datalen + datalen / 2));
+	else
+		start_b = size_to_bucket(adjust_size(keylen, datalen));
+
+	ftable_off = tdb->ftable_off;
+	ftable = tdb->ftable;
+	while (!wrapped || ftable_off != tdb->ftable_off) {
+		/* Start at exact size bucket, and search up... */
+		for (b = find_free_head(tdb, ftable_off, start_b);
+		     b < TDB_FREE_BUCKETS;
+		     b = find_free_head(tdb, ftable_off, b + 1)) {
+			/* Try getting one from list. */
+			off = lock_and_alloc(tdb, ftable_off,
+					     b, keylen, datalen, want_extra,
+					     magic, hashlow);
+			if (TDB_OFF_IS_ERR(off))
+				return off;
+			if (off != 0) {
+				if (b == start_b)
+					tdb->stats.alloc_bucket_exact++;
+				if (b == TDB_FREE_BUCKETS - 1)
+					tdb->stats.alloc_bucket_max++;
+				/* Worked?  Stay using this list (it becomes current). */
+				tdb->ftable_off = ftable_off;
+				tdb->ftable = ftable;
+				return off;
+			}
+			/* Didn't work.  Try next bucket. */
+		}
+
+		/* The bucket search itself may have ended on an error value. */
+		if (TDB_OFF_IS_ERR(b)) {
+			return b;
+		}
+
+		/* Hmm, try next table. */
+		ftable_off = next_ftable(tdb, ftable_off);
+		if (TDB_OFF_IS_ERR(ftable_off)) {
+			return ftable_off;
+		}
+		ftable++;
+
+		if (ftable_off == 0) {	/* end of chain: wrap to the first table */
+			wrapped = true;
+			ftable_off = first_ftable(tdb);
+			if (TDB_OFF_IS_ERR(ftable_off)) {
+				return ftable_off;
+			}
+			ftable = 0;
+		}
+	}
+
+	return 0;
+}
+
+enum TDB_ERROR set_header(struct tdb_context *tdb,
+			  struct tdb_used_record *rec,
+			  unsigned magic, uint64_t keylen, uint64_t datalen,
+			  uint64_t actuallen, unsigned hashlow)
+{
+	uint64_t keybits = (fls64(keylen) + 1) / 2; /* key gets 2*keybits bits below */
+
+	/* Use bottom bits of hash, so it's independent of hash table size. */
+	rec->magic_and_meta = (hashlow & ((1 << 11)-1))	/* low 11 hash bits */
+		| ((actuallen - (keylen + datalen)) << 11)	/* padding length */
+		| (keybits << 43)
+		| ((uint64_t)magic << 48);
+	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
+
+	/* Encoding can fail on big values: decode again and compare. */
+	if (rec_key_length(rec) != keylen
+	    || rec_data_length(rec) != datalen
+	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
+		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				  "Could not encode k=%llu,d=%llu,a=%llu",
+				  (long long)keylen, (long long)datalen,
+				  (long long)actuallen);
+	}
+	return TDB_SUCCESS;
+}
+
+/* Expand the database file and add the new space as one free record. */
+static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
+{
+	uint64_t old_size, rec_size, map_size;
+	tdb_len_t wanted;
+	enum TDB_ERROR ecode;
+
+	/* Need to hold a hash lock to expand DB: transactions rely on it. */
+	if (!(tdb->flags & TDB_NOLOCK)
+	    && !tdb->file->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_expand: must hold lock during expand");
+	}
+
+	/* Only one person can expand file at a time. */
+	ecode = tdb_lock_expand(tdb, F_WRLCK);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Someone else may have expanded the file: if so, let the caller retry. */
+	old_size = tdb->file->map_size;
+	tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+	if (tdb->file->map_size != old_size) {
+		tdb_unlock_expand(tdb, F_WRLCK);
+		return TDB_SUCCESS;
+	}
+
+	/* limit size in order to avoid using up huge amounts of memory for
+	 * in memory tdbs if an oddball huge record creeps in */
+	if (size > 100 * 1024) {
+		rec_size = size * 2;
+	} else {
+		rec_size = size * 100;
+	}
+
+	/* Always make room for at least rec_size more bytes, and at
+	   least 25% more space if the DB is no larger than 100MiB;
+	   otherwise grow it by 10% only. */
+	if (old_size > 100 * 1024 * 1024) {
+		map_size = old_size / 10;
+	} else {
+		map_size = old_size / 4;
+	}
+
+	if (map_size > rec_size) {
+		wanted = map_size;
+	} else {
+		wanted = rec_size;
+	}
+
+	/* We need room for the record header too. */
+	wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted);
+
+	ecode = tdb->methods->expand_file(tdb, wanted);
+	if (ecode != TDB_SUCCESS) {
+		tdb_unlock_expand(tdb, F_WRLCK);
+		return ecode;
+	}
+
+	/* We need to drop this lock before adding free record. */
+	tdb_unlock_expand(tdb, F_WRLCK);
+
+	tdb->stats.expands++;
+	return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT, true);
+}
+
+/* Allocate a record, expanding the database if it has to; offset or -ve error. */
+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
+		uint64_t hash, unsigned magic, bool growing)
+{
+	tdb_off_t off;
+
+	/* We can't hold pointers during this: we could unmap! */
+	assert(!tdb->direct_access);
+
+	for (;;) {
+		enum TDB_ERROR ecode;
+		off = get_free(tdb, keylen, datalen, growing, magic, hash);
+		if (likely(off != 0))	/* found a block, or a -ve error value */
+			break;
+
+		ecode = tdb_expand(tdb, adjust_size(keylen, datalen));
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+	}
+
+	return off;
+}
diff --git a/lib/tdb2/hash.c b/lib/tdb2/hash.c
new file mode 100644
index 0000000000..1359cfecd6
--- /dev/null
+++ b/lib/tdb2/hash.c
@@ -0,0 +1,881 @@
+ /*
+   Trivial Database 2: hash handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+
+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
+{	/* Delegate to this context's hash_fn with its own seed and hash_data. */
+	return tdb->hash_fn(ptr, len, tdb->hash_seed, tdb->hash_data);
+}
+
+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	const struct tdb_used_record *r;
+	const void *key;
+	uint64_t klen, hash;
+
+	r = tdb_access_read(tdb, off, sizeof(*r), true);
+	if (TDB_PTR_IS_ERR(r)) {
+		/* FIXME: a read error is reported as hash 0, not as an error. */
+		return 0;
+	}
+
+	klen = rec_key_length(r);
+	tdb_access_release(tdb, r);
+
+	key = tdb_access_read(tdb, off + sizeof(*r), klen, false); /* key follows header */
+	if (TDB_PTR_IS_ERR(key)) {
+		return 0;	/* same caveat as above */
+	}
+
+	hash = tdb_hash(tdb, key, klen);
+	tdb_access_release(tdb, key);
+	return hash;
+}
+
+/* Get num (<= 32) bits of val from bit start up; 1ULL keeps num == 32 defined. */
+static uint32_t bits_from(uint64_t val, unsigned start, unsigned num)
+{
+	assert(num <= 32);
+	return (val >> start) & ((1ULL << num) - 1);
+}
+
+/* Consume the next num hash bits.  We take bits from the top: that way we
+ * can lock whole sections of the hash by using lock ranges. */
+static uint32_t use_bits(struct hash_info *h, unsigned num)
+{
+	h->hash_used += num;	/* advance first, so the read ends at the new mark */
+	return bits_from(h->h, 64 - h->hash_used, num);
+}
+
+static tdb_bool_err key_matches(struct tdb_context *tdb,
+				const struct tdb_used_record *rec,
+				tdb_off_t off,
+				const struct tdb_data *key)
+{
+	const char *stored;
+	bool same;
+
+	/* Cheap rejection: lengths differ, so the key bytes are never read. */
+	if (rec_key_length(rec) != key->dsize) {
+		tdb->stats.compare_wrong_keylen++;
+		return false;
+	}
+
+	/* The stored key sits immediately after the record header. */
+	stored = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
+	if (TDB_PTR_IS_ERR(stored))
+		return TDB_PTR_ERR(stored);
+	same = (memcmp(stored, key->dptr, key->dsize) == 0);
+	tdb_access_release(tdb, stored);
+	if (!same)
+		tdb->stats.compare_wrong_keycmp++;
+	return same;
+}
+
+/* Does hash entry val refer to key?  Cheap checks first; true, false or -ve error. */
+static tdb_bool_err match(struct tdb_context *tdb,
+			  struct hash_info *h,
+			  const struct tdb_data *key,
+			  tdb_off_t val,
+			  struct tdb_used_record *rec)
+{
+	tdb_off_t off;
+	enum TDB_ERROR ecode;
+
+	tdb->stats.compares++;
+	/* Desired bucket (stored in the entry's low bits) must match. */
+	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
+		tdb->stats.compare_wrong_bucket++;
+		return false;
+	}
+
+	/* Top bits of offset == next bits of hash. */
+	if (bits_from(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
+	    != bits_from(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
+		    TDB_OFF_UPPER_STEAL_EXTRA)) {
+		tdb->stats.compare_wrong_offsetbits++;
+		return false;
+	}
+
+	off = val & TDB_OFF_MASK;	/* strip stolen bits: the record's offset */
+	ecode = tdb_read_convert(tdb, off, rec, sizeof(*rec));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) { /* low 11 hash bits, as in set_header */
+		tdb->stats.compare_wrong_rechash++;
+		return false;
+	}
+
+	return key_matches(tdb, rec, off, key);
+}
+
+static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
+{	/* Wrap bucket into the group, then scale it to a byte offset. */
+	unsigned slot = bucket % (1 << TDB_HASH_GROUP_BITS);
+	return group_start + slot * sizeof(tdb_off_t);
+}
+
+bool is_subhash(tdb_off_t val)
+{	/* True when bit TDB_OFF_UPPER_STEAL_SUBHASH_BIT of val is set. */
+	return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
+}
+
+/* Hash-lock range for a top-level group.  FIXME: Guess the depth, don't over-lock! */
+static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
+{
+	*size = 1ULL << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); /* length */
+	return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); /* start */
+}
+
+static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
+				    struct tdb_data key,
+				    tdb_off_t chain,
+				    struct hash_info *h,
+				    struct tdb_used_record *rec,
+				    struct traverse_info *tinfo)
+{
+	tdb_off_t off, next;
+	enum TDB_ERROR ecode;
+
+	/* In case nothing is free, we set these to zero. */
+	h->home_bucket = h->found_bucket = 0;
+
+	for (off = chain; off; off = next) {
+		unsigned int i;
+
+		h->group_start = off;
+		ecode = tdb_read_convert(tdb, off, h->group, sizeof(h->group));
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+
+		for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+			tdb_off_t recoff;
+			if (!h->group[i]) {
+				/* Remember this empty bucket (last one seen wins). */
+				h->home_bucket = h->found_bucket = i;
+				continue;
+			}
+
+			/* We can insert extra bits via add_to_hash
+			 * empty bucket logic. */
+			recoff = h->group[i] & TDB_OFF_MASK;
+			ecode = tdb_read_convert(tdb, recoff, rec,
+						 sizeof(*rec));
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+
+			ecode = key_matches(tdb, rec, recoff, &key); /* 1, 0 or -ve error */
+			if (ecode < 0) {
+				return ecode;
+			}
+			if (ecode == 1) {
+				h->home_bucket = h->found_bucket = i;
+
+				if (tinfo) {	/* record this chain level for traversal */
+					tinfo->levels[tinfo->num_levels]
+						.hashtable = off;
+					tinfo->levels[tinfo->num_levels]
+						.total_buckets
+						= 1 << TDB_HASH_GROUP_BITS;
+					tinfo->levels[tinfo->num_levels].entry
+						= i;
+					tinfo->num_levels++;
+				}
+				return recoff;
+			}
+		}
+		next = tdb_read_off(tdb, off
+				    + offsetof(struct tdb_chain, next));
+		if (TDB_OFF_IS_ERR(next)) {
+			return next;
+		}
+		if (next)	/* skip the next chain record's header */
+			next += sizeof(struct tdb_used_record);
+	}
+	return 0;	/* not found; h->group_start is the last group read */
+}
+
+/* Core lookup: lock the key's hash range, then search the hashtable for it.
+ * On error no locks are held and -ve is returned.  Otherwise h (and the
+ * optional tinfo) is filled in; returns 0 if absent, else offset with *rec. */
+tdb_off_t find_and_lock(struct tdb_context *tdb,
+			struct tdb_data key,
+			int ltype,
+			struct hash_info *h,
+			struct tdb_used_record *rec,
+			struct traverse_info *tinfo)
+{
+	uint32_t i, group;
+	tdb_off_t hashtable, found;
+	enum TDB_ERROR ecode;
+
+	h->h = tdb_hash(tdb, key.dptr, key.dsize);
+	h->hash_used = 0;
+	group = use_bits(h, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
+	h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
+
+	h->hlock_start = hlock_range(group, &h->hlock_range);
+	ecode = tdb_lock_hashes(tdb, h->hlock_start, h->hlock_range, ltype,
+				TDB_LOCK_WAIT);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	hashtable = offsetof(struct tdb_header, hashtable);
+	if (tinfo) {
+		tinfo->toplevel_group = group;
+		tinfo->num_levels = 1;
+		tinfo->levels[0].entry = 0;
+		tinfo->levels[0].hashtable = hashtable
+			+ (group << TDB_HASH_GROUP_BITS) * sizeof(tdb_off_t);
+		tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
+	}
+
+	while (h->hash_used <= 64) {
+		/* Read in the hash group. */
+		h->group_start = hashtable
+			+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+
+		ecode = tdb_read_convert(tdb, h->group_start, &h->group,
+					 sizeof(h->group));
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+
+		/* Pointer to another hash table?  Go down... */
+		if (is_subhash(h->group[h->home_bucket])) {
+			hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
+				+ sizeof(struct tdb_used_record);
+			if (tinfo) {
+				/* When we come back, use *next* bucket */
+				tinfo->levels[tinfo->num_levels-1].entry
+					+= h->home_bucket + 1;
+			}
+			group = use_bits(h, TDB_SUBLEVEL_HASH_BITS
+					 - TDB_HASH_GROUP_BITS);
+			h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
+			if (tinfo) {
+				tinfo->levels[tinfo->num_levels].hashtable
+					= hashtable;
+				tinfo->levels[tinfo->num_levels].total_buckets
+					= 1 << TDB_SUBLEVEL_HASH_BITS;
+				tinfo->levels[tinfo->num_levels].entry
+					= group << TDB_HASH_GROUP_BITS;
+				tinfo->num_levels++;
+			}
+			continue;
+		}
+
+		/* It's in this group: search (until 0 or all searched) */
+		for (i = 0, h->found_bucket = h->home_bucket;
+		     i < (1 << TDB_HASH_GROUP_BITS);
+		     i++, h->found_bucket = ((h->found_bucket+1)
+					     % (1 << TDB_HASH_GROUP_BITS))) {
+			tdb_bool_err berr;
+			if (is_subhash(h->group[h->found_bucket]))
+				continue;
+
+			if (!h->group[h->found_bucket])
+				break;
+
+			berr = match(tdb, h, &key, h->group[h->found_bucket],
+				     rec);
+			if (berr < 0) {
+				ecode = berr;
+				goto fail;
+			}
+			if (berr) {
+				if (tinfo) {
+					tinfo->levels[tinfo->num_levels-1].entry
+						+= h->found_bucket;
+				}
+				return h->group[h->found_bucket] & TDB_OFF_MASK;
+			}
+		}
+		/* Didn't find it: h indicates where it would go. */
+		return 0;
+	}
+
+	found = find_in_chain(tdb, key, hashtable, h, rec, tinfo);
+	if (!TDB_OFF_IS_ERR(found))
+		return found;
+	ecode = found;	/* chain search failed: must not keep the lock */
+fail:
+	tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
+	return ecode;
+}
+
+/* I wrote a simple test, expanding a hash to 2GB, for the following
+ * cases:
+ * 1) Expanding all the buckets at once,
+ * 2) Expanding the bucket we wanted to place the new entry into.
+ * 3) Expanding the most-populated bucket,
+ *
+ * I measured the worst/average/best density during this process.
+ * 1) 3%/16%/30%
+ * 2) 4%/20%/38%
+ * 3) 6%/22%/41%
+ *
+ * So we figure out the busiest bucket for the moment: the home bucket
+ * shared by the most entries in "group", counting the new entry too. */
+static unsigned fullest_bucket(struct tdb_context *tdb,
+			       const tdb_off_t *group,
+			       unsigned new_bucket)
+{
+	unsigned counts[1 << TDB_HASH_GROUP_BITS] = { 0 };
+	unsigned int i, best_bucket;
+
+	/* Count the new entry. */
+	counts[new_bucket]++;
+	best_bucket = new_bucket;
+
+	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+		unsigned this_bucket;
+
+		/* NOTE(review): empty slots (0) are tallied as bucket 0. */
+		if (is_subhash(group[i]))
+			continue;
+		this_bucket = group[i] & TDB_OFF_HASH_GROUP_MASK;
+		if (++counts[this_bucket] > counts[best_bucket])
+			best_bucket = this_bucket;
+	}
+
+	return best_bucket;
+}
+
+/* Probe from "bucket" (wrapping) for a free slot; false if group is full. */
+static bool put_into_group(tdb_off_t *group,
+			   unsigned bucket, tdb_off_t encoded)
+{
+	unsigned probe, slot, nslots = 1 << TDB_HASH_GROUP_BITS;
+
+	for (probe = 0; probe < nslots; probe++) {
+		slot = (bucket + probe) % nslots;
+		if (group[slot])
+			continue;
+		group[slot] = encoded;
+		return true;
+	}
+	return false;
+}
+
+static void force_into_group(tdb_off_t *group,
+			     unsigned bucket, tdb_off_t encoded)
+{	/* As put_into_group(), but a full group aborts the process. */
+	if (!put_into_group(group, bucket, encoded))
+		abort();
+}
+
+static tdb_off_t encode_offset(tdb_off_t new_off, struct hash_info *h)
+{	/* OR together home bucket, record offset and extra bits of h->h. */
+	return h->home_bucket
+		| new_off
+		| ((uint64_t)bits_from(h->h,
+				  64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
+				  TDB_OFF_UPPER_STEAL_EXTRA)
+		   << TDB_OFF_HASH_EXTRA_BIT);
+}
+
+/* Simply overwrite the hash entry we found before (h->found_bucket). */
+enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
+			       struct hash_info *h,
+			       tdb_off_t new_off)
+{
+	return tdb_write_off(tdb, hbucket_off(h->group_start, h->found_bucket),
+			     encode_offset(new_off, h));
+}
+
+/* Put new_off in the chain's first empty slot; subhash is a contents offset. */
+static enum TDB_ERROR COLD add_to_chain(struct tdb_context *tdb,
+					tdb_off_t subhash,
+					tdb_off_t new_off)
+{
+	tdb_off_t entry;
+	enum TDB_ERROR ecode;
+
+	entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
+	if (TDB_OFF_IS_ERR(entry)) {
+		return entry;
+	}
+
+	if (entry == 1 << TDB_HASH_GROUP_BITS) {
+		tdb_off_t next;
+
+		next = tdb_read_off(tdb, subhash
+				    + offsetof(struct tdb_chain, next));
+		if (TDB_OFF_IS_ERR(next)) {
+			return next;
+		}
+
+		if (!next) {
+			next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
+				     TDB_CHAIN_MAGIC, false);
+			if (TDB_OFF_IS_ERR(next))
+				return next;
+			ecode = zero_out(tdb,
+					 next+sizeof(struct tdb_used_record),
+					 sizeof(struct tdb_chain));
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+			ecode = tdb_write_off(tdb, subhash
+				+ offsetof(struct tdb_chain, next), next);
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+		}
+		/* "next" is a record offset; its slots start past the header,
+		 * exactly as find_in_chain() and iterate_hash() treat it. */
+		return add_to_chain(tdb, next + sizeof(struct tdb_used_record),
+				    new_off);
+	}
+
+	return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
+			     new_off);
+}
+
+/* Add existing entry "val" into a newly created subhash (contents offset). */
+static enum TDB_ERROR add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
+				     unsigned hash_used, tdb_off_t val)
+{
+	tdb_off_t off = (val & TDB_OFF_MASK), *group;
+	struct hash_info h;
+	unsigned int gnum;
+
+	h.hash_used = hash_used;
+
+	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)	/* no hash bits left */
+		return add_to_chain(tdb, subhash, off);
+
+	h.h = hash_record(tdb, off);	/* rehash the record at "off" */
+	gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
+	h.group_start = subhash
+		+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+	h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
+
+	group = tdb_access_write(tdb, h.group_start,
+				 sizeof(*group) << TDB_HASH_GROUP_BITS, true);
+	if (TDB_PTR_IS_ERR(group)) {
+		return TDB_PTR_ERR(group);
+	}
+	force_into_group(group, h.home_bucket, encode_offset(off, &h));
+	return tdb_access_commit(tdb, group);
+}
+
+static enum TDB_ERROR expand_group(struct tdb_context *tdb, struct hash_info *h)
+{	/* Hang a new subhash (or chain) off h->group; h->group is not written. */
+	unsigned bucket, num_vals, i, magic;
+	size_t subsize;
+	tdb_off_t subhash;
+	tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
+	enum TDB_ERROR ecode;
+
+	/* Attach new empty subhash under fullest bucket. */
+	bucket = fullest_bucket(tdb, h->group, h->home_bucket);
+
+	if (h->hash_used == 64) {	/* all hash bits used: make a chain */
+		tdb->stats.alloc_chain++;
+		subsize = sizeof(struct tdb_chain);
+		magic = TDB_CHAIN_MAGIC;
+	} else {
+		tdb->stats.alloc_subhash++;
+		subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
+		magic = TDB_HTABLE_MAGIC;
+	}
+
+	subhash = alloc(tdb, 0, subsize, 0, magic, false);
+	if (TDB_OFF_IS_ERR(subhash)) {
+		return subhash;
+	}
+
+	ecode = zero_out(tdb, subhash + sizeof(struct tdb_used_record),
+			 subsize);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Remove any which are destined for bucket or are in wrong place. */
+	num_vals = 0;
+	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+		unsigned home_bucket = h->group[i] & TDB_OFF_HASH_GROUP_MASK;
+		if (!h->group[i] || is_subhash(h->group[i]))
+			continue;
+		if (home_bucket == bucket || home_bucket != i) {
+			vals[num_vals++] = h->group[i];
+			h->group[i] = 0;
+		}
+	}
+	/* FIXME: This assert is valid, but we do this during unit test :( */
+	/* assert(num_vals); */
+
+	/* Overwrite expanded bucket with subhash pointer. */
+	h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
+
+	/* Point to actual contents of record. */
+	subhash += sizeof(struct tdb_used_record);
+
+	/* Put values back. */
+	for (i = 0; i < num_vals; i++) {
+		unsigned this_bucket = vals[i] & TDB_OFF_HASH_GROUP_MASK;
+
+		if (this_bucket == bucket) {	/* moves down into the subhash */
+			ecode = add_to_subhash(tdb, subhash, h->hash_used,
+					       vals[i]);
+			if (ecode != TDB_SUCCESS)
+				return ecode;
+		} else {
+			/* There should be room to put this back. */
+			force_into_group(h->group, this_bucket, vals[i]);
+		}
+	}
+	return TDB_SUCCESS;
+}
+
+enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
+{	/* Clear h->found_bucket, re-place displaced followers, write group. */
+	unsigned int i, num_movers = 0;
+	tdb_off_t movers[1 << TDB_HASH_GROUP_BITS];
+
+	h->group[h->found_bucket] = 0;
+	for (i = 1; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+		unsigned this_bucket;
+
+		this_bucket = (h->found_bucket+i) % (1 << TDB_HASH_GROUP_BITS);
+		/* Empty bucket?  We're done. */
+		if (!h->group[this_bucket])
+			break;
+
+		/* Ignore subhashes. */
+		if (is_subhash(h->group[this_bucket]))
+			continue;
+
+		/* If this one is not happy where it is, we'll move it. */
+		if ((h->group[this_bucket] & TDB_OFF_HASH_GROUP_MASK)
+		    != this_bucket) {
+			movers[num_movers++] = h->group[this_bucket];
+			h->group[this_bucket] = 0;
+		}
+	}
+
+	/* Put back the ones we erased, probing from each one's home bucket. */
+	for (i = 0; i < num_movers; i++) {
+		force_into_group(h->group, movers[i] & TDB_OFF_HASH_GROUP_MASK,
+				 movers[i]);
+	}
+
+	/* Now we write back the hash group */
+	return tdb_write_convert(tdb, h->group_start,
+				 h->group, sizeof(h->group));
+}
+
+enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
+			   tdb_off_t new_off)
+{	/* Insert new_off at the position h describes, expanding if needed. */
+	enum TDB_ERROR ecode;
+
+	/* We hit an empty bucket during search?  That's where it goes. */
+	if (!h->group[h->found_bucket]) {
+		h->group[h->found_bucket] = encode_offset(new_off, h);
+		/* Write back the modified group. */
+		return tdb_write_convert(tdb, h->group_start,
+					 h->group, sizeof(h->group));
+	}
+
+	if (h->hash_used > 64)	/* h->group_start is a chain group here */
+		return add_to_chain(tdb, h->group_start, new_off);
+
+	/* We're full.  Expand. */
+	ecode = expand_group(tdb, h);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (is_subhash(h->group[h->home_bucket])) {
+		/* We were expanded! */
+		tdb_off_t hashtable;
+		unsigned int gnum;
+
+		/* Write back the modified group. */
+		ecode = tdb_write_convert(tdb, h->group_start, h->group,
+					  sizeof(h->group));
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+
+		/* Move hashinfo down a level. */
+		hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
+			+ sizeof(struct tdb_used_record);
+		gnum = use_bits(h,TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
+		h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
+		h->group_start = hashtable
+			+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+		ecode = tdb_read_convert(tdb, h->group_start, &h->group,
+					 sizeof(h->group));
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+	}
+
+	/* Expanding the group must have made room if it didn't choose this
+	 * bucket. */
+	if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){
+		return tdb_write_convert(tdb, h->group_start,
+					 h->group, sizeof(h->group));
+	}
+
+	/* This can happen if all hashes in group (and us) dropped into same
+	 * group in subhash. */
+	return add_to_hash(tdb, h, new_off);
+}
+
+/* Traverse support: returns offset of next record, or 0 or -ve error. */
+static tdb_off_t iterate_hash(struct tdb_context *tdb,
+			      struct traverse_info *tinfo)
+{
+	tdb_off_t off, val, i;
+	struct traverse_level *tlevel;
+
+	tlevel = &tinfo->levels[tinfo->num_levels-1];	/* deepest level */
+
+again:
+	for (i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
+				      tlevel->entry, tlevel->total_buckets);
+	     i != tlevel->total_buckets;
+	     i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
+				      i+1, tlevel->total_buckets)) {
+		if (TDB_OFF_IS_ERR(i)) {
+			return i;
+		}
+
+		val = tdb_read_off(tdb, tlevel->hashtable+sizeof(tdb_off_t)*i);
+		if (TDB_OFF_IS_ERR(val)) {
+			return val;
+		}
+
+		off = val & TDB_OFF_MASK;
+
+		/* This makes the delete-all-in-traverse case work
+		 * (and simplifies our logic a little). */
+		if (off == tinfo->prev)
+			continue;
+
+		tlevel->entry = i;
+
+		if (!is_subhash(val)) {
+			/* Found one. */
+			tinfo->prev = off;
+			return off;
+		}
+
+		/* When we come back, we want the next one */
+		tlevel->entry++;
+		tinfo->num_levels++;
+		tlevel++;
+		tlevel->hashtable = off + sizeof(struct tdb_used_record);
+		tlevel->entry = 0;
+		/* Next level is a chain? */
+		if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
+			tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
+		else
+			tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
+		goto again;
+	}
+
+	/* Nothing there? */
+	if (tinfo->num_levels == 1)
+		return 0;
+
+	/* Handle chained entries: follow the chain's next link, if any. */
+	if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
+		tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
+						 + offsetof(struct tdb_chain,
+							    next));
+		if (TDB_OFF_IS_ERR(tlevel->hashtable)) {
+			return tlevel->hashtable;
+		}
+		if (tlevel->hashtable) {
+			tlevel->hashtable += sizeof(struct tdb_used_record);
+			tlevel->entry = 0;
+			goto again;
+		}
+	}
+
+	/* Go back up and keep searching. */
+	tinfo->num_levels--;
+	tlevel--;
+	goto again;
+}
+
+/* Return success if we find something, TDB_ERR_NOEXIST if none. */
+enum TDB_ERROR next_in_hash(struct tdb_context *tdb,
+			    struct traverse_info *tinfo,
+			    TDB_DATA *kbuf, size_t *dlen)
+{
+	const unsigned group_bits = TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS;
+	tdb_off_t hl_start, hl_range, off;
+	enum TDB_ERROR ecode;
+
+	while (tinfo->toplevel_group < (1 << group_bits)) {
+		/* Use the same start/length find_and_lock() and chainlock()
+		 * use for this group, so all lockers agree on the range. */
+		hl_start = hlock_range(tinfo->toplevel_group, &hl_range);
+		ecode = tdb_lock_hashes(tdb, hl_start, hl_range, F_RDLCK,
+					TDB_LOCK_WAIT);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+
+		off = iterate_hash(tdb, tinfo);
+		if (off) {
+			struct tdb_used_record rec;
+
+			if (TDB_OFF_IS_ERR(off)) {
+				ecode = off;
+				goto fail;
+			}
+
+			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+			if (ecode != TDB_SUCCESS) {
+				goto fail;
+			}
+			if (rec_magic(&rec) != TDB_USED_MAGIC) {
+				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_LOG_ERROR,
+						   "next_in_hash:"
+						   " corrupt record at %llu",
+						   (unsigned long long)off);
+				goto fail;
+			}
+
+			kbuf->dsize = rec_key_length(&rec);
+
+			/* They want data as well?  Then read key+data at once;
+			 * the caller owns (and frees) the returned buffer. */
+			if (dlen) {
+				*dlen = rec_data_length(&rec);
+				kbuf->dptr = tdb_alloc_read(tdb,
+							    off + sizeof(rec),
+							    kbuf->dsize
+							    + *dlen);
+			} else {
+				kbuf->dptr = tdb_alloc_read(tdb,
+							    off + sizeof(rec),
+							    kbuf->dsize);
+			}
+			tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
+			if (TDB_PTR_IS_ERR(kbuf->dptr)) {
+				return TDB_PTR_ERR(kbuf->dptr);
+			}
+			return TDB_SUCCESS;
+		}
+
+		tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
+
+		/* This group is exhausted: restart at the next toplevel group. */
+		tinfo->toplevel_group++;
+		tinfo->levels[0].hashtable
+			+= (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+		tinfo->levels[0].entry = 0;
+	}
+	return TDB_ERR_NOEXIST;
+
+fail:
+	tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
+	return ecode;
+}
+
+enum TDB_ERROR first_in_hash(struct tdb_context *tdb,
+			     struct traverse_info *tinfo,
+			     TDB_DATA *kbuf, size_t *dlen)
+{	/* Reset tinfo to the first toplevel group, then fetch as next_in_hash. */
+	tinfo->prev = 0;
+	tinfo->toplevel_group = 0;
+	tinfo->num_levels = 1;
+	tinfo->levels[0].hashtable = offsetof(struct tdb_header, hashtable);
+	tinfo->levels[0].entry = 0;
+	tinfo->levels[0].total_buckets = (1 << TDB_HASH_GROUP_BITS);
+
+	return next_in_hash(tdb, tinfo, kbuf, dlen);
+}
+
+/* Even if the entry isn't in this hash bucket, you'd have to lock this
+ * bucket to find it.  "func" is the caller's name, used for tracing. */
+static enum TDB_ERROR chainlock(struct tdb_context *tdb, const TDB_DATA *key,
+				int ltype, enum tdb_lock_flags waitflag,
+				const char *func)
+{
+	enum TDB_ERROR ecode;
+	uint64_t h = tdb_hash(tdb, key->dptr, key->dsize);
+	tdb_off_t lockstart, locksize;
+	unsigned int group, gbits;
+
+	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
+	group = bits_from(h, 64 - gbits, gbits);	/* toplevel group of key */
+
+	lockstart = hlock_range(group, &locksize);
+
+	ecode = tdb_lock_hashes(tdb, lockstart, locksize, ltype, waitflag);
+	tdb_trace_1rec(tdb, func, *key);	/* traced even if locking failed */
+	return ecode;
+}
+
+/* lock/unlock one hash chain (write lock here). This is meant to be used to
+   reduce contention - it cannot guarantee how many records will be locked */
+enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
+{
+	return tdb->last_error = chainlock(tdb, &key, F_WRLCK, TDB_LOCK_WAIT,
+					   "tdb_chainlock");
+}
+
+void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
+{	/* Release the F_WRLCK range that chainlock() computes for this key. */
+	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
+	tdb_off_t lockstart, locksize;
+	unsigned int group, gbits;
+
+	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
+	group = bits_from(h, 64 - gbits, gbits);
+
+	lockstart = hlock_range(group, &locksize);
+
+	tdb_trace_1rec(tdb, "tdb_chainunlock", key);
+	tdb_unlock_hashes(tdb, lockstart, locksize, F_WRLCK);
+}
+
+enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
+{	/* Read-lock variant of tdb_chainlock(); result kept in last_error. */
+	return tdb->last_error = chainlock(tdb, &key, F_RDLCK, TDB_LOCK_WAIT,
+					   "tdb_chainlock_read");
+}
+
+void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
+{	/* Release the F_RDLCK range that chainlock() computes for this key. */
+	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
+	tdb_off_t lockstart, locksize;
+	unsigned int group, gbits;
+
+	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
+	group = bits_from(h, 64 - gbits, gbits);
+
+	lockstart = hlock_range(group, &locksize);
+
+	tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
+	tdb_unlock_hashes(tdb, lockstart, locksize, F_RDLCK);
+}
diff --git a/lib/tdb2/io.c b/lib/tdb2/io.c
new file mode 100644
index 0000000000..8c5f45f308
--- /dev/null
+++ b/lib/tdb2/io.c
@@ -0,0 +1,615 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000-2003
+   Copyright (C) Rusty Russell			   2010
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+#include <ccan/likely/likely.h>
+
+void tdb_munmap(struct tdb_file *file)
+{	/* Drop the file's mapping, if any; does nothing when fd is -1. */
+	if (file->fd == -1)
+		return;
+
+	if (file->map_ptr) {
+		munmap(file->map_ptr, file->map_size);
+		file->map_ptr = NULL;
+	}
+}
+
+void tdb_mmap(struct tdb_context *tdb)
+{	/* Map the whole file; on failure map_ptr is NULL and a warning logged. */
+	if (tdb->flags & TDB_INTERNAL)
+		return;
+
+	if (tdb->flags & TDB_NOMMAP)
+		return;
+
+	/* size_t can be smaller than off_t. */
+	if ((size_t)tdb->file->map_size == tdb->file->map_size) {
+		tdb->file->map_ptr = mmap(NULL, tdb->file->map_size,
+					  tdb->mmap_flags,
+					  MAP_SHARED, tdb->file->fd, 0);
+	} else
+		tdb->file->map_ptr = MAP_FAILED;
+
+	/*
+	 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
+	 */
+	if (tdb->file->map_ptr == MAP_FAILED) {
+		tdb->file->map_ptr = NULL;
+		tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+			   "tdb_mmap failed for size %lld (%s)",
+			   (long long)tdb->file->map_size, strerror(errno));
+	}
+}
+
+/* Check for an out of bounds access: if "len" (the minimum length needed
+   for the db) is beyond our map, see if the database has been expanded by
+   someone else and remap if so.  Returns TDB_SUCCESS when len is in range,
+   TDB_ERR_IO (logged unless "probe") or a locking error otherwise.
+*/
+static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len,
+			      bool probe)
+{
+	struct stat st;
+	enum TDB_ERROR ecode;
+
+	/* We can't hold pointers during this: we could unmap! */
+	assert(!tdb->direct_access
+	       || (tdb->flags & TDB_NOLOCK)
+	       || tdb_has_expansion_lock(tdb));
+
+	if (len <= tdb->file->map_size)
+		return TDB_SUCCESS;
+	if (tdb->flags & TDB_INTERNAL) {
+		if (!probe) {
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				 "tdb_oob len %lld beyond internal"
+				 " malloc size %lld",
+				 (long long)len,
+				 (long long)tdb->file->map_size);
+		}
+		return TDB_ERR_IO;
+	}
+
+	ecode = tdb_lock_expand(tdb, F_RDLCK);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (fstat(tdb->file->fd, &st) != 0) {
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+			   "Failed to fstat file: %s", strerror(errno));
+		tdb_unlock_expand(tdb, F_RDLCK);
+		return TDB_ERR_IO;
+	}
+
+	tdb_unlock_expand(tdb, F_RDLCK);
+
+	if (st.st_size < (size_t)len) {
+		if (!probe) {
+			/* st_size is an off_t: cast so it matches %zu. */
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_oob len %zu beyond eof at %zu",
+				   (size_t)len, (size_t)st.st_size);
+		}
+		return TDB_ERR_IO;
+	}
+
+	/* Unmap, update size, remap */
+	tdb_munmap(tdb->file);
+	tdb->file->map_size = st.st_size;
+	tdb_mmap(tdb);
+	return TDB_SUCCESS;
+}
+
+/* Endian conversion: byte-swaps in place; we only deal with 8 byte quantities */
+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
+{
+	assert(size % 8 == 0);
+	if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
+		uint64_t i, *p = (uint64_t *)buf;
+		for (i = 0; i < size / 8; i++)
+			p[i] = bswap_64(p[i]);
+	}
+	return buf;	/* same pointer, for call chaining */
+}
+
+/* Return index (from start) of first non-zero offset in the offset array at
+ * base, or end if none, or -ve error.  FIXME: Return the off? */
+uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
+			      tdb_off_t base, uint64_t start, uint64_t end)
+{
+	uint64_t i;
+	const uint64_t *val;
+
+	/* Zero vs non-zero is the same unconverted: minor optimization. */
+	val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
+			      (end - start) * sizeof(tdb_off_t), false);
+	if (TDB_PTR_IS_ERR(val)) {
+		return TDB_PTR_ERR(val);
+	}
+
+	for (i = 0; i < (end - start); i++) {
+		if (val[i])
+			break;
+	}
+	tdb_access_release(tdb, val);
+	return start + i;
+}
+
+/* Return index of first zero entry in the num-offset array at off, or num
+ * if there is none, or -ve error. */
+uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
+			   uint64_t num)
+{
+	uint64_t i;
+	const uint64_t *val;
+
+	/* Zero vs non-zero is the same unconverted: minor optimization. */
+	val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
+	if (TDB_PTR_IS_ERR(val)) {
+		return TDB_PTR_ERR(val);
+	}
+	for (i = 0; i < num; i++) {
+		if (!val[i])
+			break;
+	}
+	tdb_access_release(tdb, val);
+	return i;
+}
+
+enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
+{	/* Zero len bytes at off: memset if direct access works, else twrite. */
+	char buf[8192] = { 0 };
+	void *p = tdb->methods->direct(tdb, off, len, true);
+	enum TDB_ERROR ecode = TDB_SUCCESS;
+
+	assert(!tdb->read_only);
+	if (TDB_PTR_IS_ERR(p)) {
+		return TDB_PTR_ERR(p);
+	}
+	if (p) {
+		memset(p, 0, len);
+		return ecode;
+	}
+	while (len) {	/* no direct pointer: write zeroes a buffer at a time */
+		unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
+		ecode = tdb->methods->twrite(tdb, off, buf, todo);
+		if (ecode != TDB_SUCCESS) {
+			break;
+		}
+		len -= todo;
+		off += todo;
+	}
+	return ecode;
+}
+
+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
+{	/* Read one tdb_off_t at off; an error code is returned as the value. */
+	tdb_off_t ret;
+	enum TDB_ERROR ecode;
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
+						    false);
+		if (TDB_PTR_IS_ERR(p)) {
+			return TDB_PTR_ERR(p);
+		}
+		if (p)	/* fast path: no conversion needed, read in place */
+			return *p;
+	}
+
+	ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	return ret;
+}
+
+/* write a lump of data at a specified offset (memcpy if mapped, else pwrite) */
+static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off,
+				const void *buf, tdb_len_t len)
+{
+	enum TDB_ERROR ecode;
+
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Write to read-only database");
+	}
+
+	ecode = tdb->methods->oob(tdb, off + len, 0);	/* bounds check first */
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (tdb->file->map_ptr) {
+		memcpy(off + (char *)tdb->file->map_ptr, buf, len);
+	} else {
+		ssize_t ret;
+		ret = pwrite(tdb->file->fd, buf, len, off);
+		if (ret != len) {
+			/* This shouldn't happen: we avoid sparse files. */
+			if (ret >= 0)
+				errno = ENOSPC;
+
+			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					  "tdb_write: %zi at %zu len=%zu (%s)",
+					  ret, (size_t)off, (size_t)len,
+					  strerror(errno));
+		}
+	}
+	return TDB_SUCCESS;
+}
+
+/* read a lump of data at a specified offset (memcpy if mapped, else pread) */
+static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off,
+			       void *buf, tdb_len_t len)
+{
+	enum TDB_ERROR ecode;
+
+	ecode = tdb->methods->oob(tdb, off + len, 0);	/* bounds check first */
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (tdb->file->map_ptr) {
+		memcpy(buf, off + (char *)tdb->file->map_ptr, len);
+	} else {
+		ssize_t r = pread(tdb->file->fd, buf, len, off);
+		if (r != len) {	/* short reads are treated as I/O errors */
+			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					  "tdb_read failed with %zi at %zu "
+					  "len=%zu (%s) map_size=%zu",
+					  r, (size_t)off, (size_t)len,
+					  strerror(errno),
+					  (size_t)tdb->file->map_size);
+		}
+	}
+	return TDB_SUCCESS;
+}
+
+enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
+				 const void *rec, size_t len)
+{	/* Write rec at off, byte-swapping a temporary copy if TDB_CONVERT. */
+	enum TDB_ERROR ecode;
+
+	if (unlikely((tdb->flags & TDB_CONVERT))) {
+		void *conv = malloc(len);	/* rec itself is left untouched */
+		if (!conv) {
+			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					  "tdb_write: no memory converting"
+					  " %zu bytes", len);
+		}
+		memcpy(conv, rec, len);
+		ecode = tdb->methods->twrite(tdb, off,
+					   tdb_convert(tdb, conv, len), len);
+		free(conv);
+	} else {
+		ecode = tdb->methods->twrite(tdb, off, rec, len);
+	}
+	return ecode;
+}
+
+enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
+				void *rec, size_t len)
+{	/* Read len bytes at off into rec, then endian-convert rec in place. */
+	enum TDB_ERROR ecode = tdb->methods->tread(tdb, off, rec, len);
+	tdb_convert(tdb, rec, len);	/* runs even when the read failed */
+	return ecode;
+}
+
+enum TDB_ERROR tdb_write_off(struct tdb_context *tdb,
+			     tdb_off_t off, tdb_off_t val)
+{	/* Store one tdb_off_t at off, in place when a direct pointer exists. */
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Write to read-only database");
+	}
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
+						    true);
+		if (TDB_PTR_IS_ERR(p)) {
+			return TDB_PTR_ERR(p);
+		}
+		if (p) {
+			*p = val;
+			return TDB_SUCCESS;
+		}
+	}
+	return tdb_write_convert(tdb, off, &val, sizeof(val));
+}
+
+static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
+			     tdb_len_t len, unsigned int prefix)
+{	/* malloc prefix+len bytes; read len bytes from offset after the prefix. */
+	unsigned char *buf;
+	enum TDB_ERROR ecode;
+
+	/* some systems don't like zero length malloc */
+	buf = malloc(prefix + len ? prefix + len : 1);
+	if (!buf) {
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR,
+			   "tdb_alloc_read malloc failed len=%zu",
+			   (size_t)(prefix + len));
+		return TDB_ERR_PTR(TDB_ERR_OOM);
+	} else {
+		ecode = tdb->methods->tread(tdb, offset, buf+prefix, len);
+		if (unlikely(ecode != TDB_SUCCESS)) {
+			free(buf);
+			return TDB_ERR_PTR(ecode);
+		}
+	}
+	return buf;	/* caller frees; the prefix bytes are uninitialized */
+}
+
+/* read a lump of data, allocating the space for it (error pointer on failure) */
+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
+{
+	return _tdb_alloc_read(tdb, offset, len, 0);
+}
+
+static enum TDB_ERROR fill(struct tdb_context *tdb,
+			   const void *buf, size_t size,
+			   tdb_off_t off, tdb_len_t len)
+{	/* pwrite len bytes at off by repeating the size-byte pattern in buf. */
+	while (len) {
+		size_t n = len > size ? size : len;
+		ssize_t ret = pwrite(tdb->file->fd, buf, n, off);
+		if (ret != n) {
+			if (ret >= 0)	/* short write: report as out of space */
+				errno = ENOSPC;
+
+			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					  "fill failed:"
+					  " %zi at %zu len=%zu (%s)",
+					  ret, (size_t)off, (size_t)len,
+					  strerror(errno));
+		}
+		len -= n;
+		off += n;
+	}
+	return TDB_SUCCESS;
+}
+
+/* expand a file by "addition" bytes.  we prefer to use ftruncate, as that
+  is what posix says to use for mmap expansion */
+static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb,
+				      tdb_len_t addition)
+{
+	char buf[8192];
+	enum TDB_ERROR ecode;
+
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Expand on read-only database");
+	}
+
+	if (tdb->flags & TDB_INTERNAL) {	/* in-memory db: just realloc */
+		char *new = realloc(tdb->file->map_ptr,
+				    tdb->file->map_size + addition);
+		if (!new) {
+			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					  "No memory to expand database");
+		}
+		tdb->file->map_ptr = new;
+		tdb->file->map_size += addition;
+	} else {
+		/* Unmap before trying to write; old TDB claimed OpenBSD had
+		 * problem with this otherwise. */
+		tdb_munmap(tdb->file);
+
+		/* If this fails, we try to fill anyway. */
+		if (ftruncate(tdb->file->fd, tdb->file->map_size + addition))
+			;
+
+		/* now fill the file with something. This ensures that the
+		   file isn't sparse, which would be very bad if we ran out of
+		   disk. This must be done with write, not via mmap */
+		memset(buf, 0x43, sizeof(buf));
+		ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size,
+			     addition);
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+		tdb->file->map_size += addition;
+		tdb_mmap(tdb);
+	}
+	return TDB_SUCCESS;
+}
+
+const void *tdb_access_read(struct tdb_context *tdb,
+			    tdb_off_t off, tdb_len_t len, bool convert)
+{	/* Read-only view of len bytes at off; end it with tdb_access_release. */
+	void *ret = NULL;
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		ret = tdb->methods->direct(tdb, off, len, false);
+
+		if (TDB_PTR_IS_ERR(ret)) {
+			return ret;
+		}
+	}
+	if (!ret) {	/* no direct pointer: hand out a tracked malloc'ed copy */
+		struct tdb_access_hdr *hdr;
+		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
+		if (TDB_PTR_IS_ERR(hdr)) {
+			return hdr;
+		}
+		hdr->next = tdb->access;
+		tdb->access = hdr;
+		ret = hdr + 1;
+		if (convert) {
+			tdb_convert(tdb, (void *)ret, len);
+		}
+	} else
+		tdb->direct_access++;
+
+	return ret;
+}
+
+void *tdb_access_write(struct tdb_context *tdb,
+		       tdb_off_t off, tdb_len_t len, bool convert)
+{	/* Writable view of len bytes at off; finish with tdb_access_commit. */
+	void *ret = NULL;
+
+	if (tdb->read_only) {
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+			   "Write to read-only database");
+		return TDB_ERR_PTR(TDB_ERR_RDONLY);
+	}
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		ret = tdb->methods->direct(tdb, off, len, true);
+
+		if (TDB_PTR_IS_ERR(ret)) {
+			return ret;
+		}
+	}
+
+	if (!ret) {	/* no direct pointer: edit a copy, written back on commit */
+		struct tdb_access_hdr *hdr;
+		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
+		if (TDB_PTR_IS_ERR(hdr)) {
+			return hdr;
+		}
+		hdr->next = tdb->access;
+		tdb->access = hdr;
+		hdr->off = off;
+		hdr->len = len;
+		hdr->convert = convert;
+		ret = hdr + 1;
+		if (convert)
+			tdb_convert(tdb, (void *)ret, len);
+	} else
+		tdb->direct_access++;
+
+	return ret;
+}
+
+static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
+{	/* Find the list link whose header sits immediately before buffer p. */
+	struct tdb_access_hdr **link = &tdb->access;
+
+	while (*link && (const void *)(*link + 1) != p)
+		link = &(*link)->next;
+
+	/* NULL means p is not on the tdb->access list. */
+	return *link ? link : NULL;
+}
+
+void tdb_access_release(struct tdb_context *tdb, const void *p)
+{
+	struct tdb_access_hdr **link = find_hdr(tdb, p), *copy;
+	if (!link) {
+		tdb->direct_access--;	/* p was a direct pointer */
+		return;
+	}
+	copy = *link;
+	*link = copy->next;	/* unlink before freeing */
+	free(copy);
+}
+
+enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p)
+{	/* End a tdb_access_write(): write back a copy, or drop a direct ref. */
+	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
+	enum TDB_ERROR ecode;
+
+	if (hp) {
+		hdr = *hp;
+		if (hdr->convert)
+			ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len);
+		else
+			ecode = tdb_write(tdb, hdr->off, p, hdr->len);
+		*hp = hdr->next;	/* copy is freed even if the write failed */
+		free(hdr);
+	} else {
+		tdb->direct_access--;
+		ecode = TDB_SUCCESS;
+	}
+
+	return ecode;
+}
+
+static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
+			bool write_mode)
+{	/* Pointer into the map for [off, off+len); NULL if there is no map. */
+	enum TDB_ERROR ecode;
+
+	if (unlikely(!tdb->file->map_ptr))
+		return NULL;
+
+	ecode = tdb_oob(tdb, off + len, true);	/* probe: failure isn't logged */
+	if (unlikely(ecode != TDB_SUCCESS))
+		return TDB_ERR_PTR(ecode);
+	return (char *)tdb->file->map_ptr + off;	/* write_mode is unused here */
+}
+
+void tdb_inc_seqnum(struct tdb_context *tdb)
+{	/* Bump the header's seqnum, never letting it become negative. */
+	tdb_off_t seq;
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		int64_t *direct;
+
+		direct = tdb->methods->direct(tdb,
+					      offsetof(struct tdb_header,
+						       seqnum),
+					      sizeof(*direct), true);
+		if (likely(direct && !TDB_PTR_IS_ERR(direct))) {
+			/* Don't let it go negative, even briefly */
+			if (unlikely((int64_t)((uint64_t)*direct + 1) < 0))
+				*direct = 0;
+			(*direct)++;
+			return;
+		}
+	}
+
+	seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
+	if (!TDB_OFF_IS_ERR(seq)) {
+		seq++;
+		if (unlikely((int64_t)seq < 0))
+			seq = 0;
+		tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq);
+	}
+}
+
+static const struct tdb_methods io_methods = {	/* default file/mmap I/O */
+	tdb_read,
+	tdb_write,
+	tdb_oob,
+	tdb_expand_file,
+	tdb_direct,
+};
+
+/*
+  initialise the default methods table: point tdb->methods at io_methods
+*/
+void tdb_io_init(struct tdb_context *tdb)
+{
+	tdb->methods = &io_methods;
+}
diff --git a/lib/tdb2/lock.c b/lib/tdb2/lock.c
new file mode 100644
index 0000000000..76b8bc3157
--- /dev/null
+++ b/lib/tdb2/lock.c
@@ -0,0 +1,875 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+#include <assert.h>
+#include <ccan/build_assert/build_assert.h>
+
+/* We're not threaded, so we can't wait for the other owner: log and fail. */
+static enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call)
+{
+	return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+			  "%s: lock owned by another tdb in this process.",
+			  call);	/* call: function name for the log line */
+}
+
+/* After fork() we no longer own the locks: returns true if they are ours. */
+static bool check_lock_pid(struct tdb_context *tdb,
+			   const char *call, bool log)
+{
+	/* No locks?  No problem! */
+	if (tdb->file->allrecord_lock.count == 0
+	    && tdb->file->num_lockrecs == 0) {
+		return true;
+	}
+
+	/* No fork?  No problem! */
+	if (tdb->file->locker == getpid())
+		return true;
+
+	if (log) {	/* callers pass false when a mismatch is acceptable */
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+			   "%s: fork() detected after lock acquisition!"
+			   " (%u vs %u)", call, (unsigned)tdb->file->locker,
+			   (unsigned)getpid());	/* casts match %u exactly */
+	}
+	return false;
+}
+
+int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		   void *unused)
+{
+	const int cmd = waitflag ? F_SETLKW : F_SETLK;
+	struct flock fl;
+	int ret;
+
+	/* Retry for as long as the call is interrupted by a signal. */
+	for (;;) {
+		fl.l_type = rw;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+
+		ret = fcntl(fd, cmd, &fl);
+		if (ret == 0 || errno != EINTR)
+			return ret;
+	}
+}
+
+int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
+{
+	struct flock fl;
+	int ret;
+
+	for (;;) {	/* retry while interrupted; rw is not consulted */
+		fl.l_type = F_UNLCK;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+		ret = fcntl(fd, F_SETLKW, &fl);
+		if (ret == 0 || errno != EINTR)
+			return ret;
+	}
+}
+
+static int lock(struct tdb_context *tdb,
+		      int rw, off_t off, off_t len, bool waitflag)
+{
+	int ret;
+
+	/* Taking our first lock: record which process owns the locks. */
+	if (tdb->file->num_lockrecs == 0
+	    && tdb->file->allrecord_lock.count == 0)
+		tdb->file->locker = getpid();
+	tdb->stats.lock_lowlevel++;
+	ret = tdb->lock_fn(tdb->file->fd, rw, off, len, waitflag,
+			   tdb->lock_data);
+	if (waitflag)
+		return ret;
+	tdb->stats.lock_nonblock++;
+	if (ret != 0)
+		tdb->stats.lock_nonblock_fail++;
+	return ret;
+}
+
+static int unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
+{
+#if 0 /* Disabled debug aid: check /proc/locks to see locks and unlocks match. */
+	char line[80];
+	FILE *locks;
+	bool found = false;
+
+	locks = fopen("/proc/locks", "r");
+
+	while (fgets(line, 80, locks)) {
+		char *p;
+		int type, start, l;
+
+		/* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
+		p = strchr(line, ':') + 1;
+		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
+			continue;
+		p += strlen(" FLOCK  ADVISORY  ");
+		if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
+			type = F_RDLCK;
+		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
+			type = F_WRLCK;
+		else
+			abort();
+		p += 6;
+		if (atoi(p) != getpid())
+			continue;
+		p = strchr(strchr(p, ' ') + 1, ' ') + 1;
+		start = atoi(p);
+		p = strchr(p, ' ') + 1;
+		if (strncmp(p, "EOF", 3) == 0)
+			l = 0;
+		else
+			l = atoi(p) - start + 1;
+
+		if (off == start) {
+			if (len != l) {
+				fprintf(stderr, "Len %u should be %u: %s",
+					(int)len, l, line);
+				abort();
+			}
+			if (type != rw) {
+				fprintf(stderr, "Type %s wrong: %s",
+					rw == F_RDLCK ? "READ" : "WRITE", line);
+				abort();
+			}
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		fprintf(stderr, "Unlock on %u@%u not found!",
+			(int)off, (int)len);
+		abort();
+	}
+
+	fclose(locks);
+#endif
+	/* The real work: hand the range to this tdb's unlock callback. */
+	return tdb->unlock_fn(tdb->file->fd, rw, off, len, tdb->lock_data);
+}
+
+/* a byte range locking function - return TDB_SUCCESS on success
+   this function locks len bytes at the specified offset.
+
+   note that a len of zero means lock to end of file
+*/
+static enum TDB_ERROR tdb_brlock(struct tdb_context *tdb,
+				 int rw_type, tdb_off_t offset, tdb_off_t len,
+				 enum tdb_lock_flags flags)
+{
+	int ret;
+
+	if (tdb->flags & TDB_NOLOCK) {
+		return TDB_SUCCESS;
+	}
+
+	if (rw_type == F_WRLCK && tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Write lock attempted on read-only database");
+	}
+
+	/* A 32 bit system cannot open a 64-bit file, but it could have
+	 * expanded since then: check here. */
+	if ((size_t)(offset + len) != offset + len) {
+		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				  "tdb_brlock: lock on giant offset %llu",
+				  (unsigned long long)(offset + len));
+	}
+
+	ret = lock(tdb, rw_type, offset, len, flags & TDB_LOCK_WAIT);
+	if (ret != 0) {
+		/* Generic lock error. errno set by fcntl.
+		 * EAGAIN is an expected return from non-blocking
+		 * locks.  TDB_LOCK_PROBE callers get no log at all. */
+		if (!(flags & TDB_LOCK_PROBE)
+		    && (errno != EAGAIN && errno != EINTR)) {
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				   "tdb_brlock failed (fd=%d) at"
+				   " offset %zu rw_type=%d flags=%d len=%zu:"
+				   " %s",
+				   tdb->file->fd, (size_t)offset, rw_type,
+				   flags, (size_t)len, strerror(errno));
+		}
+		return TDB_ERR_LOCK;
+	}
+	return TDB_SUCCESS;
+}
+
+static enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb,
+				   int rw_type, tdb_off_t offset, size_t len)
+{
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;	/* locking is disabled for this tdb */
+
+	/* After a fork() the locks are not ours to release. */
+	if (!check_lock_pid(tdb, "tdb_brunlock", true))
+		return TDB_ERR_LOCK;
+
+	if (unlock(tdb, rw_type, offset, len) != -1)
+		return TDB_SUCCESS;
+
+	return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+			  "tdb_brunlock failed (fd=%d) at offset %zu"
+			  " rw_type=%d len=%zu: %s",
+			  tdb->file->fd, (size_t)offset, rw_type,
+			  (size_t)len, strerror(errno));
+}
+
+/*
+  upgrade a read lock to a write lock. This needs to be handled in a
+  special way as some OSes (such as solaris) have too conservative
+  deadlock detection and claim a deadlock when progress can be
+  made. For those OSes we may loop for a while.
+*/
+enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb)
+{
+	int count = 1000;	/* maximum attempts while we keep seeing EDEADLK */
+
+	if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
+		return TDB_ERR_LOCK;
+
+	if (tdb->file->allrecord_lock.count != 1) {	/* must not be nested */
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_allrecord_upgrade failed:"
+				  " count %u too high",
+				  tdb->file->allrecord_lock.count);
+	}
+
+	if (tdb->file->allrecord_lock.off != 1) {	/* off is 1 while upgradable */
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_allrecord_upgrade failed:"
+				  " already upgraded?");
+	}
+
+	if (tdb->file->allrecord_lock.owner != tdb) {
+		return owner_conflict(tdb, "tdb_allrecord_upgrade");
+	}
+
+	while (count--) {
+		struct timeval tv;
+		if (tdb_brlock(tdb, F_WRLCK,
+			       TDB_HASH_LOCK_START, 0,
+			       TDB_LOCK_WAIT|TDB_LOCK_PROBE) == TDB_SUCCESS) {
+			tdb->file->allrecord_lock.ltype = F_WRLCK;
+			tdb->file->allrecord_lock.off = 0;	/* no longer upgradable */
+			return TDB_SUCCESS;
+		}
+		if (errno != EDEADLK) {	/* only EDEADLK is worth retrying */
+			break;
+		}
+		/* sleep for as short a time as we can - more portable than usleep() */
+		tv.tv_sec = 0;
+		tv.tv_usec = 1;
+		select(0, NULL, NULL, NULL, &tv);
+	}
+
+	if (errno != EAGAIN && errno != EINTR)	/* PROBE above suppressed logging */
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+			   "tdb_allrecord_upgrade failed");
+	return TDB_ERR_LOCK;
+}
+
+static struct tdb_lock *find_nestlock(struct tdb_context *tdb, tdb_off_t offset,
+				      const struct tdb_context *owner)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < tdb->file->num_lockrecs; idx++) {
+		struct tdb_lock *rec = &tdb->file->lockrecs[idx];
+		if (rec->off != offset)
+			continue;
+		/* First record at this offset decides; NULL owner = anyone. */
+		return (owner && rec->owner != owner) ? NULL : rec;
+	}
+	return NULL;
+}
+
+enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb)
+{
+	enum TDB_ERROR ecode;
+
+	if (!check_lock_pid(tdb, "tdb_lock_and_recover", true))
+		return TDB_ERR_LOCK;
+
+	ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK,
+				   false);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	/* NOCHECK on both locks: checking for recovery would recurse here. */
+	ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
+	if (ecode != TDB_SUCCESS) {
+		tdb_allrecord_unlock(tdb, F_WRLCK);
+		return ecode;
+	}
+	ecode = tdb_transaction_recover(tdb);
+	tdb_unlock_open(tdb, F_WRLCK);	/* release in reverse order */
+	tdb_allrecord_unlock(tdb, F_WRLCK);
+
+	return ecode;	/* result of the recovery itself */
+}
+
+/* lock one byte at offset in the database; repeat locks by us just nest. */
+static enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb,
+				    tdb_off_t offset, int ltype,
+				    enum tdb_lock_flags flags)
+{
+	struct tdb_lock *new_lck;
+	enum TDB_ERROR ecode;
+
+	if (offset > (TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
+		      + tdb->file->map_size / 8)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_nest_lock: invalid offset %zu ltype=%d",
+				  (size_t)offset, ltype);
+	}
+
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;
+
+	if (!check_lock_pid(tdb, "tdb_nest_lock", true)) {
+		return TDB_ERR_LOCK;
+	}
+
+	tdb->stats.locks++;
+
+	new_lck = find_nestlock(tdb, offset, NULL);	/* NULL: any owner */
+	if (new_lck) {
+		if (new_lck->owner != tdb) {
+			return owner_conflict(tdb, "tdb_nest_lock");
+		}
+
+		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
+			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+					  "tdb_nest_lock:"
+					  " offset %zu has read lock",
+					  (size_t)offset);
+		}
+		/* Just increment the struct, posix locks don't stack. */
+		new_lck->count++;
+		return TDB_SUCCESS;
+	}
+
+#if 0
+	if (tdb->file->num_lockrecs
+	    && offset >= TDB_HASH_LOCK_START
+	    && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_nest_lock: already have a hash lock?");
+	}
+#endif
+
+	new_lck = (struct tdb_lock *)realloc(
+		tdb->file->lockrecs,
+		sizeof(*tdb->file->lockrecs) * (tdb->file->num_lockrecs+1));
+	if (new_lck == NULL) {
+		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				  "tdb_nest_lock:"
+				  " unable to allocate %zu lock struct",
+				  tdb->file->num_lockrecs + 1);
+	}
+	tdb->file->lockrecs = new_lck;
+
+	/* Since fcntl locks don't nest, we do a lock for the first one,
+	   and simply bump the count for future ones */
+	ecode = tdb_brlock(tdb, ltype, offset, 1, flags);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;	/* array stays one slot larger; count unchanged */
+	}
+
+	/* First time we grab a lock, perhaps someone died in commit? */
+	if (!(flags & TDB_LOCK_NOCHECK)
+	    && tdb->file->num_lockrecs == 0) {
+		tdb_bool_err berr = tdb_needs_recovery(tdb);
+		if (berr != false) {
+			tdb_brunlock(tdb, ltype, offset, 1);	/* drop it to recover */
+
+			if (berr < 0)	/* negative berr is returned as the error */
+				return berr;
+			ecode = tdb_lock_and_recover(tdb);
+			if (ecode == TDB_SUCCESS) {
+				ecode = tdb_brlock(tdb, ltype, offset, 1,
+						   flags);
+			}
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+		}
+	}
+
+	tdb->file->lockrecs[tdb->file->num_lockrecs].owner = tdb;
+	tdb->file->lockrecs[tdb->file->num_lockrecs].off = offset;
+	tdb->file->lockrecs[tdb->file->num_lockrecs].count = 1;
+	tdb->file->lockrecs[tdb->file->num_lockrecs].ltype = ltype;
+	tdb->file->num_lockrecs++;
+
+	return TDB_SUCCESS;
+}
+
+static enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb,
+				      tdb_off_t off, int ltype)
+{
+	struct tdb_lock *lck;
+	enum TDB_ERROR ecode;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;
+
+	lck = find_nestlock(tdb, off, tdb);	/* must be a record owned by us */
+	if ((lck == NULL) || (lck->count == 0)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_nest_unlock: no lock for %zu",
+				  (size_t)off);
+	}
+
+	if (lck->count > 1) {	/* still nested: no kernel call needed */
+		lck->count--;
+		return TDB_SUCCESS;
+	}
+
+	/*
+	 * This lock has count==1 left, so we need to unlock it in the
+	 * kernel. We don't bother with decrementing the in-memory array
+	 * element, we're about to overwrite it with the last array element
+	 * anyway.
+	 */
+	ecode = tdb_brunlock(tdb, ltype, off, 1);
+
+	/*
+	 * Shrink the array by overwriting the element just unlocked with the
+	 * last array element.
+	 */
+	*lck = tdb->file->lockrecs[--tdb->file->num_lockrecs];
+
+	return ecode;	/* the record is removed even if the unlock failed */
+}
+
+/*
+  get the transaction lock (blocking nested lock on TDB_TRANSACTION_LOCK)
+ */
+enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype)
+{
+	return tdb_nest_lock(tdb, TDB_TRANSACTION_LOCK, ltype, TDB_LOCK_WAIT);
+}
+
+/*
+  release the transaction lock; the unlock result is not reported
+ */
+void tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
+{
+	tdb_nest_unlock(tdb, TDB_TRANSACTION_LOCK, ltype);
+}
+
+/* We only need to lock individual bytes, but Linux merges consecutive locks
+ * so we lock in contiguous ranges: whole range first, else halve and recurse. */
+static enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb,
+				       int ltype, enum tdb_lock_flags flags,
+				       tdb_off_t off, tdb_off_t len)
+{
+	enum TDB_ERROR ecode;
+	enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
+
+	if (len <= 1) {
+		/* 0 would mean to end-of-file... */
+		assert(len != 0);
+		/* Single hash.  Just do blocking lock. */
+		return tdb_brlock(tdb, ltype, off, len, flags);
+	}
+
+	/* First we try non-blocking. */
+	if (tdb_brlock(tdb, ltype, off, len, nb_flags) == TDB_SUCCESS) {
+		return TDB_SUCCESS;
+	}
+
+	/* Try locking first half, then second. */
+	ecode = tdb_lock_gradual(tdb, ltype, flags, off, len / 2);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	ecode = tdb_lock_gradual(tdb, ltype, flags,
+				 off + len / 2, len - len / 2);
+	if (ecode != TDB_SUCCESS) {
+		tdb_brunlock(tdb, ltype, off, len / 2);	/* undo the first half */
+	}
+	return ecode;
+}
+
+/* lock/unlock entire database.  It can only be upgradable if you have some
+ * other way of guaranteeing exclusivity (ie. transaction write lock). */
+enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+				  enum tdb_lock_flags flags, bool upgradable)
+{
+	enum TDB_ERROR ecode;
+	tdb_bool_err berr;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;
+
+	if (!check_lock_pid(tdb, "tdb_allrecord_lock", true)) {
+		return TDB_ERR_LOCK;
+	}
+
+	if (tdb->file->allrecord_lock.count) {
+		if (tdb->file->allrecord_lock.owner != tdb) {
+			return owner_conflict(tdb, "tdb_allrecord_lock");
+		}
+
+		if (ltype == F_RDLCK
+		    || tdb->file->allrecord_lock.ltype == F_WRLCK) {
+			tdb->file->allrecord_lock.count++;	/* nested: just count */
+			return TDB_SUCCESS;
+		}
+
+		/* a global lock of a different type exists */
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+				  "tdb_allrecord_lock: already have %s lock",
+				  tdb->file->allrecord_lock.ltype == F_RDLCK
+				  ? "read" : "write");
+	}
+
+	if (tdb_has_hash_locks(tdb)) {
+		/* can't combine global and chain locks */
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+				  "tdb_allrecord_lock:"
+				  " already have chain lock");
+	}
+
+	if (upgradable && ltype != F_RDLCK) {
+		/* tdb error: you can't upgrade a write lock! */
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_allrecord_lock:"
+				  " can't upgrade a write lock");
+	}
+
+	tdb->stats.locks++;
+again:
+	/* Lock hashes, gradually. */
+	ecode = tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
+				 TDB_HASH_LOCK_RANGE);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	/* Lock free tables: there to end of file. */
+	ecode = tdb_brlock(tdb, ltype,
+			   TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
+			   0, flags);
+	if (ecode != TDB_SUCCESS) {
+		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
+			     TDB_HASH_LOCK_RANGE);	/* undo the hash range */
+		return ecode;
+	}
+
+	tdb->file->allrecord_lock.owner = tdb;
+	tdb->file->allrecord_lock.count = 1;
+	/* If it's upgradable, it's actually exclusive so we can treat
+	 * it as a write lock. */
+	tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+	tdb->file->allrecord_lock.off = upgradable;	/* 1 while upgradable */
+
+	/* Now check for needing recovery. */
+	if (flags & TDB_LOCK_NOCHECK)
+		return TDB_SUCCESS;
+
+	berr = tdb_needs_recovery(tdb);
+	if (likely(berr == false))
+		return TDB_SUCCESS;
+
+	tdb_allrecord_unlock(tdb, ltype);	/* release before recovering */
+	if (berr < 0)
+		return berr;
+	ecode = tdb_lock_and_recover(tdb);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	goto again;	/* recovered: take the locks afresh */
+}
+
+enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
+			     int ltype, enum tdb_lock_flags flags)
+{	/* nested lock on the TDB_OPEN_LOCK offset, with the caller's flags */
+	return tdb_nest_lock(tdb, TDB_OPEN_LOCK, ltype, flags);
+}
+
+void tdb_unlock_open(struct tdb_context *tdb, int ltype)
+{	/* drops one level of the open lock; the result is not reported */
+	tdb_nest_unlock(tdb, TDB_OPEN_LOCK, ltype);
+}
+
+bool tdb_has_open_lock(struct tdb_context *tdb)
+{
+	const bool locking = !(tdb->flags & TDB_NOLOCK);
+	return locking && find_nestlock(tdb, TDB_OPEN_LOCK, tdb) != NULL;
+}
+
+enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype)
+{
+	/* Lock doesn't protect data, so NOCHECK: a recovery check would recurse */
+	return tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype,
+			     TDB_LOCK_WAIT | TDB_LOCK_NOCHECK);
+}
+
+void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
+{	/* drops one level of the expansion lock; result is not reported */
+	tdb_nest_unlock(tdb, TDB_EXPANSION_LOCK, ltype);
+}
+
+/* unlock entire db: drops one nesting level, kernel unlock on the last one */
+void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
+{
+	if (tdb->flags & TDB_NOLOCK)
+		return;
+
+	if (tdb->file->allrecord_lock.count == 0) {
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+			   "tdb_allrecord_unlock: not locked!");
+		return;
+	}
+
+	if (tdb->file->allrecord_lock.owner != tdb) {
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+			   "tdb_allrecord_unlock: not locked by us!");
+		return;
+	}
+
+	/* Upgradable locks are marked as write locks. */
+	if (tdb->file->allrecord_lock.ltype != ltype
+	    && (!tdb->file->allrecord_lock.off || ltype != F_RDLCK)) {
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+			   "tdb_allrecord_unlock: have %s lock",
+			   tdb->file->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
+		return;
+	}
+
+	if (tdb->file->allrecord_lock.count > 1) {	/* still nested */
+		tdb->file->allrecord_lock.count--;
+		return;
+	}
+
+	tdb->file->allrecord_lock.count = 0;
+	tdb->file->allrecord_lock.ltype = 0;
+
+	tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 0);	/* len 0: to EOF */
+}
+
+bool tdb_has_expansion_lock(struct tdb_context *tdb)
+{	/* true if this tdb has a lock record at TDB_EXPANSION_LOCK */
+	return find_nestlock(tdb, TDB_EXPANSION_LOCK, tdb) != NULL;
+}
+
+bool tdb_has_hash_locks(struct tdb_context *tdb)
+{
+	unsigned int idx;	/* any record inside the hash-lock window counts */
+	for (idx = 0; idx < tdb->file->num_lockrecs; idx++) {
+		const struct tdb_lock *rec = &tdb->file->lockrecs[idx];
+		if (rec->off < TDB_HASH_LOCK_START)
+			continue;
+		if (rec->off < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE)
+			return true;
+	}
+	return false;
+}
+
+static bool tdb_has_free_lock(struct tdb_context *tdb)
+{
+	unsigned int idx;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return false;
+	/* Look for a lock record strictly above the hash lock range. */
+	for (idx = 0; idx < tdb->file->num_lockrecs; idx++) {
+		const struct tdb_lock *rec = &tdb->file->lockrecs[idx];
+		if (rec->off > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE)
+			return true;
+	}
+	return false;
+}
+
+enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
+			       tdb_off_t hash_lock,
+			       tdb_len_t hash_range,
+			       int ltype, enum tdb_lock_flags waitflag)
+{
+	/* FIXME: Do this properly, using hlock_range (hash_range is unused) */
+	unsigned l = TDB_HASH_LOCK_START
+		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
+
+	/* a allrecord lock allows us to avoid per chain locks */
+	if (tdb->file->allrecord_lock.count) {
+		if (!check_lock_pid(tdb, "tdb_lock_hashes", true))
+			return TDB_ERR_LOCK;
+
+		if (tdb->file->allrecord_lock.owner != tdb)
+			return owner_conflict(tdb, "tdb_lock_hashes");
+		if (ltype == tdb->file->allrecord_lock.ltype
+		    || ltype == F_RDLCK) {
+			return TDB_SUCCESS;	/* covered by the allrecord lock */
+		}
+
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+				  "tdb_lock_hashes:"
+				  " already have %s allrecordlock",
+				  tdb->file->allrecord_lock.ltype == F_RDLCK
+				  ? "read" : "write");
+	}
+
+	if (tdb_has_free_lock(tdb)) {	/* refuse while a free lock is held */
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_lock_hashes: already have free lock");
+	}
+
+	if (tdb_has_expansion_lock(tdb)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_lock_hashes:"
+				  " already have expansion lock");
+	}
+
+	return tdb_nest_lock(tdb, l, ltype, waitflag);
+}
+
+enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
+				 tdb_off_t hash_lock,
+				 tdb_len_t hash_range, int ltype)
+{
+	unsigned l = TDB_HASH_LOCK_START	/* same mapping as tdb_lock_hashes */
+		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
+
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;
+
+	/* a allrecord lock allows us to avoid per chain locks */
+	if (tdb->file->allrecord_lock.count) {
+		if (tdb->file->allrecord_lock.ltype == F_RDLCK
+		    && ltype == F_WRLCK) {
+			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+					  "tdb_unlock_hashes RO allrecord!");
+		}
+		return TDB_SUCCESS;	/* no chain lock was taken: nothing to do */
+	}
+
+	return tdb_nest_unlock(tdb, l, ltype);
+}
+
+/* Hash locks use TDB_HASH_LOCK_START + the next 30 bits.
+ * Then we begin; bucket offsets are divided by sizeof(tdb_off_t) below.
+ * The result is that on 32 bit systems we don't use lock values > 2^31 on
+ * files that are less than 4GB.
+ */
+static tdb_off_t free_lock_off(tdb_off_t b_off)
+{
+	return TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
+		+ b_off / sizeof(tdb_off_t);
+}
+
+enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
+				    enum tdb_lock_flags waitflag)
+{
+	assert(b_off >= sizeof(struct tdb_header));	/* never inside header */
+
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;
+
+	/* a allrecord lock allows us to avoid per chain locks */
+	if (tdb->file->allrecord_lock.count) {
+		if (!check_lock_pid(tdb, "tdb_lock_free_bucket", true))
+			return TDB_ERR_LOCK;
+
+		if (tdb->file->allrecord_lock.ltype == F_WRLCK)
+			return TDB_SUCCESS;	/* write allrecord lock covers it */
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_lock_free_bucket with"
+				  " read-only allrecordlock!");
+	}
+
+#if 0 /* FIXME */
+	if (tdb_has_expansion_lock(tdb)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_lock_free_bucket:"
+				  " already have expansion lock");
+	}
+#endif
+
+	return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag);
+}
+
+void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off)
+{
+	if (tdb->file->allrecord_lock.count)
+		return;	/* no bucket lock is taken under an allrecord lock */
+
+	tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK);
+}
+
+enum TDB_ERROR tdb_lockall(struct tdb_context *tdb)
+{	/* blocking write lock on the whole database, not upgradable */
+	return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+}
+
+void tdb_unlockall(struct tdb_context *tdb)
+{	/* counterpart of tdb_lockall(): drops the F_WRLCK allrecord lock */
+	tdb_allrecord_unlock(tdb, F_WRLCK);
+}
+
+enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb)
+{	/* blocking read lock on the whole database, not upgradable */
+	return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+}
+
+void tdb_unlockall_read(struct tdb_context *tdb)
+{	/* counterpart of tdb_lockall_read(): drops the F_RDLCK allrecord lock */
+	tdb_allrecord_unlock(tdb, F_RDLCK);
+}
+
+void tdb_lock_cleanup(struct tdb_context *tdb)
+{
+	unsigned int i = 0;
+
+	/* We don't want to warn: they're allowed to close tdb after fork. */
+	if (!check_lock_pid(tdb, "tdb_close", false))
+		return;
+
+	while (tdb->file->allrecord_lock.count
+	       && tdb->file->allrecord_lock.owner == tdb)
+		tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
+
+	/* Unlocking can refill slot i, so only advance past foreign records. */
+	while (i < tdb->file->num_lockrecs) {
+		if (tdb->file->lockrecs[i].owner != tdb) {
+			i++;
+			continue;
+		}
+		tdb_nest_unlock(tdb, tdb->file->lockrecs[i].off,
+				tdb->file->lockrecs[i].ltype);
+	}
+}
diff --git a/lib/tdb2/open.c b/lib/tdb2/open.c
new file mode 100644
index 0000000000..c35598cdcc
--- /dev/null
+++ b/lib/tdb2/open.c
@@ -0,0 +1,661 @@
+#include "private.h"
+#include <ccan/hash/hash.h>
+#include <assert.h>
+
+/* all lock info, to detect double-opens (fcntl locks don't nest!) */
+static struct tdb_file *files = NULL;
+
+static struct tdb_file *find_file(dev_t device, ino_t ino)
+{
+	struct tdb_file *f = files;
+
+	/* Walk the list; a hit gains a reference before being returned. */
+	while (f && !(f->device == device && f->inode == ino))
+		f = f->next;
+
+	if (f)
+		f->refcnt++;
+	return f;
+}
+
+static bool read_all(int fd, void *buf, size_t len)	/* read exactly len bytes */
+{
+	while (len) {
+		ssize_t ret = read(fd, buf, len);
+		if (ret < 0 && errno == EINTR)
+			continue;	/* interrupted by a signal: retry */
+		if (ret < 0)
+			return false;	/* errno is left as read() set it */
+		if (ret == 0) {
+			errno = EWOULDBLOCK;	/* ETOOSHORT? */
+			return false;
+		}
+		buf = (char *)buf + ret;
+		len -= ret;
+	}
+	return true;
+}
+
+static uint64_t random_number(struct tdb_context *tdb)
+{
+	int fd;
+	uint64_t ret = 0;
+	struct timeval now;
+
+	fd = open("/dev/urandom", O_RDONLY);	/* first choice */
+	if (fd >= 0) {
+		if (read_all(fd, &ret, sizeof(ret))) {
+			close(fd);
+			return ret;
+		}
+		close(fd);
+	}
+	/* FIXME: Untested!  Based on Wikipedia protocol description! */
+	fd = open("/dev/egd-pool", O_RDWR);
+	if (fd >= 0) {
+		/* Command is 1, next byte is size we want to read. */
+		char cmd[2] = { 1, sizeof(uint64_t) };
+		if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
+			char reply[1 + sizeof(uint64_t)];
+			int r = read(fd, reply, sizeof(reply));
+			if (r > 1) {
+				/* Copy at least some bytes (overwritten below if short). */
+				memcpy(&ret, reply+1, r - 1);
+				if (reply[0] == sizeof(uint64_t)
+				    && r == sizeof(reply)) {
+					close(fd);
+					return ret;
+				}
+			}
+		}
+		close(fd);
+	}
+
+	/* Fallback: pid and time.  Logged as a warning, not an error. */
+	gettimeofday(&now, NULL);
+	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+		   "tdb_open: random from getpid and time");
+	return ret;
+}
+
+struct new_database {	/* image of a fresh database: header, then free table */
+	struct tdb_header hdr;
+	struct tdb_freetable ftable;
+};
+
+/* initialise a new database; *hdr receives a copy of the new header */
+static enum TDB_ERROR tdb_new_database(struct tdb_context *tdb,
+				       struct tdb_attribute_seed *seed,
+				       struct tdb_header *hdr)
+{
+	/* We make it up in memory, then write it out if not internal */
+	struct new_database newdb;
+	unsigned int magic_len;
+	ssize_t rlen;
+	enum TDB_ERROR ecode;
+
+	/* Fill in the header */
+	newdb.hdr.version = TDB_VERSION;
+	if (seed)	/* caller-supplied seed wins over a random one */
+		newdb.hdr.hash_seed = seed->seed;
+	else
+		newdb.hdr.hash_seed = random_number(tdb);
+	newdb.hdr.hash_test = TDB_HASH_MAGIC;
+	newdb.hdr.hash_test = tdb->hash_fn(&newdb.hdr.hash_test,
+					   sizeof(newdb.hdr.hash_test),
+					   newdb.hdr.hash_seed,
+					   tdb->hash_data);
+	newdb.hdr.recovery = 0;
+	newdb.hdr.features_used = newdb.hdr.features_offered = TDB_FEATURE_MASK;
+	newdb.hdr.seqnum = 0;
+	memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved));
+	/* Initial hashes are empty. */
+	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
+
+	/* Free is empty. */
+	newdb.hdr.free_table = offsetof(struct new_database, ftable);
+	memset(&newdb.ftable, 0, sizeof(newdb.ftable));
+	ecode = set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
+			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+			   0);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Magic food */
+	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
+	strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
+
+	/* This creates an endian-converted database, as if read from disk */
+	magic_len = sizeof(newdb.hdr.magic_food);
+	tdb_convert(tdb,
+		    (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len);
+
+	*hdr = newdb.hdr;	/* copied after the conversion above */
+
+	if (tdb->flags & TDB_INTERNAL) {	/* keep the image in malloc'd memory */
+		tdb->file->map_size = sizeof(newdb);
+		tdb->file->map_ptr = malloc(tdb->file->map_size);
+		if (!tdb->file->map_ptr) {
+			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					  "tdb_new_database:"
+					  " failed to allocate");
+		}
+		memcpy(tdb->file->map_ptr, &newdb, tdb->file->map_size);
+		return TDB_SUCCESS;
+	}
+	if (lseek(tdb->file->fd, 0, SEEK_SET) == -1) {
+		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				  "tdb_new_database:"
+				  " failed to seek: %s", strerror(errno));
+	}
+
+	if (ftruncate(tdb->file->fd, 0) == -1) {
+		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				  "tdb_new_database:"
+				  " failed to truncate: %s", strerror(errno));
+	}
+
+	rlen = write(tdb->file->fd, &newdb, sizeof(newdb));
+	if (rlen != sizeof(newdb)) {
+		if (rlen >= 0)	/* short write: report it as out of space */
+			errno = ENOSPC;
+		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				  "tdb_new_database: %zi writing header: %s",
+				  rlen, strerror(errno));
+	}
+	return TDB_SUCCESS;
+}
+
+static enum TDB_ERROR tdb_new_file(struct tdb_context *tdb)
+{
+	tdb->file = malloc(sizeof(*tdb->file));
+	if (tdb->file == NULL)
+		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				  "tdb_open: cannot alloc tdb_file structure");
+	tdb->file->refcnt = 1;		/* we are the only user so far */
+	tdb->file->allrecord_lock.count = 0;
+	tdb->file->lockrecs = NULL;	/* no nested locks recorded yet */
+	tdb->file->num_lockrecs = 0;
+	return TDB_SUCCESS;
+}
+
+enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
+				 const union tdb_attribute *attr)
+{
+	switch (attr->base.attr) {	/* only attr itself is read, not ->next */
+	case TDB_ATTRIBUTE_LOG:
+		tdb->log_fn = attr->log.fn;
+		tdb->log_data = attr->log.data;
+		break;
+	case TDB_ATTRIBUTE_HASH:
+	case TDB_ATTRIBUTE_SEED:
+	case TDB_ATTRIBUTE_OPENHOOK:
+		return tdb->last_error	/* rejected: fixed once the tdb is open */
+			= tdb_logerr(tdb, TDB_ERR_EINVAL,
+				     TDB_LOG_USE_ERROR,
+				     "tdb_set_attribute:"
+				     " cannot set %s after opening",
+				     attr->base.attr == TDB_ATTRIBUTE_HASH
+				     ? "TDB_ATTRIBUTE_HASH"
+				     : attr->base.attr == TDB_ATTRIBUTE_SEED
+				     ? "TDB_ATTRIBUTE_SEED"
+				     : "TDB_ATTRIBUTE_OPENHOOK");
+	case TDB_ATTRIBUTE_STATS:
+		return tdb->last_error
+			= tdb_logerr(tdb, TDB_ERR_EINVAL,
+				     TDB_LOG_USE_ERROR,
+				     "tdb_set_attribute:"
+				     " cannot set TDB_ATTRIBUTE_STATS");
+	case TDB_ATTRIBUTE_FLOCK:	/* replace both lock callbacks and data */
+		tdb->lock_fn = attr->flock.lock;
+		tdb->unlock_fn = attr->flock.unlock;
+		tdb->lock_data = attr->flock.data;
+		break;
+	default:
+		return tdb->last_error
+			= tdb_logerr(tdb, TDB_ERR_EINVAL,
+				     TDB_LOG_USE_ERROR,
+				     "tdb_set_attribute:"
+				     " unknown attribute type %u",
+				     attr->base.attr);
+	}
+	return TDB_SUCCESS;	/* last_error is untouched on success */
+}
+
+static uint64_t jenkins_hash(const void *key, size_t length, uint64_t seed,
+			     void *unused)
+{
+	/* hash64_stable favours its low bits, which hash slightly better.
+	 * We consume the upper bits first, so exchange the two halves. */
+	const uint64_t h = hash64_stable((const unsigned char *)key,
+					 length, seed);
+	return (h << 32) | (h >> 32);
+}
+
+enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
+				 union tdb_attribute *attr)
+{
+	switch (attr->base.attr) {	/* caller selects what to fetch here */
+	case TDB_ATTRIBUTE_LOG:
+		if (!tdb->log_fn)	/* no log function is set */
+			return tdb->last_error = TDB_ERR_NOEXIST;
+		attr->log.fn = tdb->log_fn;
+		attr->log.data = tdb->log_data;
+		break;
+	case TDB_ATTRIBUTE_HASH:
+		attr->hash.fn = tdb->hash_fn;
+		attr->hash.data = tdb->hash_data;
+		break;
+	case TDB_ATTRIBUTE_SEED:
+		attr->seed.seed = tdb->hash_seed;
+		break;
+	case TDB_ATTRIBUTE_OPENHOOK:
+		return tdb->last_error
+			= tdb_logerr(tdb, TDB_ERR_EINVAL,
+				     TDB_LOG_USE_ERROR,
+				     "tdb_get_attribute:"
+				     " cannot get TDB_ATTRIBUTE_OPENHOOK");
+	case TDB_ATTRIBUTE_STATS: {
+		size_t size = attr->stats.size;	/* size the caller provided */
+		if (size > tdb->stats.size)
+			size = tdb->stats.size;
+		memcpy(&attr->stats, &tdb->stats, size);	/* copy the smaller */
+		break;
+	}
+	case TDB_ATTRIBUTE_FLOCK:
+		attr->flock.lock = tdb->lock_fn;
+		attr->flock.unlock = tdb->unlock_fn;
+		attr->flock.data = tdb->lock_data;
+		break;
+	default:
+		return tdb->last_error
+			= tdb_logerr(tdb, TDB_ERR_EINVAL,
+				     TDB_LOG_USE_ERROR,
+				     "tdb_get_attribute:"
+				     " unknown attribute type %u",
+				     attr->base.attr);
+	}
+	attr->base.next = NULL;	/* the filled-in attribute is not chained */
+	return TDB_SUCCESS;
+}
+
+void tdb_unset_attribute(struct tdb_context *tdb,
+			 enum tdb_attribute_type type)
+{
+	switch (type) {
+	case TDB_ATTRIBUTE_LOG:
+		tdb->log_fn = NULL;	/* log_data is left as it was */
+		break;
+	case TDB_ATTRIBUTE_HASH:
+	case TDB_ATTRIBUTE_SEED:
+	case TDB_ATTRIBUTE_OPENHOOK:
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+			   "tdb_unset_attribute: cannot unset %s after opening",
+			   type == TDB_ATTRIBUTE_HASH
+			   ? "TDB_ATTRIBUTE_HASH"
+			   : type == TDB_ATTRIBUTE_SEED
+			   ? "TDB_ATTRIBUTE_SEED"
+			   : "TDB_ATTRIBUTE_OPENHOOK");
+		break;
+	case TDB_ATTRIBUTE_STATS:
+		tdb_logerr(tdb, TDB_ERR_EINVAL,
+			   TDB_LOG_USE_ERROR,
+			   "tdb_unset_attribute:"
+			   " cannot unset TDB_ATTRIBUTE_STATS");
+		break;
+	case TDB_ATTRIBUTE_FLOCK:	/* back to the fcntl-based defaults */
+		tdb->lock_fn = tdb_fcntl_lock;
+		tdb->unlock_fn = tdb_fcntl_unlock;
+		break;
+	default:
+		tdb_logerr(tdb, TDB_ERR_EINVAL,
+			   TDB_LOG_USE_ERROR,
+			   "tdb_unset_attribute: unknown attribute type %u",
+			   type);
+	}
+}
+
+/* Open (or, with O_CREAT, create) a database; attr is an optional attribute
+ * chain.  Returns the new context, or NULL with errno set on failure. */
+struct tdb_context *tdb_open(const char *name, int tdb_flags,
+			     int open_flags, mode_t mode,
+			     union tdb_attribute *attr)
+{
+	struct tdb_context *tdb;
+	struct stat st;
+	int saved_errno = 0;
+	uint64_t hash_test;
+	unsigned v;
+	ssize_t rlen;
+	struct tdb_header hdr;
+	struct tdb_attribute_seed *seed = NULL;
+	struct tdb_attribute_openhook *openhook = NULL;
+	tdb_bool_err berr;
+	enum TDB_ERROR ecode;
+	int openlock;
+
+	tdb = malloc(sizeof(*tdb) + (name ? strlen(name) + 1 : 0));
+	if (!tdb) {
+		/* Can't log this */
+		errno = ENOMEM;
+		return NULL;
+	}
+	/* Set name immediately for logging functions. */
+	if (name) {
+		tdb->name = strcpy((char *)(tdb + 1), name);
+	} else {
+		tdb->name = NULL;
+	}
+	tdb->direct_access = 0;
+	tdb->flags = tdb_flags;
+	tdb->log_fn = NULL;
+	tdb->transaction = NULL;
+	tdb->access = NULL;
+	tdb->last_error = TDB_SUCCESS;
+	tdb->file = NULL;
+	tdb->lock_fn = tdb_fcntl_lock;
+	tdb->unlock_fn = tdb_fcntl_unlock;
+	tdb->hash_fn = jenkins_hash;
+	memset(&tdb->stats, 0, sizeof(tdb->stats));
+	tdb->stats.base.attr = TDB_ATTRIBUTE_STATS;
+	tdb->stats.size = sizeof(tdb->stats);
+	tdb_io_init(tdb);
+
+	while (attr) {
+		switch (attr->base.attr) {
+		case TDB_ATTRIBUTE_HASH:
+			tdb->hash_fn = attr->hash.fn;
+			tdb->hash_data = attr->hash.data;
+			break;
+		case TDB_ATTRIBUTE_SEED:
+			seed = &attr->seed;
+			break;
+		case TDB_ATTRIBUTE_OPENHOOK:
+			openhook = &attr->openhook;
+			break;
+		default:
+			/* These are set as normal. */
+			ecode = tdb_set_attribute(tdb, attr);
+			if (ecode != TDB_SUCCESS)
+				goto fail;
+		}
+		attr = attr->base.next;
+	}
+
+	if (tdb_flags & ~(TDB_INTERNAL | TDB_NOLOCK | TDB_NOMMAP | TDB_CONVERT
+			  | TDB_NOSYNC | TDB_SEQNUM | TDB_ALLOW_NESTING)) {
+		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+				   "tdb_open: unknown flags %u", tdb_flags);
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_WRONLY) {
+		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+				   "tdb_open: can't open tdb %s write-only",
+				   name);
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_RDONLY) {
+		tdb->read_only = true;
+		tdb->mmap_flags = PROT_READ;
+		openlock = F_RDLCK;
+	} else {
+		tdb->read_only = false;
+		tdb->mmap_flags = PROT_READ | PROT_WRITE;
+		openlock = F_WRLCK;
+	}
+
+	/* internal databases don't need any of the rest. */
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
+		ecode = tdb_new_file(tdb);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+		tdb->file->fd = -1;
+		ecode = tdb_new_database(tdb, seed, &hdr);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+		tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed));
+		tdb->hash_seed = hdr.hash_seed;
+		tdb_ftable_init(tdb);
+		return tdb;
+	}
+
+	if (stat(name, &st) != -1)
+		tdb->file = find_file(st.st_dev, st.st_ino);
+
+	if (!tdb->file) {
+		int fd;
+
+		if ((fd = open(name, open_flags, mode)) == -1) {
+			/* errno set by open(2) */
+			saved_errno = errno;
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: could not open file %s: %s",
+				   name, strerror(errno));
+			goto fail_errno;
+		}
+
+		/* on exec, don't inherit the fd */
+		v = fcntl(fd, F_GETFD, 0);
+		fcntl(fd, F_SETFD, v | FD_CLOEXEC);
+
+		if (fstat(fd, &st) == -1) {
+			saved_errno = errno;
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: could not stat open %s: %s",
+				   name, strerror(errno));
+			close(fd);
+			goto fail_errno;
+		}
+
+		ecode = tdb_new_file(tdb);
+		if (ecode != TDB_SUCCESS) {
+			close(fd);
+			goto fail;
+		}
+
+		tdb->file->next = files;
+		tdb->file->fd = fd;
+		tdb->file->device = st.st_dev;
+		tdb->file->inode = st.st_ino;
+		tdb->file->map_ptr = NULL;
+		tdb->file->map_size = sizeof(struct tdb_header);
+	}
+
+	/* ensure there is only one process initialising at once */
+	ecode = tdb_lock_open(tdb, openlock, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
+	if (ecode != TDB_SUCCESS) {
+		saved_errno = errno;
+		goto fail_errno;
+	}
+
+	/* call their open hook if they gave us one. */
+	if (openhook) {
+		ecode = openhook->fn(tdb->file->fd, openhook->data);
+		if (ecode != TDB_SUCCESS) {
+			tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				   "tdb_open: open hook failed");
+			goto fail;
+		}
+		open_flags |= O_CREAT;
+	}
+
+	/* If they used O_TRUNC, read will return 0. */
+	rlen = pread(tdb->file->fd, &hdr, sizeof(hdr), 0);
+	if (rlen == 0 && (open_flags & O_CREAT)) {
+		ecode = tdb_new_database(tdb, seed, &hdr);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+	} else if (rlen < 0) {
+		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: error %s reading %s",
+				   strerror(errno), name);
+		goto fail;
+	} else if (rlen < sizeof(hdr)
+		   || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
+		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: %s is not a tdb file", name);
+		goto fail;
+	}
+
+	if (hdr.version != TDB_VERSION) {
+		if (hdr.version == bswap_64(TDB_VERSION))
+			tdb->flags |= TDB_CONVERT;
+		else {
+			/* wrong version */
+			ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					   "tdb_open:"
+					   " %s is unknown version 0x%llx",
+					   name, (long long)hdr.version);
+			goto fail;
+		}
+	}
+
+	tdb_convert(tdb, &hdr, sizeof(hdr));
+	tdb->hash_seed = hdr.hash_seed;
+	hash_test = TDB_HASH_MAGIC;
+	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
+	if (hdr.hash_test != hash_test) {
+		/* wrong hash variant */
+		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open:"
+				   " %s uses a different hash function",
+				   name);
+		goto fail;
+	}
+
+	/* Clear any features we don't understand. */
+	if ((open_flags & O_ACCMODE) != O_RDONLY) {
+		hdr.features_used &= TDB_FEATURE_MASK;
+		ecode = tdb_write_convert(tdb,
+				offsetof(struct tdb_header, features_used),
+				&hdr.features_used, sizeof(hdr.features_used));
+		if (ecode != TDB_SUCCESS)
+			goto fail;
+	}
+
+	tdb_unlock_open(tdb, openlock);
+
+	/* This makes sure we have current map_size and mmap. */
+	tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+
+	/* Now it's fully formed, recover if necessary. */
+	berr = tdb_needs_recovery(tdb);
+	if (unlikely(berr != false)) {
+		if (berr < 0) {
+			ecode = berr;
+			goto fail;
+		}
+		ecode = tdb_lock_and_recover(tdb);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+	}
+
+	ecode = tdb_ftable_init(tdb);
+	if (ecode != TDB_SUCCESS) {
+		goto fail;
+	}
+
+	/* Add to linked list if we're new. */
+	if (tdb->file->refcnt == 1)
+		files = tdb->file;
+	return tdb;
+
+ fail:
+	/* Map ecode to some logical errno. */
+	switch (ecode) {
+	case TDB_ERR_CORRUPT:
+	case TDB_ERR_IO:
+		saved_errno = EIO;
+		break;
+	case TDB_ERR_LOCK:
+		saved_errno = EWOULDBLOCK;
+		break;
+	case TDB_ERR_OOM:
+		saved_errno = ENOMEM;
+		break;
+	case TDB_ERR_EINVAL:
+	default:
+		saved_errno = EINVAL;
+		break;
+	}
+
+fail_errno:
+#ifdef TDB_TRACE
+	close(tdb->tracefd);
+#endif
+	if (tdb->file) {
+		tdb_lock_cleanup(tdb);
+		if (--tdb->file->refcnt == 0) {
+			assert(tdb->file->num_lockrecs == 0);
+			if (tdb->file->map_ptr) {
+				if (tdb->flags & TDB_INTERNAL) {
+					free(tdb->file->map_ptr);
+				} else
+					tdb_munmap(tdb->file);
+			}
+			if (close(tdb->file->fd) != 0)
+				tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					   "tdb_open: failed to close tdb fd"
+					   " on error: %s", strerror(errno));
+			free(tdb->file->lockrecs);
+			free(tdb->file);
+		}
+	}
+
+	free(tdb);
+	errno = saved_errno;
+	return NULL;
+}
+
+/* Release tdb and its share of the file; returns close()'s result or 0. */
+int tdb_close(struct tdb_context *tdb)
+{
+	int ret = 0;
+
+	tdb_trace(tdb, "tdb_close");
+
+	if (tdb->transaction)
+		tdb_transaction_cancel(tdb);
+
+	if (tdb->file) {
+		struct tdb_file **i;
+		/* Drop the mapping (or the TDB_INTERNAL malloc) first. */
+		if (tdb->file->map_ptr) {
+			if (tdb->flags & TDB_INTERNAL)
+				free(tdb->file->map_ptr);
+			else
+				tdb_munmap(tdb->file);
+		}
+		tdb_lock_cleanup(tdb);
+		if (--tdb->file->refcnt == 0) {
+			ret = close(tdb->file->fd);
+
+			/* Remove from files list */
+			for (i = &files; *i; i = &(*i)->next) {
+				if (*i == tdb->file) {
+					*i = tdb->file->next;
+					break;
+				}
+			}
+			free(tdb->file->lockrecs);
+			free(tdb->file);
+		}
+	}
+
+#ifdef TDB_TRACE
+	close(tdb->tracefd);
+#endif
+	free(tdb);
+
+	return ret;
+}
diff --git a/lib/tdb2/private.h b/lib/tdb2/private.h
new file mode 100644
index 0000000000..0c3e441657
--- /dev/null
+++ b/lib/tdb2/private.h
@@ -0,0 +1,624 @@
+#ifndef TDB_PRIVATE_H
+#define TDB_PRIVATE_H
+ /*
+   Trivial Database 2: private types and prototypes
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "config.h"
+#if HAVE_FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 64
+#endif
+#include <ccan/likely/likely.h>
+#include <ccan/compiler/compiler.h>
+#include <ccan/endian/endian.h>
+#include "tdb2.h"
+
+#ifdef _SAMBA_BUILD_
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/shmem.h"
+#include "system/select.h"
+#include "system/wait.h"
+#else
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <utime.h>
+#include <unistd.h>
+#endif
+
+#ifndef TEST_IT
+#define TEST_IT(cond)
+#endif
+
+/* #define TDB_TRACE 1 */
+
+#ifndef __STRING
+#define __STRING(x)    #x
+#endif
+
+#ifndef __STRINGSTRING
+#define __STRINGSTRING(x) __STRING(x)
+#endif
+
+#ifndef __location__
+#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
+#endif
+
+typedef uint64_t tdb_len_t;
+typedef uint64_t tdb_off_t;
+
+#define TDB_MAGIC_FOOD "TDB file\n"
+#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
+#define TDB_USED_MAGIC ((uint64_t)0x1999)
+#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
+#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
+#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
+#define TDB_FREE_MAGIC ((uint64_t)0xFE)
+#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
+#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
+#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
+
+#define TDB_OFF_IS_ERR(off) unlikely(off >= (tdb_off_t)TDB_ERR_LAST)
+
+/* Packing errors into pointers and v.v. */
+#define TDB_PTR_IS_ERR(ptr) \
+	unlikely((unsigned long)(ptr) >= (unsigned long)TDB_ERR_LAST)
+#define TDB_PTR_ERR(p) ((enum TDB_ERROR)(long)(p))
+#define TDB_ERR_PTR(err) ((void *)(long)(err))
+
+/* Common case of returning true, false or -ve error. */
+typedef int tdb_bool_err;
+
+/* Prevent others from opening the file. */
+#define TDB_OPEN_LOCK 0
+/* Doing a transaction. */
+#define TDB_TRANSACTION_LOCK 1
+/* Expanding file. */
+#define TDB_EXPANSION_LOCK 2
+/* Hash chain locks. */
+#define TDB_HASH_LOCK_START 64
+
+/* Range for hash locks. */
+#define TDB_HASH_LOCK_RANGE_BITS 30
+#define TDB_HASH_LOCK_RANGE (1 << TDB_HASH_LOCK_RANGE_BITS)
+
+/* We have 1024 entries in the top level. */
+#define TDB_TOPLEVEL_HASH_BITS 10
+/* And 64 entries in each sub-level: thus 64 bits exactly after 9 levels. */
+#define TDB_SUBLEVEL_HASH_BITS 6
+/* And 8 entries in each group, ie 8 groups per sublevel. */
+#define TDB_HASH_GROUP_BITS 3
+/* This is currently 10: beyond this we chain. */
+#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
+
+/* Extend file by at least 100 times more than needed. */
+#define TDB_EXTENSION_FACTOR 100
+
+/* We steal bits from the offsets to store hash info. */
+#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
+/* We steal this many upper bits, giving a maximum offset of 64 petabytes. */
+#define TDB_OFF_UPPER_STEAL 8
+#define   TDB_OFF_UPPER_STEAL_EXTRA 7
+/* The bit number where we store extra hash bits. */
+#define TDB_OFF_HASH_EXTRA_BIT 57
+#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
+
+/* Additional features we understand.  Currently: none. */
+#define TDB_FEATURE_MASK ((uint64_t)0)
+
+/* The bit number where we store the extra hash bits. */
+/* Convenience mask to get actual offset. */
+#define TDB_OFF_MASK \
+	(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
+
+/* How many buckets in a free list: see size_to_bucket(). */
+#define TDB_FREE_BUCKETS (64 - TDB_OFF_UPPER_STEAL)
+
+/* We have to be able to fit a free record here. */
+#define TDB_MIN_DATA_LEN	\
+	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
+
+/* Indicates this entry is not on an flist (can happen during coalescing) */
+#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
+
+struct tdb_used_record {
+	/* For on-disk compatibility, we avoid bitfields:
+	   magic: 16,        (highest)
+	   key_len_bits: 5,
+	   extra_padding: 32
+	   hash_bits: 11
+	*/
+        uint64_t magic_and_meta;
+	/* The bottom key_len_bits*2 are key length, rest is data length. */
+        uint64_t key_and_data_len;
+};
+
+static inline unsigned rec_key_bits(const struct tdb_used_record *r)
+{
+	return (unsigned)((r->magic_and_meta >> 43) & 0x1f) << 1;
+}
+
+static inline uint64_t rec_key_length(const struct tdb_used_record *r)
+{
+	return r->key_and_data_len & ~(UINT64_MAX << rec_key_bits(r));
+}
+
+static inline uint64_t rec_data_length(const struct tdb_used_record *r)
+{
+	return r->key_and_data_len >> rec_key_bits(r); /* the rest is data */
+}
+
+static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
+{
+	return (uint32_t)(r->magic_and_meta >> 11);
+}
+
+static inline uint32_t rec_hash(const struct tdb_used_record *r)
+{
+	return (uint32_t)r->magic_and_meta & 0x7ff;
+}
+
+static inline uint16_t rec_magic(const struct tdb_used_record *r)
+{
+	return (r->magic_and_meta >> 48); /* top 16 bits */
+}
+
+struct tdb_free_record {
+        uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
+        uint64_t ftable_and_len; /* Len not counting these two fields. */
+	/* This is why the minimum record size is 8 bytes.  */
+	uint64_t next;
+};
+
+static inline uint64_t frec_prev(const struct tdb_free_record *f)
+{
+	return f->magic_and_prev & (UINT64_MAX >> TDB_OFF_UPPER_STEAL);
+}
+
+static inline uint64_t frec_magic(const struct tdb_free_record *f)
+{
+	return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL); /* top bits */
+}
+
+static inline uint64_t frec_len(const struct tdb_free_record *f)
+{
+	return f->ftable_and_len & (UINT64_MAX >> TDB_OFF_UPPER_STEAL);
+}
+
+static inline unsigned frec_ftable(const struct tdb_free_record *f)
+{
+	return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL); /* top bits */
+}
+
+struct tdb_recovery_record {
+	uint64_t magic;
+	/* Length of record (add this header to get total length). */
+	uint64_t max_len;
+	/* Length used. */
+	uint64_t len;
+	/* Old length of file before transaction. */
+	uint64_t eof;
+};
+
+/* If we bottom out of the subhashes, we chain. */
+struct tdb_chain {
+	tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
+	tdb_off_t next;
+};
+
+/* this is stored at the front of every database */
+struct tdb_header {
+	char magic_food[64]; /* for /etc/magic */
+	/* FIXME: Make me 32 bit? */
+	uint64_t version; /* version of the code */
+	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
+	uint64_t hash_seed; /* "random" seed written at creation time. */
+	tdb_off_t free_table; /* (First) free table. */
+	tdb_off_t recovery; /* Transaction recovery area. */
+
+	uint64_t features_used; /* Features all writers understand */
+	uint64_t features_offered; /* Features offered */
+
+	uint64_t seqnum; /* Sequence number for TDB_SEQNUM */
+
+	tdb_off_t reserved[23];
+
+	/* Top level hash table. */
+	tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
+};
+
+struct tdb_freetable {
+	struct tdb_used_record hdr;
+	tdb_off_t next;
+	tdb_off_t buckets[TDB_FREE_BUCKETS];
+};
+
+/* Information about a particular (locked) hash entry. */
+struct hash_info {
+	/* Full hash value of entry. */
+	uint64_t h;
+	/* Start and length of lock acquired. */
+	tdb_off_t hlock_start;
+	tdb_len_t hlock_range;
+	/* Start of hash group. */
+	tdb_off_t group_start;
+	/* Bucket we belong in. */
+	unsigned int home_bucket;
+	/* Bucket we (or an empty space) were found in. */
+	unsigned int found_bucket;
+	/* How many bits of the hash are already used. */
+	unsigned int hash_used;
+	/* Current working group. */
+	tdb_off_t group[1 << TDB_HASH_GROUP_BITS];
+};
+
+struct traverse_info {
+	struct traverse_level {
+		tdb_off_t hashtable;
+		/* We ignore groups here, and treat it as a big array. */
+		unsigned entry;
+		unsigned int total_buckets;
+	} levels[TDB_MAX_LEVELS + 1];
+	unsigned int num_levels;
+	unsigned int toplevel_group;
+	/* This makes delete-everything-inside-traverse work as expected. */
+	tdb_off_t prev;
+};
+
+enum tdb_lock_flags {
+	/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
+	TDB_LOCK_NOWAIT = 0,
+	TDB_LOCK_WAIT = 1,
+	/* If set, don't log an error on failure. */
+	TDB_LOCK_PROBE = 2,
+	/* If set, don't check for recovery (used by recovery code). */
+	TDB_LOCK_NOCHECK = 4,
+};
+
+struct tdb_lock {
+	struct tdb_context *owner;
+	uint32_t off;
+	uint32_t count;
+	uint32_t ltype;
+};
+
+/* This is only needed for tdb_access_commit, but used everywhere to
+ * simplify. */
+struct tdb_access_hdr {
+	struct tdb_access_hdr *next;
+	tdb_off_t off;
+	tdb_len_t len;
+	bool convert;
+};
+
+struct tdb_file {
+	/* Single list of all TDBs, to detect multiple opens. */
+	struct tdb_file *next;
+
+	/* How many are sharing us? */
+	unsigned int refcnt;
+
+	/* Mmap (if any), or malloc (for TDB_INTERNAL). */
+	void *map_ptr;
+
+	/* How much space has been mapped (<= current file size) */
+	tdb_len_t map_size;
+
+	/* The file descriptor (-1 for TDB_INTERNAL). */
+	int fd;
+
+	/* Lock information */
+	pid_t locker;
+	struct tdb_lock allrecord_lock;
+	size_t num_lockrecs;
+	struct tdb_lock *lockrecs;
+
+	/* Identity of this file. */
+	dev_t device;
+	ino_t inode;
+};
+
+struct tdb_context {
+	/* Filename of the database. */
+	const char *name;
+
+	/* Are we accessing directly? (debugging check). */
+	int direct_access;
+
+	/* Operating read-only? (Opened O_RDONLY, or in traverse_read) */
+	bool read_only;
+
+	/* mmap read only? */
+	int mmap_flags;
+
+	/* the flags passed to tdb_open, for tdb_reopen. */
+	uint32_t flags;
+
+	/* Logging function */
+	void (*log_fn)(struct tdb_context *tdb,
+		       enum tdb_log_level level,
+		       const char *message,
+		       void *data);
+	void *log_data;
+
+	/* Hash function. */
+	uint64_t (*hash_fn)(const void *key, size_t len, uint64_t seed, void *);
+	void *hash_data;
+	uint64_t hash_seed;
+
+	/* low level (fcntl) lock functions. */
+	int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *);
+	int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *);
+	void *lock_data;
+
+	/* Set if we are in a transaction. */
+	struct tdb_transaction *transaction;
+
+	/* What free table are we using? */
+	tdb_off_t ftable_off;
+	unsigned int ftable;
+
+	/* IO methods: changes for transactions. */
+	const struct tdb_methods *methods;
+
+	/* Our statistics. */
+	struct tdb_attribute_stats stats;
+
+	/* Direct access information */
+	struct tdb_access_hdr *access;
+
+	/* Last error we returned. */
+	enum TDB_ERROR last_error;
+
+	/* The actual file information */
+	struct tdb_file *file;
+};
+
+struct tdb_methods {
+	enum TDB_ERROR (*tread)(struct tdb_context *, tdb_off_t, void *,
+				tdb_len_t);
+	enum TDB_ERROR (*twrite)(struct tdb_context *, tdb_off_t, const void *,
+				 tdb_len_t);
+	enum TDB_ERROR (*oob)(struct tdb_context *, tdb_off_t, bool);
+	enum TDB_ERROR (*expand_file)(struct tdb_context *, tdb_len_t);
+	void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
+};
+
+/*
+  internal prototypes
+*/
+/* hash.c: */
+tdb_bool_err first_in_hash(struct tdb_context *tdb,
+			   struct traverse_info *tinfo,
+			   TDB_DATA *kbuf, size_t *dlen);
+
+tdb_bool_err next_in_hash(struct tdb_context *tdb,
+			  struct traverse_info *tinfo,
+			  TDB_DATA *kbuf, size_t *dlen);
+
+/* Hash random memory. */
+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len);
+
+/* Hash on disk. */
+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off);
+
+/* Find and lock a hash entry (or where it would be). */
+tdb_off_t find_and_lock(struct tdb_context *tdb,
+			struct tdb_data key,
+			int ltype,
+			struct hash_info *h,
+			struct tdb_used_record *rec,
+			struct traverse_info *tinfo);
+
+enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
+			       struct hash_info *h,
+			       tdb_off_t new_off);
+
+enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
+			   tdb_off_t new_off);
+
+enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
+
+/* For tdb_check */
+bool is_subhash(tdb_off_t val);
+
+/* free.c: */
+enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb);
+
+/* check.c needs these to iterate through free lists. */
+tdb_off_t first_ftable(struct tdb_context *tdb);
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
+
+/* This returns space or -ve error number. */
+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
+		uint64_t hash, unsigned magic, bool growing);
+
+/* Put this record in a free list. */
+enum TDB_ERROR add_free_record(struct tdb_context *tdb,
+			       tdb_off_t off, tdb_len_t len_with_header,
+			       enum tdb_lock_flags waitflag,
+			       bool coalesce_ok);
+
+/* Set up header for a used/ftable/htable/chain record. */
+enum TDB_ERROR set_header(struct tdb_context *tdb,
+			  struct tdb_used_record *rec,
+			  unsigned magic, uint64_t keylen, uint64_t datalen,
+			  uint64_t actuallen, unsigned hashlow);
+
+/* Used by tdb_check to verify. */
+unsigned int size_to_bucket(tdb_len_t data_len);
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
+
+/* Used by tdb_summary */
+tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off);
+
+/* io.c: */
+/* Initialize tdb->methods. */
+void tdb_io_init(struct tdb_context *tdb);
+
+/* Convert endian of the buffer if required. */
+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
+
+/* Unmap and try to map the tdb. */
+void tdb_munmap(struct tdb_file *file);
+void tdb_mmap(struct tdb_context *tdb);
+
+/* Either alloc a copy, or give direct access.  Release frees or noop. */
+const void *tdb_access_read(struct tdb_context *tdb,
+			    tdb_off_t off, tdb_len_t len, bool convert);
+void *tdb_access_write(struct tdb_context *tdb,
+		       tdb_off_t off, tdb_len_t len, bool convert);
+
+/* Release result of tdb_access_read/write. */
+void tdb_access_release(struct tdb_context *tdb, const void *p);
+/* Commit result of tdb_access_write. */
+enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p);
+
+/* Convenience routine to get an offset. */
+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off);
+
+/* Write an offset at an offset. */
+enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, tdb_off_t off,
+			     tdb_off_t val);
+
+/* Clear an ondisk area. */
+enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len);
+
+/* Return a non-zero offset between >= start < end in this array (or end). */
+tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb,
+			       tdb_off_t base,
+			       uint64_t start,
+			       uint64_t end);
+
+/* Return a zero offset in this array, or num. */
+tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
+			    uint64_t num);
+
+/* Allocate and make a copy of some offset. */
+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
+
+/* Writes a converted copy of a record. */
+enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
+				 const void *rec, size_t len);
+
+/* Reads record and converts it */
+enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
+				void *rec, size_t len);
+
+/* Bump the seqnum (caller checks for tdb->flags & TDB_SEQNUM) */
+void tdb_inc_seqnum(struct tdb_context *tdb);
+
+/* lock.c: */
+/* Lock/unlock a range of hashes. */
+enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
+			       tdb_off_t hash_lock, tdb_len_t hash_range,
+			       int ltype, enum tdb_lock_flags waitflag);
+enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
+				 tdb_off_t hash_lock,
+				 tdb_len_t hash_range, int ltype);
+
+/* For closing the file. */
+void tdb_lock_cleanup(struct tdb_context *tdb);
+
+/* Lock/unlock a particular free bucket. */
+enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
+				    enum tdb_lock_flags waitflag);
+void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off);
+
+/* Serialize transaction start. */
+enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype);
+void tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
+
+/* Do we have any hash locks (ie. via tdb_chainlock) ? */
+bool tdb_has_hash_locks(struct tdb_context *tdb);
+
+/* Lock entire database. */
+enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+				  enum tdb_lock_flags flags, bool upgradable);
+void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
+enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb);
+
+/* Serialize db open. */
+enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
+			     int ltype, enum tdb_lock_flags flags);
+void tdb_unlock_open(struct tdb_context *tdb, int ltype);
+bool tdb_has_open_lock(struct tdb_context *tdb);
+
+/* Serialize db expand. */
+enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype);
+void tdb_unlock_expand(struct tdb_context *tdb, int ltype);
+bool tdb_has_expansion_lock(struct tdb_context *tdb);
+
+/* If it needs recovery, grab all the locks and do it. */
+enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb);
+
+/* Default lock and unlock functions. */
+int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *);
+int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *);
+
+/* transaction.c: */
+enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb);
+tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb);
+
+/* tdb.c: */
+enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb,
+			       enum TDB_ERROR ecode,
+			       enum tdb_log_level level,
+			       const char *fmt, ...);
+
+#ifdef TDB_TRACE
+void tdb_trace(struct tdb_context *tdb, const char *op);
+void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
+void tdb_trace_open(struct tdb_context *tdb, const char *op,
+		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
+void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
+void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
+void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
+		    TDB_DATA rec);
+void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
+			TDB_DATA rec, int ret);
+void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
+			   TDB_DATA rec, TDB_DATA ret);
+void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
+			     TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
+			     int ret);
+void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
+			   TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
+#else
+#define tdb_trace(tdb, op)
+#define tdb_trace_seqnum(tdb, seqnum, op)
+#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
+#define tdb_trace_ret(tdb, op, ret)
+#define tdb_trace_retrec(tdb, op, ret)
+#define tdb_trace_1rec(tdb, op, rec)
+#define tdb_trace_1rec_ret(tdb, op, rec, ret)
+#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
+#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
+#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
+#endif /* !TDB_TRACE */
+
+#endif
diff --git a/lib/tdb2/pytdb.c b/lib/tdb2/pytdb.c
new file mode 100644
index 0000000000..c760045508
--- /dev/null
+++ b/lib/tdb2/pytdb.c
@@ -0,0 +1,586 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   Python interface to tdb2.  Simply modified from tdb1 version.
+
+   Copyright (C) 2004-2006 Tim Potter <tpot@samba.org>
+   Copyright (C) 2007-2008 Jelmer Vernooij <jelmer@samba.org>
+   Copyright (C) 2011 Rusty Russell <rusty@rustcorp.com.au>
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Python.h>
+#include "replace.h"
+#include "system/filesys.h"
+
+#ifndef Py_RETURN_NONE
+#define Py_RETURN_NONE return Py_INCREF(Py_None), Py_None
+#endif
+
+/* Include tdb headers */
+#include <tdb2.h>
+
+typedef struct {	/* Python object wrapping one tdb_context */
+	PyObject_HEAD
+	struct tdb_context *ctx;	/* handle obtained from tdb_open() in py_tdb_open() */
+	bool closed;	/* set by obj_close(); tdb_object_dealloc() skips tdb_close() when true */
+} PyTdbObject;
+
+staticforward PyTypeObject PyTdb;
+static void PyErr_SetTDBError(enum TDB_ERROR e)	/* raises RuntimeError((code, message)) */
+{
+	PyObject *v = Py_BuildValue("(i,s)", e, tdb_errorstr(e));
+	PyErr_SetObject(PyExc_RuntimeError, v);	/* takes its own reference to v */
+	Py_XDECREF(v);	/* release ours; X-variant because Py_BuildValue() may return NULL */
+}
+/* Borrowed view of a Python string's bytes as TDB_DATA: nothing is copied or validated here. */
+static TDB_DATA PyString_AsTDB_DATA(PyObject *data)
+{
+	return (TDB_DATA) {
+		.dptr = (unsigned char *)PyString_AsString(data),
+		.dsize = PyString_Size(data),
+	};
+}
+/* Copy data into a new Python string and free() data.dptr; the copy is returned unchecked. */
+static PyObject *PyString_FromTDB_DATA(TDB_DATA data)
+{
+	PyObject *ret = PyString_FromStringAndSize((const char *)data.dptr,
+						   data.dsize);
+	free(data.dptr);	/* the buffer is released whether or not the copy succeeded */
+	return ret;
+}
+
+#define PyErr_TDB_ERROR_IS_ERR_RAISE(ret) \
+	if (ret != TDB_SUCCESS) { \
+		PyErr_SetTDBError(ret); \
+		return NULL; \
+	}
+/* Log callback installed by py_tdb_open(): writes "<tdb name>:<message>" to stderr. */
+static void stderr_log(struct tdb_context *tdb,
+		       enum tdb_log_level level,
+		       const char *message,
+		       void *data)
+{
+	fprintf(stderr, "%s:%s\n", tdb_name(tdb), message);	/* level and data are unused */
+}
+/* Tdb(name, tdb_flags, flags, mode), all optional: open a database; also exported as tdb.open(). */
+static PyObject *py_tdb_open(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+	char *name = NULL;
+	int tdb_flags = TDB_DEFAULT, flags = O_RDWR, mode = 0600;
+	struct tdb_context *ctx;
+	PyTdbObject *ret;
+	union tdb_attribute logattr;
+	const char *kwnames[] = { "name", "tdb_flags", "flags", "mode", NULL };
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|siii", (char **)kwnames, &name, &tdb_flags, &flags, &mode))
+		return NULL;
+	/* No name given: ask for an internal database by forcing TDB_INTERNAL. */
+	if (name == NULL) {
+		tdb_flags |= TDB_INTERNAL;
+	}
+	/* One-element attribute list that routes tdb log messages to stderr_log(). */
+	logattr.log.base.attr = TDB_ATTRIBUTE_LOG;
+	logattr.log.base.next = NULL;
+	logattr.log.fn = stderr_log;
+	ctx = tdb_open(name, tdb_flags, flags, mode, &logattr);
+	if (ctx == NULL) {
+		PyErr_SetFromErrno(PyExc_IOError);
+		return NULL;
+	}
+	/* NOTE(review): always instantiates PyTdb; the `type` argument is not consulted. */
+	ret = PyObject_New(PyTdbObject, &PyTdb);
+	if (!ret) {
+		tdb_close(ctx);
+		return NULL;
+	}
+
+	ret->ctx = ctx;
+	ret->closed = false;
+	return (PyObject *)ret;
+}
+/* S.transaction_cancel(): calls tdb_transaction_cancel() on self->ctx; always returns None. */
+static PyObject *obj_transaction_cancel(PyTdbObject *self)
+{
+	tdb_transaction_cancel(self->ctx);
+	Py_RETURN_NONE;
+}
+/* S.transaction_commit(): returns None, or raises RuntimeError if tdb_transaction_commit() fails. */
+static PyObject *obj_transaction_commit(PyTdbObject *self)
+{
+	enum TDB_ERROR ret = tdb_transaction_commit(self->ctx);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.transaction_prepare_commit(): returns None, or raises RuntimeError on a TDB error. */
+static PyObject *obj_transaction_prepare_commit(PyTdbObject *self)
+{
+	enum TDB_ERROR ret = tdb_transaction_prepare_commit(self->ctx);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.transaction_start(): returns None, or raises RuntimeError if tdb_transaction_start() fails. */
+static PyObject *obj_transaction_start(PyTdbObject *self)
+{
+	enum TDB_ERROR ret = tdb_transaction_start(self->ctx);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.lock_all(): returns None, or raises RuntimeError if tdb_lockall() fails. */
+static PyObject *obj_lockall(PyTdbObject *self)
+{
+	enum TDB_ERROR ret = tdb_lockall(self->ctx);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.unlock_all(): calls tdb_unlockall(); no status is checked, always returns None. */
+static PyObject *obj_unlockall(PyTdbObject *self)
+{
+	tdb_unlockall(self->ctx);
+	Py_RETURN_NONE;
+}
+/* S.read_lock_all(): returns None, or raises RuntimeError if tdb_lockall_read() fails. */
+static PyObject *obj_lockall_read(PyTdbObject *self)
+{
+	enum TDB_ERROR ret = tdb_lockall_read(self->ctx);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.read_unlock_all(): calls tdb_unlockall_read(); no status is checked, always returns None. */
+static PyObject *obj_unlockall_read(PyTdbObject *self)
+{
+	tdb_unlockall_read(self->ctx);
+	Py_RETURN_NONE;
+}
+/* S.close(): close the database once; calls made after that are no-ops returning None. */
+static PyObject *obj_close(PyTdbObject *self)
+{
+	enum TDB_ERROR ret;
+	if (self->closed)
+		Py_RETURN_NONE;
+	ret = tdb_close(self->ctx);
+	self->closed = true;	/* marked closed before the result is examined */
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.get(key) -> value, or None if the key is absent; other TDB errors raise RuntimeError. */
+static PyObject *obj_get(PyTdbObject *self, PyObject *args)
+{
+	TDB_DATA key, data;
+	PyObject *py_key;
+	enum TDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "O", &py_key) || !PyString_AsString(py_key))
+		return NULL;	/* bad arguments or non-string key; exception already set */
+
+	key = PyString_AsTDB_DATA(py_key);
+	ret = tdb_fetch(self->ctx, key, &data);
+	if (ret == TDB_ERR_NOEXIST)
+		Py_RETURN_NONE;
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	return PyString_FromTDB_DATA(data);	/* copies the value, then frees data.dptr */
+}
+/* S.append(key, value): hand both byte strings to tdb_append(); returns None or raises. */
+static PyObject *obj_append(PyTdbObject *self, PyObject *args)
+{
+	TDB_DATA key, data;
+	PyObject *py_key, *py_data;
+	enum TDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "OO", &py_key, &py_data))
+		return NULL;
+	key = PyString_AsTDB_DATA(py_key);
+	data = PyString_AsTDB_DATA(py_data);
+	if (key.dptr == NULL || data.dptr == NULL)
+		return NULL;	/* non-string argument; PyString_AsString() already raised */
+	ret = tdb_append(self->ctx, key, data);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.firstkey() -> key from tdb_firstkey(), or None when it reports TDB_ERR_NOEXIST. */
+static PyObject *obj_firstkey(PyTdbObject *self)
+{
+	enum TDB_ERROR ret;
+	TDB_DATA key;
+
+	ret = tdb_firstkey(self->ctx, &key);
+	if (ret == TDB_ERR_NOEXIST)
+		Py_RETURN_NONE;
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+
+	return PyString_FromTDB_DATA(key);	/* copies the key, then frees key.dptr */
+}
+/* S.nextkey(key) -> key following `key`, or None when tdb_nextkey() reports TDB_ERR_NOEXIST. */
+static PyObject *obj_nextkey(PyTdbObject *self, PyObject *args)
+{
+	TDB_DATA key;
+	PyObject *py_key;
+	enum TDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "O", &py_key) || !PyString_AsString(py_key))
+		return NULL;	/* bad arguments or non-string key; exception already set */
+
+	/* Malloc here, since tdb_nextkey frees. */
+	key.dsize = PyString_Size(py_key);
+	key.dptr = malloc(key.dsize + 1);	/* +1 so an empty key never asks for malloc(0) */
+	if (key.dptr == NULL)
+		return PyErr_NoMemory();
+	memcpy(key.dptr, PyString_AsString(py_key), key.dsize);
+	ret = tdb_nextkey(self->ctx, &key);
+	if (ret == TDB_ERR_NOEXIST)
+		Py_RETURN_NONE;
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	return PyString_FromTDB_DATA(key);	/* copies the new key, then frees key.dptr */
+}
+/* S.delete(key): remove a record via tdb_delete(); returns None or raises RuntimeError. */
+static PyObject *obj_delete(PyTdbObject *self, PyObject *args)
+{
+	TDB_DATA key;
+	PyObject *py_key;
+	enum TDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "O", &py_key) || !PyString_AsString(py_key))
+		return NULL;	/* bad arguments or non-string key; exception already set */
+
+	key = PyString_AsTDB_DATA(py_key);
+	ret = tdb_delete(self->ctx, key);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.has_key(key) -> True/False.  tdb_exists() yields a C bool, not a TDB_ERROR code. */
+static PyObject *obj_has_key(PyTdbObject *self, PyObject *args)
+{
+	TDB_DATA key;
+	PyObject *py_key, *result;
+	if (!PyArg_ParseTuple(args, "O", &py_key) || !PyString_AsString(py_key))
+		return NULL;	/* bad arguments or non-string key; exception already set */
+
+	key = PyString_AsTDB_DATA(py_key);
+
+	/* A failed lookup also comes back as false: tdb_exists() only records */
+	/* the error code in the context's last_error field. */
+	result = tdb_exists(self->ctx, key) ? Py_True : Py_False;
+	Py_INCREF(result);	/* the caller must receive its own reference to the singleton */
+	return result;
+}
+/* S.store(key, value, flag=REPLACE): write a record through tdb_store(); returns None. */
+static PyObject *obj_store(PyTdbObject *self, PyObject *args)
+{
+	TDB_DATA key, value;
+	enum TDB_ERROR ret;
+	int flag = TDB_REPLACE;
+	PyObject *py_key, *py_value;
+
+	if (!PyArg_ParseTuple(args, "OO|i", &py_key, &py_value, &flag))
+		return NULL;
+	key = PyString_AsTDB_DATA(py_key);
+	value = PyString_AsTDB_DATA(py_value);
+	if (key.dptr == NULL || value.dptr == NULL)
+		return NULL;	/* non-string argument; PyString_AsString() already raised */
+	ret = tdb_store(self->ctx, key, value, flag);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.add_flag(flag): parse one unsigned int and pass it to tdb_add_flag(); returns None. */
+static PyObject *obj_add_flag(PyTdbObject *self, PyObject *args)
+{
+	unsigned flag;
+
+	if (!PyArg_ParseTuple(args, "I", &flag))
+		return NULL;
+
+	tdb_add_flag(self->ctx, flag);	/* void call: failures are not surfaced to Python here */
+	Py_RETURN_NONE;
+}
+/* S.remove_flag(flag): parse one unsigned int and pass it to tdb_remove_flag(); returns None. */
+static PyObject *obj_remove_flag(PyTdbObject *self, PyObject *args)
+{
+	unsigned flag;
+
+	if (!PyArg_ParseTuple(args, "I", &flag))
+		return NULL;
+
+	tdb_remove_flag(self->ctx, flag);	/* void call: failures are not surfaced to Python here */
+	Py_RETURN_NONE;
+}
+/* State of one key iteration over a Tdb (see tdb_object_iter() and tdb_iter_next()). */
+typedef struct {
+	PyObject_HEAD
+	TDB_DATA current;	/* key the next tdb_iter_next() call will hand out */
+	bool end;	/* true once tdb_firstkey()/tdb_nextkey() reported TDB_ERR_NOEXIST */
+	PyTdbObject *iteratee;	/* database being walked; referenced via Py_INCREF in tdb_object_iter() */
+} PyTdbIteratorObject;
+static PyObject *tdb_iter_next(PyTdbIteratorObject *self) /* tp_iternext: yield current key, advance */
+{
+	enum TDB_ERROR e;
+	PyObject *ret;
+	if (self->end)
+		return NULL;	/* iteration already finished */
+	ret = PyString_FromStringAndSize((const char *)self->current.dptr,
+					 self->current.dsize);
+	e = tdb_nextkey(self->iteratee->ctx, &self->current);
+	self->end = (e == TDB_ERR_NOEXIST);	/* no further key: stop on the next call */
+	if (self->end || e == TDB_SUCCESS)
+		return ret;
+	Py_XDECREF(ret);	/* real error: do not leak the key string built above */
+	PyErr_SetTDBError(e);
+	return NULL;
+}
+/* tp_dealloc: drop the reference on the iterated Tdb, then free the iterator object. */
+static void tdb_iter_dealloc(PyTdbIteratorObject *self)
+{
+	Py_DECREF(self->iteratee);
+	PyObject_Del(self);	/* NOTE(review): self->current.dptr is not released here - confirm no key can still be held */
+}
+/* Type of the objects returned by tdb_object_iter(); inittdb() readies it but never exports it. */
+PyTypeObject PyTdbIterator = {
+	.tp_name = "Iterator",
+	.tp_basicsize = sizeof(PyTdbIteratorObject),
+	.tp_iternext = (iternextfunc)tdb_iter_next,
+	.tp_dealloc = (destructor)tdb_iter_dealloc,
+	.tp_flags = Py_TPFLAGS_DEFAULT,
+	.tp_iter = PyObject_SelfIter,	/* iter(it) returns the iterator itself */
+};
+/* iter(S) / S.iterkeys(): new iterator positioned on the first key (or already at the end). */
+static PyObject *tdb_object_iter(PyTdbObject *self)
+{
+	PyTdbIteratorObject *ret;
+	enum TDB_ERROR e;
+	ret = PyObject_New(PyTdbIteratorObject, &PyTdbIterator);
+	if (!ret)
+		return NULL;
+	/* Bind the database first: tdb_iter_dealloc() DECREFs iteratee, so it must be valid. */
+	ret->iteratee = self;
+	Py_INCREF(self);
+	e = tdb_firstkey(self->ctx, &ret->current);
+	ret->end = (e == TDB_ERR_NOEXIST);
+	if (e != TDB_SUCCESS && !ret->end) {
+		PyErr_SetTDBError(e);
+		Py_DECREF(ret);	/* release the half-built iterator instead of leaking it */
+		return NULL;
+	}
+	return (PyObject *)ret;
+}
+/* S.clear(): calls tdb_wipe_all(); returns None, or raises RuntimeError on a TDB error. */
+static PyObject *obj_clear(PyTdbObject *self)
+{
+	enum TDB_ERROR ret = tdb_wipe_all(self->ctx);
+	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+/* S.enable_seqnum(): shorthand for tdb_add_flag(ctx, TDB_SEQNUM); always returns None. */
+static PyObject *obj_enable_seqnum(PyTdbObject *self)
+{
+	tdb_add_flag(self->ctx, TDB_SEQNUM);
+	Py_RETURN_NONE;
+}
+/* Method table of Tdb objects; the calling convention must match each C function's signature. */
+static PyMethodDef tdb_object_methods[] = {
+	{ "transaction_cancel", (PyCFunction)obj_transaction_cancel, METH_NOARGS,
+		"S.transaction_cancel() -> None\n"
+		"Cancel the currently active transaction." },
+	{ "transaction_commit", (PyCFunction)obj_transaction_commit, METH_NOARGS,
+		"S.transaction_commit() -> None\n"
+		"Commit the currently active transaction." },
+	{ "transaction_prepare_commit", (PyCFunction)obj_transaction_prepare_commit, METH_NOARGS,
+		"S.transaction_prepare_commit() -> None\n"
+		"Prepare to commit the currently active transaction" },
+	{ "transaction_start", (PyCFunction)obj_transaction_start, METH_NOARGS,
+		"S.transaction_start() -> None\n"
+		"Start a new transaction." },
+	{ "lock_all", (PyCFunction)obj_lockall, METH_NOARGS, NULL },
+	{ "unlock_all", (PyCFunction)obj_unlockall, METH_NOARGS, NULL },
+	{ "read_lock_all", (PyCFunction)obj_lockall_read, METH_NOARGS, NULL },
+	{ "read_unlock_all", (PyCFunction)obj_unlockall_read, METH_NOARGS, NULL },
+	{ "close", (PyCFunction)obj_close, METH_NOARGS, NULL },
+	{ "get", (PyCFunction)obj_get, METH_VARARGS, "S.get(key) -> value\n"
+		"Fetch a value." },
+	{ "append", (PyCFunction)obj_append, METH_VARARGS, "S.append(key, value) -> None\n"
+		"Append data to an existing key." },
+	{ "firstkey", (PyCFunction)obj_firstkey, METH_NOARGS, "S.firstkey() -> data\n"
+		"Return the first key in this database." },
+	{ "nextkey", (PyCFunction)obj_nextkey, METH_VARARGS, "S.nextkey(key) -> data\n"
+		"Return the next key in this database." },
+	{ "delete", (PyCFunction)obj_delete, METH_VARARGS, "S.delete(key) -> None\n"
+		"Delete an entry." },
+	{ "has_key", (PyCFunction)obj_has_key, METH_VARARGS, "S.has_key(key) -> bool\n"
+		"Check whether key exists in this database." },
+	{ "store", (PyCFunction)obj_store, METH_VARARGS, "S.store(key, data, flag=REPLACE) -> None\n"
+		"Store data." },
+	{ "add_flag", (PyCFunction)obj_add_flag, METH_VARARGS, "S.add_flag(flag) -> None" },
+	{ "remove_flag", (PyCFunction)obj_remove_flag, METH_VARARGS, "S.remove_flag(flag) -> None" },
+	{ "iterkeys", (PyCFunction)tdb_object_iter, METH_NOARGS, "S.iterkeys() -> iterator" },
+	{ "clear", (PyCFunction)obj_clear, METH_NOARGS, "S.clear() -> None\n"
+		"Wipe the entire database." },
+	{ "enable_seqnum", (PyCFunction)obj_enable_seqnum, METH_NOARGS,
+		"S.enable_seqnum() -> None" },
+	{ NULL }
+};
+/* Getter for S.flags: the value of tdb_get_flags() as a Python int. */
+static PyObject *obj_get_flags(PyTdbObject *self, void *closure)
+{
+	return PyInt_FromLong(tdb_get_flags(self->ctx));
+}
+/* Getter for S.filename: the string returned by tdb_name() as a Python string. */
+static PyObject *obj_get_filename(PyTdbObject *self, void *closure)
+{
+	return PyString_FromString(tdb_name(self->ctx));
+}
+/* Getter for S.seqnum: the value of tdb_get_seqnum() as a Python int. */
+static PyObject *obj_get_seqnum(PyTdbObject *self, void *closure)
+{
+	return PyInt_FromLong(tdb_get_seqnum(self->ctx));
+}
+
+/* Read-only attributes of Tdb objects: every entry has a getter and a NULL setter. */
+static PyGetSetDef tdb_object_getsetters[] = {
+	{ (char *)"flags", (getter)obj_get_flags, NULL, NULL },
+	{ (char *)"filename", (getter)obj_get_filename, NULL, (char *)"The filename of this TDB file."},
+	{ (char *)"seqnum", (getter)obj_get_seqnum, NULL, NULL },
+	{ NULL }
+};
+/* repr(S): "Tdb(<internal>)" when TDB_INTERNAL is set, otherwise "Tdb('<name>')". */
+static PyObject *tdb_object_repr(PyTdbObject *self)
+{
+	const bool is_internal = (tdb_get_flags(self->ctx) & TDB_INTERNAL) != 0;
+
+	if (is_internal)
+		return PyString_FromString("Tdb(<internal>)");
+	return PyString_FromFormat("Tdb('%s')", tdb_name(self->ctx));
+}
+/* tp_dealloc: close the database unless obj_close() already did, then free the object. */
+static void tdb_object_dealloc(PyTdbObject *self)
+{
+	if (!self->closed)
+		tdb_close(self->ctx);	/* result ignored */
+	self->ob_type->tp_free(self);
+}
+/* S[key]: fetch a record; TypeError for a non-string key, KeyError when it is absent. */
+static PyObject *obj_getitem(PyTdbObject *self, PyObject *key)
+{
+	TDB_DATA lookup, found;
+	enum TDB_ERROR status;
+
+	if (!PyString_Check(key)) {
+		PyErr_SetString(PyExc_TypeError, "Expected string as key");
+		return NULL;
+	}
+
+	lookup = PyString_AsTDB_DATA(key);
+	status = tdb_fetch(self->ctx, lookup, &found);
+	/* A missing record maps to KeyError rather than the generic TDB error. */
+	if (status == TDB_ERR_NOEXIST) {
+		PyErr_SetString(PyExc_KeyError, "No such TDB entry");
+		return NULL;
+	}
+	PyErr_TDB_ERROR_IS_ERR_RAISE(status);
+
+	/* PyString_FromTDB_DATA() copies the value and frees the fetched buffer. */
+	return PyString_FromTDB_DATA(found);
+}
+/* S[key] = value and del S[key]: returns 0 on success, -1 with an exception set otherwise. */
+static int obj_setitem(PyTdbObject *self, PyObject *key, PyObject *value)
+{
+	TDB_DATA tkey, tval;
+	enum TDB_ERROR ret;
+	if (!PyString_Check(key)) {
+		PyErr_SetString(PyExc_TypeError, "Expected string as key");
+		return -1;
+	}
+
+	tkey = PyString_AsTDB_DATA(key);
+	/* A NULL value selects the delete branch; anything else must be a string to store. */
+	if (value == NULL) {
+		ret = tdb_delete(self->ctx, tkey);
+	} else {
+		if (!PyString_Check(value)) {
+			PyErr_SetString(PyExc_TypeError, "Expected string as value");
+			return -1;
+		}
+
+		tval = PyString_AsTDB_DATA(value);
+
+		ret = tdb_store(self->ctx, tkey, tval, TDB_REPLACE);
+	}
+
+	if (ret != TDB_SUCCESS) {
+		PyErr_SetTDBError(ret);
+		return -1;
+	}
+
+	return ret;	/* ret == TDB_SUCCESS on this path */
+}
+/* Mapping protocol for Tdb: subscript reads via obj_getitem, assignment/deletion via obj_setitem. */
+static PyMappingMethods tdb_object_mapping = {
+	.mp_subscript = (binaryfunc)obj_getitem,
+	.mp_ass_subscript = (objobjargproc)obj_setitem,
+};
+static PyTypeObject PyTdb = {	/* the "Tdb" type exported by inittdb() */
+	.tp_name = "Tdb",
+	.tp_basicsize = sizeof(PyTdbObject),
+	.tp_methods = tdb_object_methods,
+	.tp_getset = tdb_object_getsetters,
+	.tp_new = py_tdb_open,	/* constructing a Tdb opens the database */
+	.tp_doc = "A TDB file",
+	.tp_repr = (reprfunc)tdb_object_repr,
+	.tp_dealloc = (destructor)tdb_object_dealloc,
+	.tp_as_mapping = &tdb_object_mapping,
+	.tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_ITER,
+	.tp_iter = (getiterfunc)tdb_object_iter,	/* iter(S) walks the keys */
+};
+/* Module-level functions: tdb.open() is the same C function as the Tdb constructor. */
+static PyMethodDef tdb_methods[] = {
+	{ "open", (PyCFunction)py_tdb_open, METH_VARARGS|METH_KEYWORDS, "open(name=None, tdb_flags=TDB_DEFAULT, flags=O_RDWR, mode=0600)\n"
+		"Open a TDB file." },
+	{ NULL }
+};
+/* Init function of module "tdb": readies both types, then adds constants, metadata and Tdb. */
+void inittdb(void);
+void inittdb(void)
+{
+	PyObject *m;
+
+	if (PyType_Ready(&PyTdb) < 0)
+		return;
+
+	if (PyType_Ready(&PyTdbIterator) < 0)
+		return;
+
+	m = Py_InitModule3("tdb", tdb_methods, "TDB is a simple key-value database similar to GDBM that supports multiple writers.");
+	if (m == NULL)
+		return;
+	/* Values for the flag argument of S.store(); PyModule_AddObject() results are not checked. */
+	PyModule_AddObject(m, "REPLACE", PyInt_FromLong(TDB_REPLACE));
+	PyModule_AddObject(m, "INSERT", PyInt_FromLong(TDB_INSERT));
+	PyModule_AddObject(m, "MODIFY", PyInt_FromLong(TDB_MODIFY));
+	/* TDB_* flag constants, exported under their names without the TDB_ prefix. */
+	PyModule_AddObject(m, "DEFAULT", PyInt_FromLong(TDB_DEFAULT));
+	PyModule_AddObject(m, "INTERNAL", PyInt_FromLong(TDB_INTERNAL));
+	PyModule_AddObject(m, "NOLOCK", PyInt_FromLong(TDB_NOLOCK));
+	PyModule_AddObject(m, "NOMMAP", PyInt_FromLong(TDB_NOMMAP));
+	PyModule_AddObject(m, "CONVERT", PyInt_FromLong(TDB_CONVERT));
+	PyModule_AddObject(m, "NOSYNC", PyInt_FromLong(TDB_NOSYNC));
+	PyModule_AddObject(m, "SEQNUM", PyInt_FromLong(TDB_SEQNUM));
+	PyModule_AddObject(m, "ALLOW_NESTING", PyInt_FromLong(TDB_ALLOW_NESTING));
+
+	PyModule_AddObject(m, "__docformat__", PyString_FromString("restructuredText"));
+
+	PyModule_AddObject(m, "__version__", PyString_FromString(PACKAGE_VERSION));
+
+	Py_INCREF(&PyTdb);
+	PyModule_AddObject(m, "Tdb", (PyObject *)&PyTdb);
+	/* NOTE(review): a reference is taken, but the iterator type is never added to the module. */
+	Py_INCREF(&PyTdbIterator);
+}
diff --git a/lib/tdb2/summary.c b/lib/tdb2/summary.c
new file mode 100644
index 0000000000..26cdd3e4fe
--- /dev/null
+++ b/lib/tdb2/summary.c
@@ -0,0 +1,282 @@
+ /*
+   Trivial Database 2: human-readable summary code
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+#include <ccan/tally/tally.h>
+/* Count the non-zero offsets among the (1 << bits) hash-table entries stored at hash_off. */
+static tdb_off_t count_hash(struct tdb_context *tdb,
+			    tdb_off_t hash_off, unsigned bits)
+{
+	const tdb_off_t *table;
+	tdb_off_t used = 0;
+	unsigned int slot;
+
+	table = tdb_access_read(tdb, hash_off, sizeof(*table) << bits, true);
+	if (TDB_PTR_IS_ERR(table))
+		return TDB_PTR_ERR(table);	/* error code travels back in the offset type */
+	for (slot = 0; slot < (1 << bits); slot++) {
+		if (table[slot] != 0)
+			used++;
+	}
+	tdb_access_release(tdb, table);
+	return used;
+}
+/* Walk every record after the file header and tally sizes per record type into the given tallies. */
+static enum TDB_ERROR summarize(struct tdb_context *tdb,
+				struct tally *hashes,
+				struct tally *ftables,
+				struct tally *fr,
+				struct tally *keys,
+				struct tally *data,
+				struct tally *extra,
+				struct tally *uncoal,
+				struct tally *chains)
+{
+	tdb_off_t off;
+	tdb_len_t len;
+	tdb_len_t unc = 0;
+
+	for (off = sizeof(struct tdb_header);
+	     off < tdb->file->map_size;
+	     off += len) {
+		const union {
+			struct tdb_used_record u;
+			struct tdb_free_record f;
+			struct tdb_recovery_record r;
+		} *p;
+		/* We might not be able to get the whole thing. */
+		p = tdb_access_read(tdb, off, sizeof(p->f), true);
+		if (TDB_PTR_IS_ERR(p)) {
+			return TDB_PTR_ERR(p);
+		}
+		if (frec_magic(&p->f) != TDB_FREE_MAGIC) {
+			if (unc > 1) {	/* NOTE(review): a lone free record (unc == 1) is not reset - confirm intended */
+				tally_add(uncoal, unc);
+				unc = 0;
+			}
+		}
+		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
+		    || p->r.magic == TDB_RECOVERY_MAGIC) {
+			len = sizeof(p->r) + p->r.max_len;
+		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
+			len = frec_len(&p->f);
+			tally_add(fr, len);
+			len += sizeof(p->u);
+			unc++;
+		} else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_key_length(&p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(keys, rec_key_length(&p->u));
+			tally_add(data, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
+			tdb_off_t count = count_hash(tdb,
+						     off + sizeof(p->u),
+						     TDB_SUBLEVEL_HASH_BITS);
+			if (TDB_OFF_IS_ERR(count)) {
+				tdb_access_release(tdb, p);	/* pair with tdb_access_read() above */
+				return count;
+			}
+			tally_add(hashes, count);
+			tally_add(extra, rec_extra_padding(&p->u));
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+		} else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(ftables, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(chains, 1);
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else {
+			len = dead_space(tdb, off);
+			if (TDB_OFF_IS_ERR(len)) {
+				tdb_access_release(tdb, p);	/* pair with tdb_access_read() above */
+				return len;
+			}
+		}
+		tdb_access_release(tdb, p);
+	}
+	if (unc)
+		tally_add(uncoal, unc);
+	return TDB_SUCCESS;
+}
+/* printf template filled in by tdb_summary(); each %s slot takes an optional histogram string. */
+#define SUMMARY_FORMAT \
+	"Size of file/data: %zu/%zu\n" \
+	"Number of records: %zu\n" \
+	"Smallest/average/largest keys: %zu/%zu/%zu\n%s" \
+	"Smallest/average/largest data: %zu/%zu/%zu\n%s" \
+	"Smallest/average/largest padding: %zu/%zu/%zu\n%s" \
+	"Number of free records: %zu\n" \
+	"Smallest/average/largest free records: %zu/%zu/%zu\n%s" \
+	"Number of uncoalesced records: %zu\n" \
+	"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
+	"Toplevel hash used: %u of %u\n" \
+	"Number of chains: %zu\n" \
+	"Number of subhashes: %zu\n" \
+	"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
+	"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
+/* NOTE(review): the two bucket templates below are not referenced anywhere in this file. */
+#define BUCKET_SUMMARY_FORMAT_A					\
+	"Free bucket %zu: total entries %zu.\n"			\
+	"Smallest/average/largest length: %zu/%zu/%zu\n%s"
+#define BUCKET_SUMMARY_FORMAT_B					\
+	"Free bucket %zu-%zu: total entries %zu.\n"		\
+	"Smallest/average/largest length: %zu/%zu/%zu\n%s"
+/* Dimensions passed to tally_histogram(); HISTO_HEIGHT is also the argument of tally_new(). */
+#define HISTO_WIDTH 70
+#define HISTO_HEIGHT 20
+/* Build a malloc()ed, human-readable report and store it in *summary (it is not freed here). */
+enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
+			   enum tdb_summary_flags flags,
+			   char **summary)
+{
+	tdb_len_t len;
+	struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
+		*chains;
+	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg;
+	enum TDB_ERROR ecode;
+
+	hashesg = freeg = keysg = datag = extrag = uncoalg = NULL;
+
+	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+	if (ecode != TDB_SUCCESS) {
+		return tdb->last_error = ecode;
+	}
+
+	ecode = tdb_lock_expand(tdb, F_RDLCK);
+	if (ecode != TDB_SUCCESS) {
+		tdb_allrecord_unlock(tdb, F_RDLCK);
+		return tdb->last_error = ecode;
+	}
+
+	/* Start stats off empty. */
+	ftables = tally_new(HISTO_HEIGHT);
+	hashes = tally_new(HISTO_HEIGHT);
+	freet = tally_new(HISTO_HEIGHT);
+	keys = tally_new(HISTO_HEIGHT);
+	data = tally_new(HISTO_HEIGHT);
+	extra = tally_new(HISTO_HEIGHT);
+	uncoal = tally_new(HISTO_HEIGHT);
+	chains = tally_new(HISTO_HEIGHT);
+	if (!ftables || !hashes || !freet || !keys || !data || !extra
+	    || !uncoal || !chains) {
+		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				   "tdb_summary: failed to allocate"
+				   " tally structures");
+		goto unlock;
+	}
+
+	ecode = summarize(tdb, hashes, ftables, freet, keys, data, extra,
+			  uncoal, chains);
+	if (ecode != TDB_SUCCESS) {
+		goto unlock;
+	}
+
+	if (flags & TDB_SUMMARY_HISTOGRAMS) {
+		hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT);
+		freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT);
+		keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT);
+		datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT);
+		extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT);
+		uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT);
+	}
+
+	/* 20 is max length of a %llu. */
+	len = strlen(SUMMARY_FORMAT) + 33*20 + 1
+		+ (hashesg ? strlen(hashesg) : 0)
+		+ (freeg ? strlen(freeg) : 0)
+		+ (keysg ? strlen(keysg) : 0)
+		+ (datag ? strlen(datag) : 0)
+		+ (extrag ? strlen(extrag) : 0)
+		+ (uncoalg ? strlen(uncoalg) : 0);
+
+	*summary = malloc(len);
+	if (!*summary) {
+		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				   "tdb_summary: failed to allocate string");
+		goto unlock;
+	}
+	/* len is only an estimate, so bound the write to the buffer that was allocated. */
+	snprintf(*summary, len, SUMMARY_FORMAT,
+		(size_t)tdb->file->map_size,
+		tally_total(keys, NULL) + tally_total(data, NULL),
+		tally_num(keys),
+		tally_min(keys), tally_mean(keys), tally_max(keys),
+		keysg ? keysg : "",
+		tally_min(data), tally_mean(data), tally_max(data),
+		datag ? datag : "",
+		tally_min(extra), tally_mean(extra), tally_max(extra),
+		extrag ? extrag : "",
+		tally_num(freet),
+		tally_min(freet), tally_mean(freet), tally_max(freet),
+		freeg ? freeg : "",
+		tally_total(uncoal, NULL),
+		tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal),
+		uncoalg ? uncoalg : "",
+		(unsigned)count_hash(tdb, offsetof(struct tdb_header,
+						   hashtable),
+				     TDB_TOPLEVEL_HASH_BITS),
+		1 << TDB_TOPLEVEL_HASH_BITS,
+		tally_num(chains),
+		tally_num(hashes),
+		tally_min(hashes), tally_mean(hashes), tally_max(hashes),
+		hashesg ? hashesg : "",
+		tally_total(keys, NULL) * 100.0 / tdb->file->map_size,
+		tally_total(data, NULL) * 100.0 / tdb->file->map_size,
+		tally_total(extra, NULL) * 100.0 / tdb->file->map_size,
+		tally_total(freet, NULL) * 100.0 / tdb->file->map_size,
+		(tally_num(keys) + tally_num(freet) + tally_num(hashes))
+		* sizeof(struct tdb_used_record) * 100.0 / tdb->file->map_size,
+		tally_num(ftables) * sizeof(struct tdb_freetable)
+		* 100.0 / tdb->file->map_size,
+		(tally_num(hashes)
+		 * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
+		 + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
+		 + sizeof(struct tdb_chain) * tally_num(chains))
+		* 100.0 / tdb->file->map_size);
+
+unlock:
+	free(hashesg);
+	free(freeg);
+	free(keysg);
+	free(datag);
+	free(extrag);
+	free(uncoalg);
+	free(hashes);
+	free(freet);
+	free(keys);
+	free(data);
+	free(extra);
+	free(uncoal);
+	free(ftables);
+	free(chains);
+
+	tdb_allrecord_unlock(tdb, F_RDLCK);
+	tdb_unlock_expand(tdb, F_RDLCK);
+	return tdb->last_error = ecode;
+}
diff --git a/lib/tdb2/tdb.c b/lib/tdb2/tdb.c
new file mode 100644
index 0000000000..753ccb0c8b
--- /dev/null
+++ b/lib/tdb2/tdb.c
@@ -0,0 +1,486 @@
+#include "private.h"
+#ifndef _SAMBA_BUILD_
+#include <ccan/asprintf/asprintf.h>
+#include <stdarg.h>
+#endif
+/* Re-encode rec for new key/data lengths (same total room) and write the header back at off. */
+static enum TDB_ERROR update_rec_hdr(struct tdb_context *tdb,
+				     tdb_off_t off,
+				     tdb_len_t keylen,
+				     tdb_len_t datalen,
+				     struct tdb_used_record *rec,
+				     uint64_t h)
+{
+	/* Data bytes plus padding currently recorded in the header. */
+	const uint64_t room = rec_data_length(rec) + rec_extra_padding(rec);
+	enum TDB_ERROR status;
+
+	status = set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
+			    keylen + room, h);
+	if (status != TDB_SUCCESS)
+		return status;
+	return tdb_write_convert(tdb, off, rec, sizeof(*rec));
+}
+/* Store key/dbuf in a newly allocated record, free the old one (if old_off) and fix the hash. */
+static enum TDB_ERROR replace_data(struct tdb_context *tdb,
+				   struct hash_info *h,
+				   struct tdb_data key, struct tdb_data dbuf,
+				   tdb_off_t old_off, tdb_len_t old_room,
+				   bool growing)
+{
+	tdb_off_t new_off;
+	enum TDB_ERROR ecode;
+
+	/* Allocate a new record. */
+	new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
+			growing);
+	if (TDB_OFF_IS_ERR(new_off)) {
+		return new_off;
+	}
+
+	/* We didn't like the existing one: remove it. */
+	if (old_off) {
+		tdb->stats.frees++;
+		ecode = add_free_record(tdb, old_off,
+					sizeof(struct tdb_used_record)
+					+ key.dsize + old_room,
+					TDB_LOCK_WAIT, true);
+		if (ecode == TDB_SUCCESS)
+			ecode = replace_in_hash(tdb, h, new_off);
+	} else {
+		ecode = add_to_hash(tdb, h, new_off);
+	}
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	/* Step past the record header: the key bytes are written first, the data right after. */
+	new_off += sizeof(struct tdb_used_record);
+	ecode = tdb->methods->twrite(tdb, new_off, key.dptr, key.dsize);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	new_off += key.dsize;
+	ecode = tdb->methods->twrite(tdb, new_off, dbuf.dptr, dbuf.dsize);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (tdb->flags & TDB_SEQNUM)
+		tdb_inc_seqnum(tdb);
+
+	return TDB_SUCCESS;
+}
+/* Write dbuf at off; when extra is non-zero, also write a single zero byte right after it. */
+static enum TDB_ERROR update_data(struct tdb_context *tdb,
+				  tdb_off_t off,
+				  struct tdb_data dbuf,
+				  tdb_len_t extra)
+{
+	enum TDB_ERROR ecode;
+
+	ecode = tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize);
+	if (ecode == TDB_SUCCESS && extra) {
+		/* Put a zero in; future versions may append other data. */
+		ecode = tdb->methods->twrite(tdb, off + dbuf.dsize, "", 1);
+	}
+	if (tdb->flags & TDB_SEQNUM)	/* NOTE(review): runs even when a write above failed */
+		tdb_inc_seqnum(tdb);
+
+	return ecode;
+}
+/* Store dbuf under key according to flag (TDB_INSERT, TDB_MODIFY, or replace-style otherwise). */
+enum TDB_ERROR tdb_store(struct tdb_context *tdb,
+			 struct tdb_data key, struct tdb_data dbuf, int flag)
+{
+	struct hash_info h;
+	tdb_off_t off;
+	tdb_len_t old_room = 0;
+	struct tdb_used_record rec;
+	enum TDB_ERROR ecode;
+
+	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	/* Now we have lock on this hash bucket. */
+	if (flag == TDB_INSERT) {
+		if (off) {
+			ecode = TDB_ERR_EXISTS;
+			goto out;
+		}
+	} else {
+		if (off) {
+			old_room = rec_data_length(&rec)
+				+ rec_extra_padding(&rec);
+			if (old_room >= dbuf.dsize) {
+				/* Can modify in-place.  Easy! */
+				ecode = update_rec_hdr(tdb, off,
+						       key.dsize, dbuf.dsize,
+						       &rec, h.h);
+				if (ecode != TDB_SUCCESS) {
+					goto out;
+				}
+				ecode = update_data(tdb,
+						    off + sizeof(rec)
+						    + key.dsize, dbuf,
+						    old_room - dbuf.dsize);
+				if (ecode != TDB_SUCCESS) {
+					goto out;
+				}
+				tdb_unlock_hashes(tdb, h.hlock_start,
+						  h.hlock_range, F_WRLCK);
+				return tdb->last_error = TDB_SUCCESS;
+			}
+		} else {
+			if (flag == TDB_MODIFY) {
+				/* if the record doesn't exist and we
+				   are in TDB_MODIFY mode then we should fail
+				   the store */
+				ecode = TDB_ERR_NOEXIST;
+				goto out;
+			}
+		}
+	}
+
+	/* If we didn't use the old record, this implies we're growing. */
+	ecode = replace_data(tdb, &h, key, dbuf, off, old_room, off != 0);
+out:
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return tdb->last_error = ecode;
+}
+/* Append dbuf to the record stored under key, creating the record when the key is absent. */
+enum TDB_ERROR tdb_append(struct tdb_context *tdb,
+			  struct tdb_data key, struct tdb_data dbuf)
+{
+	struct hash_info h;
+	tdb_off_t off;
+	struct tdb_used_record rec;
+	tdb_len_t old_room = 0, old_dlen;
+	unsigned char *newdata;
+	struct tdb_data new_dbuf;
+	enum TDB_ERROR ecode;
+
+	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (off) {
+		old_dlen = rec_data_length(&rec);
+		old_room = old_dlen + rec_extra_padding(&rec);
+
+		/* Fast path: can append in place. */
+		if (rec_extra_padding(&rec) >= dbuf.dsize) {
+			ecode = update_rec_hdr(tdb, off, key.dsize,
+					       old_dlen + dbuf.dsize, &rec,
+					       h.h);
+			if (ecode != TDB_SUCCESS) {
+				goto out;
+			}
+
+			off += sizeof(rec) + key.dsize + old_dlen;
+			ecode = update_data(tdb, off, dbuf,
+					    rec_extra_padding(&rec));
+			goto out;
+		}
+
+		/* Slow path: the buffer holds old data plus new data only (never the key). */
+		newdata = malloc(old_dlen + dbuf.dsize);
+		if (!newdata) {
+			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					   "tdb_append:"
+					   " failed to allocate %zu bytes",
+					   (size_t)(old_dlen
+						    + dbuf.dsize));
+			goto out;
+		}
+		ecode = tdb->methods->tread(tdb, off + sizeof(rec) + key.dsize,
+					    newdata, old_dlen);
+		if (ecode != TDB_SUCCESS) {
+			goto out_free_newdata;
+		}
+		memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize);
+		new_dbuf.dptr = newdata;
+		new_dbuf.dsize = old_dlen + dbuf.dsize;
+	} else {
+		newdata = NULL;
+		new_dbuf = dbuf;
+	}
+
+	/* If they're using tdb_append(), it implies they're growing record. */
+	ecode = replace_data(tdb, &h, key, new_dbuf, off, old_room, true);
+
+out_free_newdata:
+	free(newdata);
+out:
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return tdb->last_error = ecode;
+}
+/* Look key up and, when found, return a copy of its data (from tdb_alloc_read()) in *data. */
+enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
+			 struct tdb_data *data)
+{
+	tdb_off_t off;
+	struct tdb_used_record rec;
+	struct hash_info h;
+	enum TDB_ERROR ecode;
+
+	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (!off) {
+		ecode = TDB_ERR_NOEXIST;	/* *data is left untouched in this case */
+	} else {
+		data->dsize = rec_data_length(&rec);
+		data->dptr = tdb_alloc_read(tdb, off + sizeof(rec) + key.dsize,
+					    data->dsize);
+		if (TDB_PTR_IS_ERR(data->dptr)) {
+			ecode = TDB_PTR_ERR(data->dptr);	/* data->dptr keeps the error-encoded pointer */
+		} else
+			ecode = TDB_SUCCESS;
+	}
+
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	return tdb->last_error = ecode;
+}
+/* True iff key is present.  A failed lookup returns false with the code in tdb->last_error. */
+bool tdb_exists(struct tdb_context *tdb, TDB_DATA key)
+{
+	struct hash_info hinfo;
+	struct tdb_used_record hdr;
+	tdb_off_t found;
+
+	found = find_and_lock(tdb, key, F_RDLCK, &hinfo, &hdr, NULL);
+	if (TDB_OFF_IS_ERR(found)) {
+		tdb->last_error = found;
+		return false;
+	}
+	/* Only the offset is needed, so the hash lock can be dropped straight away. */
+	tdb_unlock_hashes(tdb, hinfo.hlock_start, hinfo.hlock_range, F_RDLCK);
+	tdb->last_error = TDB_SUCCESS;
+	return found != 0;
+}
+
+enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key)
+{
+	tdb_off_t off;
+	struct tdb_used_record rec;
+	struct hash_info h;
+	enum TDB_ERROR ecode;
+	/* Write-lock the key's hash range: we are about to modify it. */
+	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (!off) {
+		ecode = TDB_ERR_NOEXIST;
+		goto unlock;
+	}
+
+	ecode = delete_from_hash(tdb, &h);
+	if (ecode != TDB_SUCCESS) {
+		goto unlock;
+	}
+
+	/* Free the deleted entry: header + key + data + padding. */
+	tdb->stats.frees++;
+	ecode = add_free_record(tdb, off,
+				sizeof(struct tdb_used_record)
+				+ rec_key_length(&rec)
+				+ rec_data_length(&rec)
+				+ rec_extra_padding(&rec),
+				TDB_LOCK_WAIT, true);
+	/* NOTE(review): seqnum is bumped even when add_free_record() failed. */
+	if (tdb->flags & TDB_SEQNUM)
+		tdb_inc_seqnum(tdb);
+
+unlock:
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return tdb->last_error = ecode;
+}
+
+unsigned int tdb_get_flags(struct tdb_context *tdb)
+{
+	return tdb->flags;	/* see tdb_add_flag()/tdb_remove_flag() */
+}
+
+void tdb_add_flag(struct tdb_context *tdb, unsigned flag)
+{
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_add_flag: internal db");
+		return;	/* flags of an internal db are left untouched */
+	}
+	switch (flag) {	/* matched by value: one flag per call, not a mask */
+	case TDB_NOLOCK:
+		tdb->flags |= TDB_NOLOCK;
+		break;
+	case TDB_NOMMAP:
+		tdb->flags |= TDB_NOMMAP;
+		tdb_munmap(tdb->file);	/* presumably drops the live mapping */
+		break;
+	case TDB_NOSYNC:
+		tdb->flags |= TDB_NOSYNC;
+		break;
+	case TDB_SEQNUM:
+		tdb->flags |= TDB_SEQNUM;
+		break;
+	case TDB_ALLOW_NESTING:
+		tdb->flags |= TDB_ALLOW_NESTING;
+		break;
+	default:	/* unsupported flag: log a usage error, change nothing */
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_add_flag: Unknown flag %u",
+					     flag);
+	}
+}
+
+void tdb_remove_flag(struct tdb_context *tdb, unsigned flag)
+{
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_remove_flag: internal db");
+		return;	/* flags of an internal db are left untouched */
+	}
+	switch (flag) {	/* matched by value: one flag per call, not a mask */
+	case TDB_NOLOCK:
+		tdb->flags &= ~TDB_NOLOCK;
+		break;
+	case TDB_NOMMAP:
+		tdb->flags &= ~TDB_NOMMAP;
+		tdb_mmap(tdb);	/* presumably maps the file again; TODO confirm */
+		break;
+	case TDB_NOSYNC:
+		tdb->flags &= ~TDB_NOSYNC;
+		break;
+	case TDB_SEQNUM:
+		tdb->flags &= ~TDB_SEQNUM;
+		break;
+	case TDB_ALLOW_NESTING:
+		tdb->flags &= ~TDB_ALLOW_NESTING;
+		break;
+	default:	/* unsupported flag: log a usage error, change nothing */
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_remove_flag: Unknown flag %u",
+					     flag);
+	}
+}
+
+const char *tdb_errorstr(enum TDB_ERROR ecode)
+{
+	/* No default case: gcc warns if an enum value is missed, so use that. */
+	switch (ecode) {
+	case TDB_SUCCESS: return "Success";
+	case TDB_ERR_CORRUPT: return "Corrupt database";
+	case TDB_ERR_IO: return "IO Error";
+	case TDB_ERR_LOCK: return "Locking error";
+	case TDB_ERR_OOM: return "Out of memory";
+	case TDB_ERR_EXISTS: return "Record exists";
+	case TDB_ERR_EINVAL: return "Invalid parameter";
+	case TDB_ERR_NOEXIST: return "Record does not exist";
+	case TDB_ERR_RDONLY: return "write not permitted";
+	}
+	return "Invalid error code";	/* ecode matched none of the cases */
+}
+
+enum TDB_ERROR tdb_error(struct tdb_context *tdb)
+{
+	return tdb->last_error;	/* as stored by the API calls on return */
+}
+
+enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb,
+			       enum TDB_ERROR ecode,
+			       enum tdb_log_level level,
+			       const char *fmt, ...)
+{
+	char *message;
+	va_list ap;
+	int len;	/* must be signed: vasprintf() returns -1 on failure */
+	/* tdb_open paths care about errno, so save it. */
+	int saved_errno = errno;
+
+	if (!tdb->log_fn)
+		return ecode;
+	/* Format into a malloc()ed buffer; len < 0 means that failed. */
+	va_start(ap, fmt);
+	len = vasprintf(&message, fmt, ap);
+	va_end(ap);
+
+	if (len < 0) {
+		tdb->log_fn(tdb, TDB_LOG_ERROR,
+			    "out of memory formatting message:", tdb->log_data);
+		tdb->log_fn(tdb, level, fmt, tdb->log_data);	/* raw format */
+	} else {
+		tdb->log_fn(tdb, level, message, tdb->log_data);
+		free(message);
+	}
+	errno = saved_errno;
+	return ecode;	/* handed back so callers can "return tdb_logerr(...)" */
+}
+
+enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
+				 TDB_DATA key,
+				 enum TDB_ERROR (*parse)(TDB_DATA k,
+							 TDB_DATA d,
+							 void *data),
+				 void *data)
+{
+	tdb_off_t off;
+	struct tdb_used_record rec;
+	struct hash_info h;
+	enum TDB_ERROR ecode;
+	/* The read lock is held across the parse() callback below. */
+	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (!off) {
+		ecode = TDB_ERR_NOEXIST;
+	} else {
+		const void *dptr;
+		dptr = tdb_access_read(tdb, off + sizeof(rec) + key.dsize,
+				       rec_data_length(&rec), false);
+		if (TDB_PTR_IS_ERR(dptr)) {
+			ecode = TDB_PTR_ERR(dptr);
+		} else {
+			TDB_DATA d = tdb_mkdata(dptr, rec_data_length(&rec));
+			/* Whatever parse() returns is what we return. */
+			ecode = parse(key, d, data);
+			tdb_access_release(tdb, dptr);
+		}
+	}
+
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	return tdb->last_error = ecode;
+}
+
+const char *tdb_name(const struct tdb_context *tdb)
+{
+	return tdb->name;	/* may be NULL for TDB_INTERNAL (see tdb2.h) */
+}
+
+int64_t tdb_get_seqnum(struct tdb_context *tdb)
+{
+	tdb_off_t off = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
+	if (TDB_OFF_IS_ERR(off))
+		tdb->last_error = off;
+	else
+		tdb->last_error = TDB_SUCCESS;
+	return off;	/* the seqnum, or the error code stored just above */
+}
+
+
+int tdb_fd(const struct tdb_context *tdb)
+{
+	return tdb->file->fd;	/* tdb2.h documents -1 for TDB_INTERNAL */
+}
diff --git a/lib/tdb2/tdb2.h b/lib/tdb2/tdb2.h
new file mode 100644
index 0000000000..3f80793d76
--- /dev/null
+++ b/lib/tdb2/tdb2.h
@@ -0,0 +1,848 @@
+#ifndef CCAN_TDB2_H
+#define CCAN_TDB2_H
+
+/*
+   TDB version 2: trivial database library
+
+   Copyright (C) Andrew Tridgell 1999-2004
+   Copyright (C) Rusty Russell 2010-2011
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifndef _SAMBA_BUILD_
+/* For mode_t */
+#include <sys/types.h>
+/* For O_* flags. */
+#include <sys/stat.h>
+/* For sig_atomic_t. */
+#include <signal.h>
+/* For uint64_t */
+#include <stdint.h>
+/* For bool */
+#include <stdbool.h>
+/* For memcmp */
+#include <string.h>
+#else
+#include "replace.h"
+#endif
+#include <ccan/compiler/compiler.h>
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <ccan/cast/cast.h>
+
+union tdb_attribute;
+struct tdb_context;
+
+/**
+ * tdb_open - open a database file
+ * @name: the file name (can be NULL if flags contains TDB_INTERNAL)
+ * @tdb_flags: options for this database
+ * @open_flags: flags argument for tdb's open() call.
+ * @mode: mode argument for tdb's open() call.
+ * @attributes: linked list of extra attributes for this tdb.
+ *
+ * This call opens (and potentially creates) a database file.
+ * Multiple processes can have the TDB file open at once.
+ *
+ * On failure it will return NULL, and set errno: it may also call
+ * any log attribute found in @attributes.
+ *
+ * See also:
+ *	union tdb_attribute
+ */
+struct tdb_context *tdb_open(const char *name, int tdb_flags,
+			     int open_flags, mode_t mode,
+			     union tdb_attribute *attributes);
+
+
+/* flags for tdb_open() */
+#define TDB_DEFAULT 0 /* just a readability place holder */
+#define TDB_INTERNAL 2 /* don't store on disk */
+#define TDB_NOLOCK   4 /* don't do any locking */
+#define TDB_NOMMAP   8 /* don't use mmap */
+#define TDB_CONVERT 16 /* convert endian */
+#define TDB_NOSYNC   64 /* don't use synchronous transactions */
+#define TDB_SEQNUM   128 /* maintain a sequence number */
+#define TDB_ALLOW_NESTING   256 /* fake nested transactions */
+
+/**
+ * tdb_close - close and free a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This always succeeds, in that @tdb is unusable after this call.  But if
+ * some unexpected error occurred while closing, it will return non-zero
+ * (the only clue as to cause will be via the log attribute).
+ */
+int tdb_close(struct tdb_context *tdb);
+
+/**
+ * struct tdb_data - representation of keys or values.
+ * @dptr: the data pointer
+ * @dsize: the size of the data pointed to by dptr.
+ *
+ * This is the "blob" representation of keys and data used by TDB.
+ */
+typedef struct tdb_data {
+	unsigned char *dptr;
+	size_t dsize;
+} TDB_DATA;
+
+/**
+ * enum TDB_ERROR - error returns for TDB
+ *
+ * See Also:
+ *	tdb_errorstr()
+ */
+enum TDB_ERROR {
+	TDB_SUCCESS	= 0,	/* No error. */
+	TDB_ERR_CORRUPT = -1,	/* We read the db, and it was bogus. */
+	TDB_ERR_IO	= -2,	/* We couldn't read/write the db. */
+	TDB_ERR_LOCK	= -3,	/* Locking failed. */
+	TDB_ERR_OOM	= -4,	/* Out of Memory. */
+	TDB_ERR_EXISTS	= -5,	/* The key already exists. */
+	TDB_ERR_NOEXIST	= -6,	/* The key does not exist. */
+	TDB_ERR_EINVAL	= -7,	/* You're using it wrong. */
+	TDB_ERR_RDONLY	= -8,	/* The database is read-only. */
+	TDB_ERR_LAST = TDB_ERR_RDONLY
+};
+
+/**
+ * tdb_store - store a key/value pair in a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key
+ * @dbuf: the data to associate with the key.
+ * @flag: TDB_REPLACE, TDB_INSERT or TDB_MODIFY.
+ *
+ * This inserts (or overwrites) a key/value pair in the TDB.  If flag
+ * is TDB_REPLACE, it doesn't matter whether the key exists or not;
+ * TDB_INSERT means it must not exist (returns TDB_ERR_EXISTS otherwise),
+ * and TDB_MODIFY means it must exist (returns TDB_ERR_NOEXIST otherwise).
+ *
+ * On success, this returns TDB_SUCCESS.
+ *
+ * See also:
+ *	tdb_fetch, tdb_transaction_start, tdb_append, tdb_delete.
+ */
+enum TDB_ERROR tdb_store(struct tdb_context *tdb,
+			 struct tdb_data key,
+			 struct tdb_data dbuf,
+			 int flag);
+
+/* flags to tdb_store() */
+#define TDB_REPLACE 1		/* Store whether or not the key exists */
+#define TDB_INSERT 2 		/* Don't overwrite an existing entry */
+#define TDB_MODIFY 3		/* Don't create a new entry          */
+
+/**
+ * tdb_fetch - fetch a value from a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key
+ * @data: pointer to data.
+ *
+ * This looks up a key in the database and sets it in @data.
+ *
+ * If it returns TDB_SUCCESS, the key was found: it is your
+ * responsibility to call free() on @data->dptr.
+ *
+ * Otherwise, it returns an error (usually, TDB_ERR_NOEXIST) and @data is
+ * undefined.
+ */
+enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
+			 struct tdb_data *data);
+
+/**
+ * tdb_errorstr - map the tdb error onto a constant readable string
+ * @ecode: the enum TDB_ERROR to map.
+ *
+ * This is useful for displaying errors to users.
+ */
+const char *tdb_errorstr(enum TDB_ERROR ecode);
+
+/**
+ * tdb_append - append a value to a key/value pair in a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key
+ * @dbuf: the data to append.
+ *
+ * This is equivalent to fetching a record, reallocating .dptr to add the
+ * data, and writing it back, only it's much more efficient.  If the key
+ * doesn't exist, it's equivalent to tdb_store (with an additional hint that
+ * you expect to expand the record in future).
+ *
+ * See Also:
+ *	tdb_fetch(), tdb_store()
+ */
+enum TDB_ERROR tdb_append(struct tdb_context *tdb,
+			  struct tdb_data key, struct tdb_data dbuf);
+
+/**
+ * tdb_delete - delete a key from a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to delete.
+ *
+ * Returns TDB_SUCCESS on success, or an error (usually TDB_ERR_NOEXIST).
+ *
+ * See Also:
+ *	tdb_fetch(), tdb_store()
+ */
+enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key);
+
+/**
+ * tdb_exists - does a key exist in the database?
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to search for.
+ *
+ * Returns true if it exists, or false if it doesn't or any other error.
+ */
+bool tdb_exists(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_deq - are struct tdb_data equal (same dsize, same bytes)?
+ * @a: one struct tdb_data
+ * @b: another struct tdb_data (dptr reaches memcmp even when dsize is 0)
+ */
+static inline bool tdb_deq(struct tdb_data a, struct tdb_data b)
+{
+	return a.dsize == b.dsize && memcmp(a.dptr, b.dptr, a.dsize) == 0;
+}
+
+/**
+ * tdb_mkdata - make a struct tdb_data from const data
+ * @p: the constant pointer
+ * @len: the length
+ *
+ * As the dptr member of struct tdb_data is not constant, you need to
+ * cast it.  This function keeps those casts in one place, as well as
+ * suppressing the warning some compilers give when casting away a
+ * qualifier (eg. gcc with -Wcast-qual).
+ */
+static inline struct tdb_data tdb_mkdata(const void *p, size_t len)
+{
+	struct tdb_data d;
+	d.dptr = cast_const(void *, p);	/* pointer is stored, not copied */
+	d.dsize = len;
+	return d;
+}
+
+/**
+ * tdb_transaction_start - start a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This begins a series of atomic operations.  Other processes will be able
+ * to read the tdb, but not alter it (they will block), nor will they see
+ * any changes until tdb_transaction_commit() is called.
+ *
+ * Note that if the TDB_ALLOW_NESTING flag is set, a tdb_transaction_start()
+ * within a transaction will succeed, but it's not a real transaction:
+ * (1) An inner transaction which is committed is not actually committed until
+ *     the outer transaction is; if the outer transaction is cancelled, the
+ *     inner ones are discarded.
+ * (2) tdb_transaction_cancel() marks the outer transaction as having an error,
+ *     so the final tdb_transaction_commit() will fail.
+ * (3) the outer transaction will see the results of the inner transaction.
+ *
+ * See Also:
+ *	tdb_transaction_cancel, tdb_transaction_commit.
+ */
+enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb);
+
+/**
+ * tdb_transaction_cancel - abandon a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This aborts a transaction, discarding any changes which were made.
+ * tdb_close() does this implicitly.
+ */
+void tdb_transaction_cancel(struct tdb_context *tdb);
+
+/**
+ * tdb_transaction_commit - commit a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This completes a transaction, writing any changes which were made.
+ *
+ * fsync() is used to commit the transaction (unless TDB_NOSYNC is set),
+ * making it robust against machine crashes, but very slow compared to
+ * other TDB operations.
+ *
+ * A failure can only be caused by unexpected errors (eg. I/O or
+ * memory); there is no point looping on transaction failure.
+ *
+ * See Also:
+ *	tdb_transaction_prepare_commit()
+ */
+enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb);
+
+/**
+ * tdb_transaction_prepare_commit - prepare to commit a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This ensures we have the resources to commit a transaction (using
+ * tdb_transaction_commit): if this succeeds then a transaction will only
+ * fail if the write() or fsync() calls fail.
+ *
+ * If this fails you must still call tdb_transaction_cancel() to cancel
+ * the transaction.
+ *
+ * See Also:
+ *	tdb_transaction_commit()
+ */
+enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb);
+
+/**
+ * tdb_traverse - traverse a TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @fn: the function to call for every key/value pair (or NULL)
+ * @p: the pointer to hand to @fn
+ *
+ * This walks the TDB until all the keys have been traversed, or @fn
+ * returns non-zero.  If the traverse function or other processes are
+ * changing data or adding or deleting keys, the traverse may be
+ * unreliable: keys may be skipped or (rarely) visited twice.
+ *
+ * There is one specific exception: the special case of deleting the
+ * current key does not undermine the reliability of the traversal.
+ *
+ * On success, returns the number of keys iterated.  On error returns
+ * a negative enum TDB_ERROR value.
+ */
+#define tdb_traverse(tdb, fn, p)					\
+	tdb_traverse_(tdb, typesafe_cb_preargs(int, void *, (fn), (p),	\
+					       struct tdb_context *,	\
+					       TDB_DATA, TDB_DATA), (p))
+
+int64_t tdb_traverse_(struct tdb_context *tdb,
+		      int (*fn)(struct tdb_context *,
+				TDB_DATA, TDB_DATA, void *), void *p);
+
+/**
+ * tdb_parse_record - operate directly on data in the database.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key whose record we should hand to @parse
+ * @parse: the function to call for the data
+ * @data: the private pointer to hand to @parse (types must match).
+ *
+ * This avoids a copy for many cases, by handing you a pointer into
+ * the memory-mapped database.  It also locks the record to prevent
+ * other accesses at the same time.
+ *
+ * Do not alter the data handed to parse()!
+ */
+#define tdb_parse_record(tdb, key, parse, data)				\
+	tdb_parse_record_((tdb), (key),					\
+			  typesafe_cb_preargs(enum TDB_ERROR, void *,	\
+					      (parse), (data),		\
+					      TDB_DATA, TDB_DATA), (data))
+
+enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
+				 TDB_DATA key,
+				 enum TDB_ERROR (*parse)(TDB_DATA k,
+							 TDB_DATA d,
+							 void *data),
+				 void *data);
+
+/**
+ * tdb_get_seqnum - get a database sequence number
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns a sequence number: any change to the database from a
+ * tdb context opened with the TDB_SEQNUM flag will cause that number
+ * to increment.  Note that the incrementing is unreliable (it is done
+ * without locking), so this is only useful as an optimization.
+ *
+ * For example, you may have a regular database backup routine which
+ * does not operate if the sequence number is unchanged.  In the
+ * unlikely event of a failed increment, it will be backed up next
+ * time anyway.
+ *
+ * Returns an enum TDB_ERROR (ie. negative) on error.
+ */
+int64_t tdb_get_seqnum(struct tdb_context *tdb);
+
+/**
+ * tdb_firstkey - get the "first" key in a TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: pointer to key.
+ *
+ * This returns an arbitrary key in the database; with tdb_nextkey() it allows
+ * open-coded traversal of the database, though it is slightly less efficient
+ * than tdb_traverse.
+ *
+ * It is your responsibility to free @key->dptr on success.
+ *
+ * Returns TDB_ERR_NOEXIST if the database is empty.
+ */
+enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key);
+
+/**
+ * tdb_nextkey - get the "next" key in a TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: a key returned by tdb_firstkey() or tdb_nextkey().
+ *
+ * This returns another key in the database; it will free @key.dptr for
+ * your convenience.
+ *
+ * Returns TDB_ERR_NOEXIST if there are no more keys.
+ */
+enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key);
+
+/**
+ * tdb_chainlock - lock a record in the TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to lock.
+ *
+ * This prevents any access occurring to a group of keys including @key,
+ * even if @key does not exist.  This allows primitive atomic updates of
+ * records without using transactions.
+ *
+ * You cannot begin a transaction while holding a tdb_chainlock(), nor can
+ * you do any operations on any other keys in the database.  This also means
+ * that you cannot hold more than one tdb_chainlock() at a time.
+ *
+ * See Also:
+ *	tdb_chainunlock()
+ */
+enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_chainunlock - unlock a record in the TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to unlock.
+ *
+ * The key must have previously been locked by tdb_chainlock().
+ */
+void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_chainlock_read - lock a record in the TDB, for reading
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to lock.
+ *
+ * This prevents any changes from occurring to a group of keys including @key,
+ * even if @key does not exist.  This allows primitive atomic updates of
+ * records without using transactions.
+ *
+ * You cannot begin a transaction while holding a tdb_chainlock_read(), nor can
+ * you do any operations on any other keys in the database.  This also means
+ * that you cannot hold more than one tdb_chainlock()/read() at a time.
+ *
+ * See Also:
+ *	tdb_chainlock()
+ */
+enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_chainunlock_read - unlock a record in the TDB for reading
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to unlock.
+ *
+ * The key must have previously been locked by tdb_chainlock_read().
+ */
+void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_lockall - lock the entire TDB
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * You cannot hold a tdb_chainlock while calling this.  It nests, so you
+ * must call tdb_unlockall as many times as you call tdb_lockall.
+ */
+enum TDB_ERROR tdb_lockall(struct tdb_context *tdb);
+
+/**
+ * tdb_unlockall - unlock the entire TDB
+ * @tdb: the tdb context returned from tdb_open()
+ */
+void tdb_unlockall(struct tdb_context *tdb);
+
+/**
+ * tdb_lockall_read - lock the entire TDB for reading
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This prevents others writing to the database, eg. tdb_delete, tdb_store,
+ * tdb_append, but not tdb_fetch.
+ *
+ * You cannot hold a tdb_chainlock while calling this.  It nests, so you
+ * must call tdb_unlockall_read as many times as you call tdb_lockall_read.
+ */
+enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb);
+
+/**
+ * tdb_unlockall_read - unlock the entire TDB for reading
+ * @tdb: the tdb context returned from tdb_open()
+ */
+void tdb_unlockall_read(struct tdb_context *tdb);
+
+/**
+ * tdb_wipe_all - wipe the database clean
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * Completely erase the database.  This is faster than iterating through
+ * each key and doing tdb_delete.
+ */
+enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb);
+
+/**
+ * tdb_check - check a TDB for consistency
+ * @tdb: the tdb context returned from tdb_open()
+ * @check: function to check each key/data pair (or NULL)
+ * @data: argument for @check, must match type.
+ *
+ * This performs a consistency check of the open database, optionally calling
+ * a check() function on each record so you can do your own data consistency
+ * checks as well.  If check() returns an error, that is returned from
+ * tdb_check().
+ *
+ * Returns TDB_SUCCESS or an error.
+ */
+#define tdb_check(tdb, check, data)					\
+	tdb_check_((tdb), typesafe_cb_preargs(enum TDB_ERROR, void *,	\
+					      (check), (data),		\
+					      struct tdb_data,		\
+					      struct tdb_data),		\
+		   (data))
+
+enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
+			  enum TDB_ERROR (*check)(struct tdb_data k,
+						  struct tdb_data d,
+						  void *data),
+			  void *data);
+
+/**
+ * tdb_error - get the last error (not threadsafe)
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * Returns the last error returned by a TDB function.
+ *
+ * This makes porting from TDB1 easier, but note that the last error is not
+ * reliable in threaded programs.
+ */
+enum TDB_ERROR tdb_error(struct tdb_context *tdb);
+
+/**
+ * enum tdb_summary_flags - flags for tdb_summary.
+ */
+enum tdb_summary_flags {
+	TDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. */
+};
+
+/**
+ * tdb_summary - return a string describing the TDB state
+ * @tdb: the tdb context returned from tdb_open()
+ * @flags: flags to control the summary output.
+ * @summary: pointer to string to allocate.
+ *
+ * This returns a developer-readable string describing the overall
+ * state of the tdb, such as the percentage used and sizes of records.
+ * It is designed to provide information about the tdb at a glance
+ * without displaying any keys or data in the database.
+ *
+ * On success, sets @summary to point to a malloc()'ed nul-terminated
+ * multi-line string.  It is your responsibility to free() it.
+ */
+enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
+			   enum tdb_summary_flags flags,
+			   char **summary);
+
+
+/**
+ * tdb_get_flags - return the flags for a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns the flags on the current tdb.  Some of these are caused by
+ * the flags argument to tdb_open(), others (such as TDB_CONVERT) are
+ * intuited.
+ */
+unsigned int tdb_get_flags(struct tdb_context *tdb);
+
+/**
+ * tdb_add_flag - set a flag for a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @flag: TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or TDB_ALLOW_NESTING.
+ *
+ * You can use this to set a flag on the TDB.  You cannot set these flags
+ * on a TDB_INTERNAL tdb.
+ */
+void tdb_add_flag(struct tdb_context *tdb, unsigned flag);
+
+/**
+ * tdb_remove_flag - unset a flag for a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @flag: TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or TDB_ALLOW_NESTING.
+ *
+ * You can use this to clear a flag on the TDB.  You cannot clear flags
+ * on a TDB_INTERNAL tdb.
+ */
+void tdb_remove_flag(struct tdb_context *tdb, unsigned flag);
+
+/**
+ * enum tdb_attribute_type - discriminator for union tdb_attribute.
+ */
+enum tdb_attribute_type {
+	TDB_ATTRIBUTE_LOG = 0,
+	TDB_ATTRIBUTE_HASH = 1,
+	TDB_ATTRIBUTE_SEED = 2,
+	TDB_ATTRIBUTE_STATS = 3,
+	TDB_ATTRIBUTE_OPENHOOK = 4,
+	TDB_ATTRIBUTE_FLOCK = 5
+};
+
+/**
+ * tdb_get_attribute - get an attribute for an existing tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @attr: the union tdb_attribute to fill in.
+ *
+ * This gets an attribute from a TDB which has previously been set (or
+ * may return the default values).  Set @attr.base.attr to the
+ * attribute type you want to get.
+ *
+ * Currently this does not work for TDB_ATTRIBUTE_OPENHOOK.
+ */
+enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
+				 union tdb_attribute *attr);
+
+/**
+ * tdb_set_attribute - set an attribute for an existing tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @attr: the union tdb_attribute to set.
+ *
+ * This sets an attribute on a TDB, overriding any previous attribute
+ * of the same type.  It returns TDB_ERR_EINVAL if the attribute is
+ * unknown or invalid.
+ *
+ * Note that TDB_ATTRIBUTE_HASH, TDB_ATTRIBUTE_SEED and
+ * TDB_ATTRIBUTE_OPENHOOK cannot currently be set after tdb_open.
+ */
+enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
+				 const union tdb_attribute *attr);
+
+/**
+ * tdb_unset_attribute - reset an attribute for an existing tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @type: the attribute type to unset.
+ *
+ * This unsets an attribute on a TDB, returning it to the defaults
+ * (where applicable).
+ *
+ * Note that it only makes sense for TDB_ATTRIBUTE_LOG and TDB_ATTRIBUTE_FLOCK
+ * to be unset.
+ */
+void tdb_unset_attribute(struct tdb_context *tdb,
+			 enum tdb_attribute_type type);
+
+/**
+ * tdb_name - get the name of a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns a copy of the name string, made at tdb_open() time.  If that
+ * argument was NULL (possible for a TDB_INTERNAL db) this will return NULL.
+ *
+ * This is mostly useful for logging.
+ */
+const char *tdb_name(const struct tdb_context *tdb);
+
+/**
+ * tdb_fd - get the file descriptor of a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns the file descriptor for the underlying database file, or -1
+ * for TDB_INTERNAL.
+ */
+int tdb_fd(const struct tdb_context *tdb);
+
+/**
+ * struct tdb_attribute_base - common fields for all tdb attributes.
+ */
+struct tdb_attribute_base {
+	enum tdb_attribute_type attr;
+	union tdb_attribute *next;
+};
+
+/**
+ * enum tdb_log_level - log levels for tdb_attribute_log
+ * @TDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors
+ *		   or internal consistency failures.
+ * @TDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters
+ *		   or writing to a read-only database.
+ * @TDB_LOG_WARNING: used for informational messages on issues which
+ *		     are unusual but handled by TDB internally, such
+ *		     as a failure to mmap or failure to open /dev/urandom.
+ */
+enum tdb_log_level {
+	TDB_LOG_ERROR,
+	TDB_LOG_USE_ERROR,
+	TDB_LOG_WARNING
+};
+
+/**
+ * struct tdb_attribute_log - log function attribute
+ *
+ * This attribute provides a hook for you to log errors.
+ */
+struct tdb_attribute_log {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+	void (*fn)(struct tdb_context *tdb,
+		   enum tdb_log_level level,
+		   const char *message,
+		   void *data);
+	void *data;
+};
+
+/**
+ * struct tdb_attribute_hash - hash function attribute
+ *
+ * This attribute allows you to provide an alternative hash function.
+ * This hash function will be handed keys from the database; it will also
+ * be handed the 8-byte TDB_HASH_MAGIC value for checking the header (the
+ * tdb_open() will fail if the hash value doesn't match the header).
+ *
+ * Note that if your hash function gives different results on
+ * different machine endians, your tdb will no longer work across
+ * different architectures!
+ */
+struct tdb_attribute_hash {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+	uint64_t (*fn)(const void *key, size_t len, uint64_t seed,
+		       void *data);
+	void *data;
+};
+
+/**
+ * struct tdb_attribute_seed - hash function seed attribute
+ *
+ * The hash function seed is normally taken from /dev/urandom (or equivalent)
+ * but can be set manually here.  This is mainly for testing purposes.
+ */
+struct tdb_attribute_seed {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_SEED */
+	uint64_t seed;
+};
+
+/**
+ * struct tdb_attribute_stats - tdb operational statistics
+ *
+ * This attribute records statistics of various low-level TDB operations.
+ * This can be used to assist performance evaluation.  This is only
+ * useful for tdb_get_attribute().
+ *
+ * New fields will be added at the end, hence the "size" argument which
+ * indicates how large your structure is: it must be filled in before
+ * calling tdb_get_attribute(), which will overwrite it with the size
+ * tdb knows about.
+ */
+struct tdb_attribute_stats {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
+	size_t size; /* = sizeof(struct tdb_attribute_stats) */
+	uint64_t allocs;
+	uint64_t   alloc_subhash;
+	uint64_t   alloc_chain;
+	uint64_t   alloc_bucket_exact;
+	uint64_t   alloc_bucket_max;
+	uint64_t   alloc_leftover;
+	uint64_t   alloc_coalesce_tried;
+	uint64_t     alloc_coalesce_iterate_clash;
+	uint64_t     alloc_coalesce_lockfail;
+	uint64_t     alloc_coalesce_race;
+	uint64_t     alloc_coalesce_succeeded;
+	uint64_t       alloc_coalesce_num_merged;
+	uint64_t compares;
+	uint64_t   compare_wrong_bucket;
+	uint64_t   compare_wrong_offsetbits;
+	uint64_t   compare_wrong_keylen;
+	uint64_t   compare_wrong_rechash;
+	uint64_t   compare_wrong_keycmp;
+	uint64_t transactions;
+	uint64_t   transaction_cancel;
+	uint64_t   transaction_nest;
+	uint64_t   transaction_expand_file;
+	uint64_t   transaction_read_direct;
+	uint64_t      transaction_read_direct_fail;
+	uint64_t   transaction_write_direct;
+	uint64_t      transaction_write_direct_fail;
+	uint64_t expands;
+	uint64_t frees;
+	uint64_t locks;
+	uint64_t   lock_lowlevel;
+	uint64_t   lock_nonblock;
+	uint64_t     lock_nonblock_fail;
+};
+
+/**
+ * struct tdb_attribute_openhook - tdb special effects hook for open
+ *
+ * This attribute contains a function to call once we have the OPEN_LOCK
+ * for the tdb, but before we've examined its contents.  If this succeeds,
+ * the tdb will be populated if it's then zero-length.
+ *
+ * This is a hack to allow support for TDB1-style TDB_CLEAR_IF_FIRST
+ * behaviour.
+ */
+struct tdb_attribute_openhook {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_OPENHOOK */
+	enum TDB_ERROR (*fn)(int fd, void *data);
+	void *data;
+};
+
+/**
+ * struct tdb_attribute_flock - tdb special effects hook for file locking
+ *
+ * This attribute contains functions to call to place locks on a file; it can
+ * be used to support non-blocking operations or lock proxying.
+ *
+ * They should return 0 on success, or -1 on failure and set errno.
+ *
+ * An error will be logged on failure if errno is neither EAGAIN nor EINTR
+ * (normally it would only return EAGAIN if waitflag is false, and
+ * loop internally on EINTR).
+ */
+struct tdb_attribute_flock {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */
+	int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *);
+	int (*unlock)(int fd, int rw, off_t off, off_t len, void *);
+	void *data;
+};
+
+/**
+ * union tdb_attribute - tdb attributes.
+ *
+ * This represents all the known attributes.
+ *
+ * See also:
+ *	struct tdb_attribute_log, struct tdb_attribute_hash,
+ *	struct tdb_attribute_seed, struct tdb_attribute_stats,
+ *	struct tdb_attribute_openhook, struct tdb_attribute_flock.
+ */
+union tdb_attribute {
+	struct tdb_attribute_base base;
+	struct tdb_attribute_log log;
+	struct tdb_attribute_hash hash;
+	struct tdb_attribute_seed seed;
+	struct tdb_attribute_stats stats;
+	struct tdb_attribute_openhook openhook;
+	struct tdb_attribute_flock flock;
+};
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* tdb2.h */
diff --git a/lib/tdb2/test/external-agent.c b/lib/tdb2/test/external-agent.c
new file mode 100644
index 0000000000..055b5de736
--- /dev/null
+++ b/lib/tdb2/test/external-agent.c
@@ -0,0 +1,250 @@
+#include "external-agent.h"
+#include "logging.h"
+#include "lock-tracking.h"
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <errno.h>
+#include <ccan/tdb2/private.h>
+#include <ccan/tap/tap.h>
+#include <stdio.h>
+#include <stdarg.h>
+
+static struct tdb_context *tdb;
+
+static enum TDB_ERROR clear_if_first(int fd, void *arg)
+{
+/* We always hold a lock at offset 63, so we can tell if anyone is holding it. */
+	struct flock fl;
+
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_start = 63;
+	fl.l_len = 1;
+
+	if (fcntl(fd, F_SETLK, &fl) == 0) {
+		/* We must be first ones to open it! */
+		diag("agent truncating file!");
+		if (ftruncate(fd, 0) != 0) {
+			return TDB_ERR_IO;
+		}
+	}
+	fl.l_type = F_RDLCK;
+	if (fcntl(fd, F_SETLKW, &fl) != 0) {
+		return TDB_ERR_IO;
+	}
+	return TDB_SUCCESS;
+}
+
+static enum agent_return do_operation(enum operation op, const char *name)
+{
+	TDB_DATA k;
+	enum agent_return ret;
+	TDB_DATA data;
+	enum TDB_ERROR ecode;
+	union tdb_attribute cif;
+
+	if (op != OPEN && op != OPEN_WITH_HOOK && !tdb) {
+		diag("external: No tdb open!");
+		return OTHER_FAILURE;
+	}
+
+	diag("external: %s", operation_name(op));
+
+	k = tdb_mkdata(name, strlen(name));
+
+	locking_would_block = 0;
+	switch (op) {
+	case OPEN:
+		if (tdb) {
+			diag("Already have tdb %s open", tdb->name);
+			return OTHER_FAILURE;
+		}
+		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &tap_log_attr);
+		if (!tdb) {
+			if (!locking_would_block)
+				diag("Opening tdb gave %s", strerror(errno));
+			forget_locking();
+			ret = OTHER_FAILURE;
+		} else
+			ret = SUCCESS;
+		break;
+	case OPEN_WITH_HOOK:
+		if (tdb) {
+			diag("Already have tdb %s open", tdb->name);
+			return OTHER_FAILURE;
+		}
+		cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
+		cif.openhook.base.next = &tap_log_attr;
+		cif.openhook.fn = clear_if_first;
+		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &cif);
+		if (!tdb) {
+			if (!locking_would_block)
+				diag("Opening tdb gave %s", strerror(errno));
+			forget_locking();
+			ret = OTHER_FAILURE;
+		} else
+			ret = SUCCESS;
+		break;
+	case FETCH:
+		ecode = tdb_fetch(tdb, k, &data);
+		if (ecode == TDB_ERR_NOEXIST) {
+			ret = FAILED;
+		} else if (ecode < 0) {
+			ret = OTHER_FAILURE;
+		} else if (!tdb_deq(data, k)) {
+			ret = OTHER_FAILURE;
+			free(data.dptr);
+		} else {
+			ret = SUCCESS;
+			free(data.dptr);
+		}
+		break;
+	case STORE:
+		ret = tdb_store(tdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case TRANSACTION_START:
+		ret = tdb_transaction_start(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case TRANSACTION_COMMIT:
+		ret = tdb_transaction_commit(tdb)==0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case NEEDS_RECOVERY:
+		ret = tdb_needs_recovery(tdb) ? SUCCESS : FAILED;
+		break;
+	case CHECK:
+		ret = tdb_check(tdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case CLOSE:
+		ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
+		tdb = NULL;
+		break;
+	case SEND_SIGNAL:
+		/* We do this async */
+		ret = SUCCESS;
+		break;
+	default:
+		ret = OTHER_FAILURE;
+	}
+
+	if (locking_would_block)
+		ret = WOULD_HAVE_BLOCKED;
+
+	return ret;
+}
+
+struct agent {
+	int cmdfd, responsefd;
+};
+
+/* Do this before doing any tdb stuff.  Return handle, or NULL. */
+struct agent *prepare_external_agent(void)
+{
+	int pid, ret;
+	int command[2], response[2];
+	char name[1+PATH_MAX];
+
+	if (pipe(command) != 0 || pipe(response) != 0)
+		return NULL;
+
+	pid = fork();
+	if (pid < 0)
+		return NULL;
+
+	if (pid != 0) {
+		struct agent *agent = malloc(sizeof(*agent));
+
+		close(command[0]);
+		close(response[1]);
+		agent->cmdfd = command[1];
+		agent->responsefd = response[0];
+		return agent;
+	}
+
+	close(command[1]);
+	close(response[0]);
+
+	/* We want to fail, not block. */
+	nonblocking_locks = true;
+	log_prefix = "external: ";
+	while ((ret = read(command[0], name, sizeof(name))) > 0) {
+		enum agent_return result;
+
+		result = do_operation(name[0], name+1);
+		if (write(response[1], &result, sizeof(result))
+		    != sizeof(result))
+			err(1, "Writing response");
+		if (name[0] == SEND_SIGNAL) {
+			struct timeval ten_ms;
+			ten_ms.tv_sec = 0;
+			ten_ms.tv_usec = 10000;
+			select(0, NULL, NULL, NULL, &ten_ms);
+			kill(getppid(), SIGUSR1);
+		}
+	}
+	exit(0);
+}
+
+/* Ask the external agent to try an operation; OTHER_FAILURE if out of memory. */
+enum agent_return external_agent_operation(struct agent *agent,
+					   enum operation op,
+					   const char *name)
+{
+	enum agent_return res;
+	unsigned int len;
+	char *string;
+
+	if (!name)
+		name = "";
+	len = 1 + strlen(name) + 1;
+	string = malloc(len);
+	if (!string)
+		return OTHER_FAILURE;
+	string[0] = op;
+	strcpy(string+1, name);
+
+	if (write(agent->cmdfd, string, len) != len
+	    || read(agent->responsefd, &res, sizeof(res)) != sizeof(res))
+		res = AGENT_DIED;
+
+	free(string);
+	return res;
+}
+
+const char *agent_return_name(enum agent_return ret)
+{
+	return ret == SUCCESS ? "SUCCESS"
+		: ret == WOULD_HAVE_BLOCKED ? "WOULD_HAVE_BLOCKED"
+		: ret == AGENT_DIED ? "AGENT_DIED"
+		: ret == FAILED ? "FAILED"
+		: ret == OTHER_FAILURE ? "OTHER_FAILURE"
+		: "**INVALID**";
+}
+
+const char *operation_name(enum operation op)
+{
+	switch (op) {
+	case OPEN: return "OPEN";
+	case OPEN_WITH_HOOK: return "OPEN_WITH_HOOK";
+	case FETCH: return "FETCH";
+	case STORE: return "STORE";
+	case CHECK: return "CHECK";
+	case TRANSACTION_START: return "TRANSACTION_START";
+	case TRANSACTION_COMMIT: return "TRANSACTION_COMMIT";
+	case NEEDS_RECOVERY: return "NEEDS_RECOVERY";
+	case SEND_SIGNAL: return "SEND_SIGNAL";
+	case CLOSE: return "CLOSE";
+	}
+	return "**INVALID**";
+}
+
+void free_external_agent(struct agent *agent)
+{
+	close(agent->cmdfd);
+	close(agent->responsefd);
+	free(agent);
+}
diff --git a/lib/tdb2/test/external-agent.h b/lib/tdb2/test/external-agent.h
new file mode 100644
index 0000000000..9eada10750
--- /dev/null
+++ b/lib/tdb2/test/external-agent.h
@@ -0,0 +1,43 @@
+#ifndef TDB2_TEST_EXTERNAL_AGENT_H
+#define TDB2_TEST_EXTERNAL_AGENT_H
+
+/* For locking tests, we need a different process to try things at
+ * various times. */
+enum operation {
+	OPEN,
+	OPEN_WITH_HOOK,
+	FETCH,
+	STORE,
+	TRANSACTION_START,
+	TRANSACTION_COMMIT,
+	NEEDS_RECOVERY,
+	CHECK,
+	SEND_SIGNAL,
+	CLOSE,
+};
+
+/* Do this before doing any tdb stuff.  Return handle, or NULL. */
+struct agent *prepare_external_agent(void);
+
+enum agent_return {
+	SUCCESS,
+	WOULD_HAVE_BLOCKED,
+	AGENT_DIED,
+	FAILED, /* For fetch, or NEEDS_RECOVERY */
+	OTHER_FAILURE,
+};
+
+/* Ask the external agent to try to do an operation.
+ * name == tdb name for OPEN/OPEN_WITH_HOOK,
+ * record name for FETCH/STORE (store stores name as data too)
+ */
+enum agent_return external_agent_operation(struct agent *handle,
+					   enum operation op,
+					   const char *name);
+
+/* Mapping enum -> string. */
+const char *agent_return_name(enum agent_return ret);
+const char *operation_name(enum operation op);
+
+void free_external_agent(struct agent *agent);
+#endif /* TDB2_TEST_EXTERNAL_AGENT_H */
diff --git a/lib/tdb2/test/failtest_helper.c b/lib/tdb2/test/failtest_helper.c
new file mode 100644
index 0000000000..1358a6c6b2
--- /dev/null
+++ b/lib/tdb2/test/failtest_helper.c
@@ -0,0 +1,117 @@
+#include "failtest_helper.h"
+#include "logging.h"
+#include <string.h>
+#include <ccan/tap/tap.h>
+
+/* FIXME: From ccan/str */
+static inline bool strends(const char *str, const char *postfix)
+{
+	if (strlen(str) < strlen(postfix))
+		return false;
+
+	return !strcmp(str + strlen(str) - strlen(postfix), postfix);
+}
+
+bool failmatch(const struct failtest_call *call,
+	       const char *file, int line, enum failtest_call_type type)
+{
+	return call->type == type
+		&& call->line == line
+		&& ((strcmp(call->file, file) == 0)
+		    || (strends(call->file, file)
+			&& (call->file[strlen(call->file) - strlen(file) - 1]
+			    == '/')));
+}
+
+static const struct failtest_call *
+find_repeat(const struct failtest_call *start, const struct failtest_call *end,
+	    const struct failtest_call *call)
+{
+	const struct failtest_call *i;
+
+	for (i = start; i < end; i++) {
+		if (failmatch(i, call->file, call->line, call->type))
+			return i;
+	}
+	return NULL;
+}
+
+static bool is_nonblocking_lock(const struct failtest_call *call)
+{
+	return call->type == FAILTEST_FCNTL && call->u.fcntl.cmd == F_SETLK;
+}
+
+static bool is_unlock(const struct failtest_call *call)
+{
+	return call->type == FAILTEST_FCNTL
+		&& call->u.fcntl.arg.fl.l_type == F_UNLCK;
+}
+
+bool exit_check_log(struct failtest_call *history, unsigned num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		if (!history[i].fail)
+			continue;
+		/* Failing the /dev/urandom open doesn't count: we fall back. */
+		if (failmatch(&history[i], URANDOM_OPEN))
+			continue;
+
+		/* Similarly with read fail. */
+		if (failmatch(&history[i], URANDOM_READ))
+			continue;
+
+		/* Initial allocation of tdb doesn't log. */
+		if (failmatch(&history[i], INITIAL_TDB_MALLOC))
+			continue;
+
+		/* We don't block "failures" on non-blocking locks. */
+		if (is_nonblocking_lock(&history[i]))
+			continue;
+
+		if (!tap_log_messages)
+			diag("We didn't log for %u (%s:%u)",
+			     i, history[i].file, history[i].line);
+		return tap_log_messages != 0;
+	}
+	return true;
+}
+
+/* Some places we soldier on despite errors: only fail them once per call site. */
+enum failtest_result
+block_repeat_failures(struct failtest_call *history, unsigned num)
+{
+	const struct failtest_call *i, *last = &history[num-1];
+
+	if (failmatch(last, INITIAL_TDB_MALLOC)
+	    || failmatch(last, URANDOM_OPEN)
+	    || failmatch(last, URANDOM_READ)) {
+		if (find_repeat(history, last, last))
+			return FAIL_DONT_FAIL;
+		return FAIL_PROBE;
+	}
+
+	/* Unlock or non-blocking lock is fail-once. */
+	if (is_unlock(last)) {
+		/* Walk every earlier hit of this call site looking for an unlock. */
+		for (i = find_repeat(history, last, last);
+		     i;
+		     i = find_repeat(i + 1, last, last)) {
+			if (is_unlock(i))
+				return FAIL_DONT_FAIL;
+		}
+		return FAIL_PROBE;
+	} else if (is_nonblocking_lock(last)) {
+		/* Walk every earlier hit looking for a non-blocking lock. */
+		for (i = find_repeat(history, last, last);
+		     i;
+		     i = find_repeat(i + 1, last, last)) {
+			if (is_nonblocking_lock(i))
+				return FAIL_DONT_FAIL;
+		}
+		return FAIL_PROBE;
+	}
+
+	return FAIL_OK;
+}
diff --git a/lib/tdb2/test/failtest_helper.h b/lib/tdb2/test/failtest_helper.h
new file mode 100644
index 0000000000..a62efbad58
--- /dev/null
+++ b/lib/tdb2/test/failtest_helper.h
@@ -0,0 +1,17 @@
+#ifndef TDB2_TEST_FAILTEST_HELPER_H
+#define TDB2_TEST_FAILTEST_HELPER_H
+#include <ccan/failtest/failtest.h>
+#include <stdbool.h>
+
+/* FIXME: Check these! */
+#define INITIAL_TDB_MALLOC	"open.c", 338, FAILTEST_MALLOC
+#define URANDOM_OPEN		"open.c", 45, FAILTEST_OPEN
+#define URANDOM_READ		"open.c", 25, FAILTEST_READ
+
+bool exit_check_log(struct failtest_call *history, unsigned num);
+bool failmatch(const struct failtest_call *call,
+	       const char *file, int line, enum failtest_call_type type);
+enum failtest_result
+block_repeat_failures(struct failtest_call *history, unsigned num);
+
+#endif /* TDB2_TEST_FAILTEST_HELPER_H */
diff --git a/lib/tdb2/test/layout.c b/lib/tdb2/test/layout.c
new file mode 100644
index 0000000000..31889ad080
--- /dev/null
+++ b/lib/tdb2/test/layout.c
@@ -0,0 +1,348 @@
+/* TDB tools to create various canned database layouts. */
+#include "layout.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <err.h>
+#include "logging.h"
+
+struct tdb_layout *new_tdb_layout(const char *filename)
+{
+	struct tdb_layout *layout = malloc(sizeof(*layout));
+	layout->filename = filename;
+	layout->num_elems = 0;
+	layout->elem = NULL;
+	return layout;
+}
+
+static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
+{
+	layout->elem = realloc(layout->elem,
+			       sizeof(layout->elem[0])
+			       * (layout->num_elems+1));
+	layout->elem[layout->num_elems++] = elem;
+}
+
+void tdb_layout_add_freetable(struct tdb_layout *layout)
+{
+	union tdb_layout_elem elem;
+	elem.base.type = FREETABLE;
+	add(layout, elem);
+}
+
+void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
+			 unsigned ftable)
+{
+	union tdb_layout_elem elem;
+	elem.base.type = FREE;
+	elem.free.len = len;
+	elem.free.ftable_num = ftable;
+	add(layout, elem);
+}
+
+static struct tdb_data dup_key(struct tdb_data key)
+{
+	struct tdb_data ret;
+	ret.dsize = key.dsize;
+	ret.dptr = malloc(ret.dsize);
+	memcpy(ret.dptr, key.dptr, ret.dsize);
+	return ret;
+}
+
+void tdb_layout_add_used(struct tdb_layout *layout,
+			 TDB_DATA key, TDB_DATA data,
+			 tdb_len_t extra)
+{
+	union tdb_layout_elem elem;
+	elem.base.type = DATA;
+	elem.used.key = dup_key(key);
+	elem.used.data = dup_key(data);
+	elem.used.extra = extra;
+	add(layout, elem);
+}
+
+static tdb_len_t free_record_len(tdb_len_t len)
+{
+	return sizeof(struct tdb_used_record) + len;
+}
+
+static tdb_len_t data_record_len(struct tle_used *used)
+{
+	tdb_len_t len;
+	len = sizeof(struct tdb_used_record)
+		+ used->key.dsize + used->data.dsize + used->extra;
+	assert(len >= sizeof(struct tdb_free_record));
+	return len;
+}
+
+static tdb_len_t hashtable_len(struct tle_hashtable *htable)
+{
+	return sizeof(struct tdb_used_record)
+		+ (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
+		+ htable->extra;
+}
+
+static tdb_len_t freetable_len(struct tle_freetable *ftable)
+{
+	return sizeof(struct tdb_freetable);
+}
+
+static void set_free_record(void *mem, tdb_len_t len)
+{
+	/* We do all the work in add_to_freetable */
+}
+
+static void add_zero_pad(struct tdb_used_record *u, size_t len, size_t extra)
+{
+	if (extra)
+		((char *)(u + 1))[len] = '\0';
+}
+
+static void set_data_record(void *mem, struct tdb_context *tdb,
+			    struct tle_used *used)
+{
+	struct tdb_used_record *u = mem;
+
+	set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize,
+		   used->key.dsize + used->data.dsize + used->extra,
+		   tdb_hash(tdb, used->key.dptr, used->key.dsize));
+	memcpy(u + 1, used->key.dptr, used->key.dsize);
+	memcpy((char *)(u + 1) + used->key.dsize,
+	       used->data.dptr, used->data.dsize);
+	add_zero_pad(u, used->key.dsize + used->data.dsize, used->extra);
+}
+
+static void set_hashtable(void *mem, struct tdb_context *tdb,
+			  struct tle_hashtable *htable)
+{
+	struct tdb_used_record *u = mem;
+	tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
+
+	set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0);
+	memset(u + 1, 0, len);
+	add_zero_pad(u, len, htable->extra);
+}
+
+static void set_freetable(void *mem, struct tdb_context *tdb,
+			 struct tle_freetable *freetable, struct tdb_header *hdr,
+			 tdb_off_t last_ftable)
+{
+	struct tdb_freetable *ftable = mem;
+	memset(ftable, 0, sizeof(*ftable));
+	set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0,
+			sizeof(*ftable) - sizeof(ftable->hdr),
+			sizeof(*ftable) - sizeof(ftable->hdr), 0);
+
+	if (last_ftable) {
+		ftable = (struct tdb_freetable *)((char *)hdr + last_ftable);
+		ftable->next = freetable->base.off;
+	} else {
+		hdr->free_table = freetable->base.off;
+	}
+}
+
+static void add_to_freetable(struct tdb_context *tdb,
+			     tdb_off_t eoff,
+			     tdb_off_t elen,
+			     unsigned ftable,
+			     struct tle_freetable *freetable)
+{
+	tdb->ftable_off = freetable->base.off;
+	tdb->ftable = ftable;
+	add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen,
+			TDB_LOCK_WAIT, false);
+}
+
+static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup)
+{
+	return group_start
+		+ (ingroup % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
+}
+
+/* Get bits from a value. */
+static uint32_t bits(uint64_t val, unsigned start, unsigned num)
+{
+	assert(num <= 32);
+	return (val >> start) & ((1U << num) - 1);
+}
+
+/* We take bits from the top: that way we can lock whole sections of the hash
+ * by using lock ranges. */
+static uint32_t use_bits(uint64_t h, unsigned num, unsigned *used)
+{
+	*used += num;
+	return bits(h, 64 - *used, num);
+}
+
+static tdb_off_t encode_offset(tdb_off_t new_off, unsigned bucket,
+			       uint64_t h)
+{
+	return bucket
+		| new_off
+		| ((uint64_t)bits(h, 64 - TDB_OFF_UPPER_STEAL_EXTRA,
+				  TDB_OFF_UPPER_STEAL_EXTRA)
+		   << TDB_OFF_HASH_EXTRA_BIT);
+}
+
+/* FIXME: Our hash table handling here is primitive: we don't expand! */
+static void add_to_hashtable(struct tdb_context *tdb,
+			     tdb_off_t eoff,
+			     struct tdb_data key)
+{
+	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
+	tdb_off_t b_off, group_start;
+	unsigned i, group, in_group;
+	unsigned used = 0;
+
+	group = use_bits(h, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, &used);
+	in_group = use_bits(h, TDB_HASH_GROUP_BITS, &used);
+
+	group_start = offsetof(struct tdb_header, hashtable)
+		+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+
+	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+		unsigned bucket = (in_group + i) % (1 << TDB_HASH_GROUP_BITS);
+
+		b_off = hbucket_off(group_start, bucket);
+		if (tdb_read_off(tdb, b_off) == 0) {
+			tdb_write_off(tdb, b_off,
+				      encode_offset(eoff, bucket, h));
+			return;
+		}
+	}
+	abort();
+}
+
+static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
+{
+	unsigned i;
+
+	for (i = 0; i < layout->num_elems; i++) {
+		if (layout->elem[i].base.type != FREETABLE)
+			continue;
+		if (num == 0)
+			return &layout->elem[i].ftable;
+		num--;
+	}
+	abort();
+}
+
+/* FIXME: Support TDB_CONVERT */
+struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
+{
+	unsigned int i;
+	tdb_off_t off, len, last_ftable;
+	char *mem;
+	struct tdb_context *tdb;
+
+	off = sizeof(struct tdb_header);
+
+	/* First pass of layout: calc lengths */
+	for (i = 0; i < layout->num_elems; i++) {
+		union tdb_layout_elem *e = &layout->elem[i];
+		e->base.off = off;
+		switch (e->base.type) {
+		case FREETABLE:
+			len = freetable_len(&e->ftable);
+			break;
+		case FREE:
+			len = free_record_len(e->free.len);
+			break;
+		case DATA:
+			len = data_record_len(&e->used);
+			break;
+		case HASHTABLE:
+			len = hashtable_len(&e->hashtable);
+			break;
+		default:
+			abort();
+		}
+		off += len;
+	}
+
+	mem = malloc(off);
+	/* Fill with some weird pattern. */
+	memset(mem, 0x99, off);
+	/* Now populate our header, cribbing from a real TDB header. */
+	tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, &tap_log_attr);
+	memcpy(mem, tdb->file->map_ptr, sizeof(struct tdb_header));
+
+	/* Mug the tdb we have to make it use this. */
+	free(tdb->file->map_ptr);
+	tdb->file->map_ptr = mem;
+	tdb->file->map_size = off;
+
+	last_ftable = 0;
+	for (i = 0; i < layout->num_elems; i++) {
+		union tdb_layout_elem *e = &layout->elem[i];
+		switch (e->base.type) {
+		case FREETABLE:
+			set_freetable(mem + e->base.off, tdb, &e->ftable,
+				     (struct tdb_header *)mem, last_ftable);
+			last_ftable = e->base.off;
+			break;
+		case FREE:
+			set_free_record(mem + e->base.off, e->free.len);
+			break;
+		case DATA:
+			set_data_record(mem + e->base.off, tdb, &e->used);
+			break;
+		case HASHTABLE:
+			set_hashtable(mem + e->base.off, tdb, &e->hashtable);
+			break;
+		}
+	}
+	/* Must have a free table! */
+	assert(last_ftable);
+
+	/* Now fill the free and hash tables. */
+	for (i = 0; i < layout->num_elems; i++) {
+		union tdb_layout_elem *e = &layout->elem[i];
+		switch (e->base.type) {
+		case FREE:
+			add_to_freetable(tdb, e->base.off, e->free.len,
+					 e->free.ftable_num,
+					 find_ftable(layout, e->free.ftable_num));
+			break;
+		case DATA:
+			add_to_hashtable(tdb, e->base.off, e->used.key);
+			break;
+		default:
+			break;
+		}
+	}
+
+	tdb->ftable_off = find_ftable(layout, 0)->base.off;
+
+	/* Get physical if they asked for it. */
+	if (layout->filename) {
+		int fd = open(layout->filename, O_WRONLY|O_TRUNC|O_CREAT,
+			      0600);
+		if (fd < 0)
+			err(1, "opening %s for writing", layout->filename);
+		if (write(fd, tdb->file->map_ptr, tdb->file->map_size)
+		    != tdb->file->map_size)
+			err(1, "writing %s", layout->filename);
+		close(fd);
+		tdb_close(tdb);
+		/* NOMMAP is for lockcheck. */
+		tdb = tdb_open(layout->filename, TDB_NOMMAP, O_RDWR, 0,
+			       &tap_log_attr);
+	}
+
+	return tdb;
+}
+
+void tdb_layout_free(struct tdb_layout *layout)
+{
+	unsigned int i;
+
+	for (i = 0; i < layout->num_elems; i++) {
+		if (layout->elem[i].base.type == DATA) {
+			free(layout->elem[i].used.key.dptr);
+			free(layout->elem[i].used.data.dptr);
+		}
+	}
+	free(layout->elem);
+	free(layout);
+}
diff --git a/lib/tdb2/test/layout.h b/lib/tdb2/test/layout.h
new file mode 100644
index 0000000000..6e2e6657a7
--- /dev/null
+++ b/lib/tdb2/test/layout.h
@@ -0,0 +1,68 @@
+#ifndef TDB2_TEST_LAYOUT_H
+#define TDB2_TEST_LAYOUT_H
+#include <ccan/tdb2/private.h>
+
+struct tdb_layout *new_tdb_layout(const char *filename);
+void tdb_layout_add_freetable(struct tdb_layout *layout);
+void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
+			 unsigned ftable);
+void tdb_layout_add_used(struct tdb_layout *layout,
+			 TDB_DATA key, TDB_DATA data,
+			 tdb_len_t extra);
+#if 0 /* FIXME: Allow allocation of subtables */
+void tdb_layout_add_hashtable(struct tdb_layout *layout,
+			      int htable_parent, /* -1 == toplevel */
+			      unsigned int bucket,
+			      tdb_len_t extra);
+#endif
+struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
+void tdb_layout_free(struct tdb_layout *layout);
+
+enum layout_type {
+	FREETABLE, FREE, DATA, HASHTABLE,
+};
+
+/* Shared by all union members. */
+struct tle_base {
+	enum layout_type type;
+	tdb_off_t off;
+};
+
+struct tle_freetable {
+	struct tle_base base;
+};
+
+struct tle_free {
+	struct tle_base base;
+	tdb_len_t len;
+	unsigned ftable_num;
+};
+
+struct tle_used {
+	struct tle_base base;
+	TDB_DATA key;
+	TDB_DATA data;
+	tdb_len_t extra;
+};
+
+struct tle_hashtable {
+	struct tle_base base;
+	int parent;
+	unsigned int bucket;
+	tdb_len_t extra;
+};
+
+union tdb_layout_elem {
+	struct tle_base base;
+	struct tle_freetable ftable;
+	struct tle_free free;
+	struct tle_used used;
+	struct tle_hashtable hashtable;
+};
+
+struct tdb_layout {
+	const char *filename;
+	unsigned int num_elems;
+	union tdb_layout_elem *elem;
+};
+#endif /* TDB2_TEST_LAYOUT_H */
diff --git a/lib/tdb2/test/lock-tracking.c b/lib/tdb2/test/lock-tracking.c
new file mode 100644
index 0000000000..05dba32fd3
--- /dev/null
+++ b/lib/tdb2/test/lock-tracking.c
@@ -0,0 +1,147 @@
+/* We save the locks so we can reacquire them. */
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <ccan/tap/tap.h>
+#include <ccan/tdb2/private.h>
+#include "lock-tracking.h"
+
+struct lock {
+	struct lock *next;
+	unsigned int off;
+	unsigned int len;
+	int type;
+};
+static struct lock *locks;
+int locking_errors = 0;
+bool suppress_lockcheck = false;
+bool nonblocking_locks;
+int locking_would_block = 0;
+void (*unlock_callback)(int fd);
+
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ )
+{
+	va_list ap;
+	int ret, arg3;
+	struct flock *fl;
+	bool may_block = false;
+
+	if (cmd != F_SETLK && cmd != F_SETLKW) {
+		/* This may be totally bogus, but we don't know in general. */
+		va_start(ap, cmd);
+		arg3 = va_arg(ap, int);
+		va_end(ap);
+
+		return fcntl(fd, cmd, arg3);
+	}
+
+	va_start(ap, cmd);
+	fl = va_arg(ap, struct flock *);
+	va_end(ap);
+
+	if (cmd == F_SETLKW && nonblocking_locks) {
+		cmd = F_SETLK;
+		may_block = true;
+	}
+	ret = fcntl(fd, cmd, fl);
+
+	/* Detect when we failed, but might have been OK if we waited. */
+	if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) {
+		locking_would_block++;
+	}
+
+	if (fl->l_type == F_UNLCK) {
+		struct lock **l;
+		struct lock *old = NULL;
+
+		for (l = &locks; *l; l = &(*l)->next) {
+			if ((*l)->off == fl->l_start
+			    && (*l)->len == fl->l_len) {
+				if (ret == 0) {
+					old = *l;
+					*l = (*l)->next;
+					free(old);
+				}
+				break;
+			}
+		}
+		if (!old && !suppress_lockcheck) {
+			diag("Unknown unlock %u@%u - %i",
+			     (int)fl->l_len, (int)fl->l_start, ret);
+			locking_errors++;
+		}
+	} else {
+		struct lock *new, *i;
+		unsigned int fl_end = fl->l_start + fl->l_len;
+		if (fl->l_len == 0)
+			fl_end = (unsigned int)-1;
+
+		/* Check for overlaps: we shouldn't do this. */
+		for (i = locks; i; i = i->next) {
+			unsigned int i_end = i->off + i->len;
+			if (i->len == 0)
+				i_end = (unsigned int)-1;
+
+			if (fl->l_start >= i->off && fl->l_start < i_end)
+				break;
+			if (fl_end > i->off && fl_end < i_end)
+				break;
+
+			/* tdb_allrecord_lock does this, handle adjacent: */
+			if (fl->l_start > TDB_HASH_LOCK_START
+			    && fl->l_start == i_end && fl->l_type == i->type) {
+				if (ret == 0) {
+					i->len = fl->l_len
+						? i->len + fl->l_len
+						: 0;
+				}
+				goto done;
+			}
+		}
+		if (i) {
+			/* Special case: upgrade of allrecord lock. */
+			if (i->type == F_RDLCK && fl->l_type == F_WRLCK
+			    && i->off == TDB_HASH_LOCK_START
+			    && fl->l_start == TDB_HASH_LOCK_START
+			    && i->len == 0
+			    && fl->l_len == 0) {
+				if (ret == 0)
+					i->type = F_WRLCK;
+				goto done;
+			}
+			if (!suppress_lockcheck) {
+				diag("%s lock %u@%u overlaps %u@%u",
+				     fl->l_type == F_WRLCK ? "write" : "read",
+				     (int)fl->l_len, (int)fl->l_start,
+				     i->len, (int)i->off);
+				locking_errors++;
+			}
+		}
+
+		if (ret == 0) {
+			new = malloc(sizeof *new);
+			new->off = fl->l_start;
+			new->len = fl->l_len;
+			new->type = fl->l_type;
+			new->next = locks;
+			locks = new;
+		}
+	}
+done:
+	if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback)
+		unlock_callback(fd);
+	return ret;
+}
+
+unsigned int forget_locking(void)
+{
+	unsigned int num = 0;
+	while (locks) {
+		struct lock *next = locks->next;
+		free(locks);
+		locks = next;
+		num++;
+	}
+	return num;
+}
diff --git a/lib/tdb2/test/lock-tracking.h b/lib/tdb2/test/lock-tracking.h
new file mode 100644
index 0000000000..f2c9c44653
--- /dev/null
+++ b/lib/tdb2/test/lock-tracking.h
@@ -0,0 +1,25 @@
+#ifndef LOCK_TRACKING_H
+#define LOCK_TRACKING_H
+#include <stdbool.h>
+
+/* Set this if you want a callback after fcntl unlock. */
+extern void (*unlock_callback)(int fd);
+
+/* Replacement fcntl. */
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ );
+
+/* Discard locking info: returns number of locks outstanding. */
+unsigned int forget_locking(void);
+
+/* Number of errors in locking. */
+extern int locking_errors;
+
+/* Suppress lock checking. */
+extern bool suppress_lockcheck;
+
+/* Make all locks non-blocking. */
+extern bool nonblocking_locks;
+
+/* Number of times we failed a lock because we made it non-blocking. */
+extern int locking_would_block;
+#endif /* LOCK_TRACKING_H */
diff --git a/lib/tdb2/test/logging.c b/lib/tdb2/test/logging.c
new file mode 100644
index 0000000000..d32cfa9b59
--- /dev/null
+++ b/lib/tdb2/test/logging.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+unsigned tap_log_messages;
+const char *log_prefix = "";
+bool suppress_logging;
+
+union tdb_attribute tap_log_attr = {
+	.log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
+		 .fn = tap_log_fn }
+};
+
+void tap_log_fn(struct tdb_context *tdb,
+		enum tdb_log_level level,
+		const char *message, void *priv)
+{
+	if (suppress_logging)
+		return;
+
+	diag("tdb log level %u: %s%s", level, log_prefix, message);
+	tap_log_messages++;
+}
diff --git a/lib/tdb2/test/logging.h b/lib/tdb2/test/logging.h
new file mode 100644
index 0000000000..d172f867fd
--- /dev/null
+++ b/lib/tdb2/test/logging.h
@@ -0,0 +1,15 @@
+#ifndef TDB2_TEST_LOGGING_H
+#define TDB2_TEST_LOGGING_H
+#include <ccan/tdb2/tdb2.h>
+#include <stdbool.h>
+#include <string.h>
+
+extern bool suppress_logging;
+extern const char *log_prefix;
+extern unsigned tap_log_messages;
+extern union tdb_attribute tap_log_attr;
+
+void tap_log_fn(struct tdb_context *tdb,
+		enum tdb_log_level level,
+		const char *message, void *priv);
+#endif /* TDB2_TEST_LOGGING_H */
diff --git a/lib/tdb2/test/run-001-encode.c b/lib/tdb2/test/run-001-encode.c
new file mode 100644
index 0000000000..ffa4b93c02
--- /dev/null
+++ b/lib/tdb2/test/run-001-encode.c
@@ -0,0 +1,48 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_used_record rec;
+	struct tdb_context tdb = { .log_fn = tap_log_fn };
+
+	plan_tests(64 + 32 + 48*6 + 1);
+
+	/* We should be able to encode any data value. */
+	for (i = 0; i < 64; i++)
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
+			       1ULL << i, 0) == 0);
+
+	/* And any key and data with < 64 bits between them. */
+	for (i = 0; i < 32; i++) {
+		tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+			       klen + dlen, 0)  == 0);
+	}
+
+	/* We should neatly encode all values. */
+	for (i = 0; i < 48; i++) {
+		uint64_t h = 1ULL << (i < 5 ? i : 4);
+		uint64_t klen = 1ULL << (i < 16 ? i : 15);
+		uint64_t dlen = 1ULL << i;
+		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+			       klen+dlen+xlen, h) == 0);
+		ok1(rec_key_length(&rec) == klen);
+		ok1(rec_data_length(&rec) == dlen);
+		ok1(rec_extra_padding(&rec) == xlen);
+		ok1((uint64_t)rec_hash(&rec) == h);
+		ok1(rec_magic(&rec) == TDB_USED_MAGIC);
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-001-fls.c b/lib/tdb2/test/run-001-fls.c
new file mode 100644
index 0000000000..d54cad1d1c
--- /dev/null
+++ b/lib/tdb2/test/run-001-fls.c
@@ -0,0 +1,40 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+
+static unsigned int dumb_fls(uint64_t num)
+{
+	int i;
+
+	for (i = 63; i >= 0; i--) {
+		if (num & (1ULL << i))
+			break;
+	}
+	return i + 1;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+
+	plan_tests(64 * 64 + 2);
+	/* Zero has no bit set: both implementations must report 0. */
+	ok1(fls64(0) == 0);
+	ok1(dumb_fls(0) == 0);
+	/* Compare fls64 against the reference for every one- or two-bit value. */
+	for (i = 0; i < 64; i++) {
+		for (j = 0; j < 64; j++) {
+			uint64_t val = (1ULL << i) | (1ULL << j);
+			ok(fls64(val) == dumb_fls(val),
+			   "%llu -> %u should be %u", (unsigned long long)val,
+			   fls64(val), dumb_fls(val));
+		}
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-01-new_database.c b/lib/tdb2/test/run-01-new_database.c
new file mode 100644
index 0000000000..32ebaf09c1
--- /dev/null
+++ b/lib/tdb2/test/run-01-new_database.c
@@ -0,0 +1,42 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-new_database.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			failtest_exit(exit_status());
+		if (tdb) {
+			bool ok = ok1(tdb_check(tdb, NULL, NULL) == 0);
+			tdb_close(tdb);
+			if (!ok)
+				failtest_exit(exit_status());
+		}
+		if (!ok1(tap_log_messages == 0))
+			break;
+	}
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-02-expand.c b/lib/tdb2/test/run-02-expand.c
new file mode 100644
index 0000000000..6666ae167e
--- /dev/null
+++ b/lib/tdb2/test/run-02-expand.c
@@ -0,0 +1,80 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* Only the tdb_expand() calls need failure injection; setup runs unhindered. */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	/* With suppression switched on, no failure is ever injected. */
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	uint64_t val;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1);
+
+	failtest_init(argc, argv);
+	failtest_hook = suppress_failure;
+	failtest_exit_check = exit_check_log;
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		failtest_suppress = true;	/* no injected failures in setup */
+		tdb = tdb_open("run-expand.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			break;
+
+		val = tdb->file->map_size;	/* size before the expand */
+		/* Need some hash lock for expand. */
+		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
+		failtest_suppress = false;	/* failures may hit the expand */
+		if (!ok1(tdb_expand(tdb, 1) == 0)) {
+			failtest_suppress = true;
+			tdb_close(tdb);
+			break;
+		}
+		failtest_suppress = true;
+
+		ok1(tdb->file->map_size >= val + 1 * TDB_EXTENSION_FACTOR);
+		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		val = tdb->file->map_size;	/* size before the larger expand */
+		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
+		failtest_suppress = false;	/* failures may hit the expand */
+		if (!ok1(tdb_expand(tdb, 1024) == 0)) {
+			failtest_suppress = true;
+			tdb_close(tdb);
+			break;
+		}
+		failtest_suppress = true;
+		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
+		ok1(tdb->file->map_size >= val + 1024 * TDB_EXTENSION_FACTOR);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-03-coalesce.c b/lib/tdb2/test/run-03-coalesce.c
new file mode 100644
index 0000000000..3fdd11c077
--- /dev/null
+++ b/lib/tdb2/test/run-03-coalesce.c
@@ -0,0 +1,170 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+#include "layout.h"
+
+static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
+{
+	struct tdb_free_record frec;
+	enum TDB_ERROR err = tdb_read_convert(tdb, off, &frec, sizeof(frec));
+
+	/* Errors travel back through the length return value. */
+	if (err != TDB_SUCCESS)
+		return err;
+	if (frec_magic(&frec) == TDB_FREE_MAGIC)
+		return frec_len(&frec);
+	return TDB_ERR_CORRUPT;
+}
+
+int main(int argc, char *argv[])
+{
+	tdb_off_t b_off, test;
+	struct tdb_context *tdb;
+	struct tdb_layout *layout;
+	struct tdb_data data, key;
+	tdb_len_t len;
+
+	/* FIXME: Test TDB_CONVERT */
+	/* FIXME: Test lock order fail. */
+
+	plan_tests(42);	/* 7 + 7 + 9 + 9 + 9 checks, + 1 for the log */
+	data = tdb_mkdata("world", 5);
+	key = tdb_mkdata("hello", 5);
+
+	/* No coalescing can be done due to EOF */
+	layout = new_tdb_layout("run-03-coalesce.tdb");
+	tdb_layout_add_freetable(layout);
+	len = 1024;
+	tdb_layout_add_free(layout, len, 0);
+	tdb = tdb_layout_get(layout);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
+
+	/* Figure out which bucket the free entry is in. */
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
+	/* Lock and fail to coalesce. */
+	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+	test = layout->elem[1].base.off;	/* must come back untouched */
+	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len, &test)
+	    == 0);
+	tdb_unlock_free_bucket(tdb, b_off);
+	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
+	ok1(test == layout->elem[1].base.off);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+	tdb_close(tdb);
+	tdb_layout_free(layout);
+
+	/* No coalescing can be done due to used record */
+	layout = new_tdb_layout("run-03-coalesce.tdb");
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_free(layout, 1024, 0);
+	tdb_layout_add_used(layout, key, data, 6);
+	tdb = tdb_layout_get(layout);
+	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket the free entry is in. */
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+	/* Lock and fail to coalesce. */
+	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+	test = layout->elem[1].base.off;	/* must come back untouched */
+	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 0);
+	tdb_unlock_free_bucket(tdb, b_off);
+	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+	ok1(test == layout->elem[1].base.off);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+	tdb_close(tdb);
+	tdb_layout_free(layout);
+
+	/* Coalescing can be done due to two free records, then EOF */
+	layout = new_tdb_layout("run-03-coalesce.tdb");
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_free(layout, 1024, 0);
+	tdb_layout_add_free(layout, 2048, 0);
+	tdb = tdb_layout_get(layout);
+	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+	ok1(free_record_length(tdb, layout->elem[2].base.off) == 2048);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket the (first) free entry is in. */
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+	/* Lock and coalesce. */
+	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+	test = layout->elem[2].base.off;	/* the second free record */
+	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 1024 + sizeof(struct tdb_used_record) + 2048);
+	/* Should tell us it's erased this one... */
+	ok1(test == TDB_ERR_NOEXIST);
+	ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
+	ok1(free_record_length(tdb, layout->elem[1].base.off)
+	    == 1024 + sizeof(struct tdb_used_record) + 2048);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+	tdb_close(tdb);
+	tdb_layout_free(layout);
+
+	/* Coalescing can be done due to two free records, then data */
+	layout = new_tdb_layout("run-03-coalesce.tdb");
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_free(layout, 1024, 0);
+	tdb_layout_add_free(layout, 512, 0);
+	tdb_layout_add_used(layout, key, data, 6);
+	tdb = tdb_layout_get(layout);
+	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket the free entry is in. */
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+	/* Lock and coalesce. */
+	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+	test = layout->elem[2].base.off;	/* the second free record */
+	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 1024 + sizeof(struct tdb_used_record) + 512);
+	ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
+	ok1(free_record_length(tdb, layout->elem[1].base.off)
+	    == 1024 + sizeof(struct tdb_used_record) + 512);
+	ok1(test == TDB_ERR_NOEXIST);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+	tdb_close(tdb);
+	tdb_layout_free(layout);
+
+	/* Coalescing can be done due to three free records, then EOF */
+	layout = new_tdb_layout("run-03-coalesce.tdb");
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_free(layout, 1024, 0);
+	tdb_layout_add_free(layout, 512, 0);
+	tdb_layout_add_free(layout, 256, 0);
+	tdb = tdb_layout_get(layout);
+	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
+	ok1(free_record_length(tdb, layout->elem[3].base.off) == 256);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket the free entry is in. */
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+	/* Lock and coalesce. */
+	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+	test = layout->elem[2].base.off;	/* the second free record */
+	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 1024 + sizeof(struct tdb_used_record) + 512
+	    + sizeof(struct tdb_used_record) + 256);
+	ok1(tdb->file->allrecord_lock.count == 0
+	    && tdb->file->num_lockrecs == 0);
+	ok1(free_record_length(tdb, layout->elem[1].base.off)
+	    == 1024 + sizeof(struct tdb_used_record) + 512
+	    + sizeof(struct tdb_used_record) + 256);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+	tdb_close(tdb);
+	tdb_layout_free(layout);
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-04-basichash.c b/lib/tdb2/test/run-04-basichash.c
new file mode 100644
index 0000000000..62031bdb40
--- /dev/null
+++ b/lib/tdb2/test/run-04-basichash.c
@@ -0,0 +1,267 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* We rig the hash so adjacent-numbered records always clash. */
+static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	const unsigned int *word = key;	/* only the leading int is hashed */
+	return (uint64_t)*word << (64 - TDB_TOPLEVEL_HASH_BITS - 1);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	unsigned int v;
+	struct tdb_used_record rec;
+	struct tdb_data key = { (unsigned char *)&v, sizeof(v) };
+	struct tdb_data dbuf = { (unsigned char *)&v, sizeof(v) };
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.fn = clash } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT,
+	};
+
+	hattr.base.next = &tap_log_attr;	/* chain the tap logger too */
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (91 + (2 * ((1 << TDB_HASH_GROUP_BITS) - 1))) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct hash_info h;
+		tdb_off_t new_off, off, subhash;
+
+		tdb = tdb_open("run-04-basichash.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		v = 0;
+		/* Should not find it. */
+		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 0. */
+		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+		ok1((tdb->flags & TDB_NOLOCK)
+		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		/* Allocate a new record. */
+		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+				TDB_USED_MAGIC, false);
+		ok1(!TDB_OFF_IS_ERR(new_off));
+
+		/* We should be able to add it now. */
+		ok1(add_to_hash(tdb, &h, new_off) == 0);
+
+		/* Make sure we fill it in for later finding. */
+		off = new_off + sizeof(struct tdb_used_record);
+		ok1(!tdb->methods->twrite(tdb, off, key.dptr, key.dsize));
+		off += key.dsize;
+		ok1(!tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
+
+		/* We should be able to unlock that OK. */
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Database should be consistent. */
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now, this should give a successful lookup. */
+		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
+		    == new_off);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 0. */
+		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+		ok1((tdb->flags & TDB_NOLOCK)
+		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Database should be consistent. */
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Test expansion. */
+		v = 1;
+		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 1. */
+		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 1);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+		ok1((tdb->flags & TDB_NOLOCK)
+		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		/* Make it expand 0'th bucket. */
+		ok1(expand_group(tdb, &h) == 0);
+		/* First one should be subhash, next should be empty. */
+		ok1(is_subhash(h.group[0]));
+		subhash = (h.group[0] & TDB_OFF_MASK);
+		for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
+			ok1(h.group[j] == 0);
+
+		ok1(tdb_write_convert(tdb, h.group_start,
+				      h.group, sizeof(h.group)) == 0);
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Should be happy with expansion. */
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Should be able to find it. */
+		v = 0;
+		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
+		    == new_off);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in expanded group 0, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+		    + TDB_SUBLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+		ok1((tdb->flags & TDB_NOLOCK)
+		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		/* Simple delete should work. */
+		ok1(delete_from_hash(tdb, &h) == 0);
+		ok1(add_free_record(tdb, new_off,
+				    sizeof(struct tdb_used_record)
+				    + rec_key_length(&rec)
+				    + rec_data_length(&rec)
+				    + rec_extra_padding(&rec),
+				    TDB_LOCK_NOWAIT, false) == 0);
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Test second-level expansion: should expand 0th bucket. */
+		v = 0;
+		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in expanded group 0, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS+TDB_SUBLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+		ok1((tdb->flags & TDB_NOLOCK)
+		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		ok1(expand_group(tdb, &h) == 0);
+		/* First one should be subhash, next should be empty. */
+		ok1(is_subhash(h.group[0]));
+		subhash = (h.group[0] & TDB_OFF_MASK);
+		for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
+			ok1(h.group[j] == 0);
+		ok1(tdb_write_convert(tdb, h.group_start,
+				      h.group, sizeof(h.group)) == 0);
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Should be happy with expansion. */
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in the new subhash, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+		    + TDB_SUBLEVEL_HASH_BITS * 2);
+
+		/* We should be able to add it now. */
+		/* Allocate a new record. */
+		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+				TDB_USED_MAGIC, false);
+		ok1(!TDB_OFF_IS_ERR(new_off));
+		ok1(add_to_hash(tdb, &h, new_off) == 0);
+
+		/* Make sure we fill it in for later finding. */
+		off = new_off + sizeof(struct tdb_used_record);
+		ok1(!tdb->methods->twrite(tdb, off, key.dptr, key.dsize));
+		off += key.dsize;
+		ok1(!tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
+
+		/* We should be able to unlock that OK. */
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Database should be consistent. */
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Should be able to find it. */
+		v = 0;
+		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
+		    == new_off);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in expanded group 0, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+		    + TDB_SUBLEVEL_HASH_BITS * 2);
+
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-05-readonly-open.c b/lib/tdb2/test/run-05-readonly-open.c
new file mode 100644
index 0000000000..0f1a4343d8
--- /dev/null
+++ b/lib/tdb2/test/run-05-readonly-open.c
@@ -0,0 +1,88 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* Failures are only injected into the read-only open, fetch and stores. */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	/* With suppression switched on, no failure is ever injected. */
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4), d;
+	union tdb_attribute seed_attr;
+	unsigned int msgs = 0;	/* log messages expected so far */
+
+	failtest_init(argc, argv);
+	failtest_hook = suppress_failure;
+	failtest_exit_check = exit_check_log;
+
+	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
+	seed_attr.base.next = &tap_log_attr;
+	seed_attr.seed.seed = 0;
+
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &seed_attr);
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		tdb_close(tdb);	/* NOTE(review): tdb unchecked above */
+
+		failtest_suppress = false;
+		tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
+			       O_RDONLY, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			break;
+		ok1(tap_log_messages == msgs);
+		/* Fetch should succeed; each refused store should log once. */
+		if (!ok1(tdb_fetch(tdb, key, &d) == 0))
+			goto fail;
+		ok1(tdb_deq(d, data));
+		free(d.dptr);
+		if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
+			 == TDB_ERR_RDONLY))
+			goto fail;
+		ok1(tap_log_messages == ++msgs);
+		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
+			 == TDB_ERR_RDONLY))
+			goto fail;
+		ok1(tap_log_messages == ++msgs);
+		failtest_suppress = true;
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		tdb_close(tdb);
+		ok1(tap_log_messages == msgs);
+		/* SIGH: failtest bug, it doesn't save the tdb file because
+		 * we have it read-only.  If we go around again, it gets
+		 * changed underneath us and things get screwy. */
+		if (failtest_has_failed())
+			break;
+	}
+	failtest_exit(exit_status());
+
+fail:
+	failtest_suppress = true;	/* no injected failures in cleanup */
+	tdb_close(tdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-10-simple-store.c b/lib/tdb2/test/run-10-simple-store.c
new file mode 100644
index 0000000000..35c387a3be
--- /dev/null
+++ b/lib/tdb2/test/run-10-simple-store.c
@@ -0,0 +1,76 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* Failures are only injected into the tdb_store() calls under test. */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	/* With suppression switched on, no failure is ever injected. */
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+
+	failtest_init(argc, argv);
+	failtest_hook = suppress_failure;
+	failtest_exit_check = exit_check_log;
+
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-10-simple-store.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			break;
+		/* Modify should fail: the key has not been stored yet. */
+		failtest_suppress = false;
+		if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
+			 == TDB_ERR_NOEXIST))
+			goto fail;
+		failtest_suppress = true;
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		/* Insert should succeed. */
+		failtest_suppress = false;
+		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0))
+			goto fail;
+		failtest_suppress = true;
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		/* Second insert should fail: the key now exists. */
+		failtest_suppress = false;
+		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
+			 == TDB_ERR_EXISTS))
+			goto fail;
+		failtest_suppress = true;
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		tdb_close(tdb);
+	}
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status());
+
+fail:
+	failtest_suppress = true;	/* no injected failures in cleanup */
+	tdb_close(tdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-11-simple-fetch.c b/lib/tdb2/test/run-11-simple-fetch.c
new file mode 100644
index 0000000000..29b6bf0872
--- /dev/null
+++ b/lib/tdb2/test/run-11-simple-fetch.c
@@ -0,0 +1,76 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* Failures are only injected into the tdb_fetch() calls under test. */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	/* With suppression switched on, no failure is ever injected. */
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+
+	failtest_init(argc, argv);
+	failtest_hook = suppress_failure;
+	failtest_exit_check = exit_check_log;
+
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-11-simple-fetch.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (tdb) {
+			struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+
+			/* Fetch should fail: nothing has been stored yet. */
+			failtest_suppress = false;
+			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST))
+				goto fail;
+			failtest_suppress = true;
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			/* Insert should succeed. */
+			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			/* Fetch should now work. */
+			failtest_suppress = false;
+			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
+				goto fail;
+			failtest_suppress = true;
+			ok1(tdb_deq(d, data));
+			free(d.dptr);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			tdb_close(tdb);
+		}
+	}
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status());	/* same exit route as the fail path */
+
+fail:
+	failtest_suppress = true;	/* no injected failures in cleanup */
+	tdb_close(tdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-12-store.c b/lib/tdb2/test/run-12-store.c
new file mode 100644
index 0000000000..ba2e4f8971
--- /dev/null
+++ b/lib/tdb2/test/run-12-store.c
@@ -0,0 +1,58 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* We use the same seed which we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	const uint64_t *fixed_seed = p;	/* 'seed' is ignored; p supplies it */
+	return hash64_stable((const unsigned char *)key, len, *fixed_seed);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	uint64_t seed = 16014841315512641303ULL;
+	union tdb_attribute fixed_hattr
+		= { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+			      .fn = fixedhash,
+			      .data = &seed } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
+	struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
+
+	fixed_hattr.base.next = &tap_log_attr;	/* chain the tap logger too */
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-12-store.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* We seemed to lose some keys.  Key and data both alias j,
+		 * so store each j and check it can be read straight back. */
+		for (j = 0; j < 500; j++) {
+			struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+			ok1(tdb_deq(d, data));
+			free(d.dptr);
+		}
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-13-delete.c b/lib/tdb2/test/run-13-delete.c
new file mode 100644
index 0000000000..3b464d927e
--- /dev/null
+++ b/lib/tdb2/test/run-13-delete.c
@@ -0,0 +1,207 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* We rig the hash so adjacent-numbered records always clash. */
+static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	const unsigned int *word = key;	/* only the leading int is hashed */
+	return (uint64_t)*word << (64 - TDB_TOPLEVEL_HASH_BITS - 1);
+}
+
+/* We use the same seed which we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	const uint64_t *fixed_seed = p;	/* 'seed' is ignored; p supplies it */
+	return hash64_stable((const unsigned char *)key, len, *fixed_seed);
+}
+
+static bool store_records(struct tdb_context *tdb)
+{
+	int i;
+	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
+	struct tdb_data d, data = { (unsigned char *)&i, sizeof(i) };
+	/* Store + re-read 1000 records; d is used only after a good fetch. */
+	for (i = 0; i < 1000; i++) {
+		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0
+		    || tdb_fetch(tdb, key, &d) != TDB_SUCCESS)
+			return false;
+		if (!tdb_deq(d, data))
+			return false;
+		free(d.dptr);
+	}
+	return true;
+}
+
+static void test_val(struct tdb_context *tdb, uint64_t val)
+{
+	uint64_t v;	/* backing storage for both key and data */
+	struct tdb_data key = { (unsigned char *)&v, sizeof(v) };
+	struct tdb_data d, data = { (unsigned char *)&v, sizeof(v) };
+	/* NOTE(review): d is only meaningful after a successful tdb_fetch(). */
+	/* Insert an entry, then delete it. */
+	v = val;
+	/* Delete should fail. */
+	ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Insert should succeed. */
+	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Delete should succeed. */
+	ok1(tdb_delete(tdb, key) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Re-add it, then add collision. */
+	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+	v = val + 1;
+	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Can find both? */
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val;
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* Delete second one. */
+	v = val + 1;
+	ok1(tdb_delete(tdb, key) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Re-add */
+	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Now, try deleting first one. */
+	v = val;
+	ok1(tdb_delete(tdb, key) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* Can still find second? */
+	v = val + 1;
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* Now, this will be ideally placed. */
+	v = val + 2;
+	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* This will collide with both. */
+	v = val;
+	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+
+	/* We can still find them all, right? */
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val + 1;
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val + 2;
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* And if we delete val + 1, that val + 2 should not move! */
+	v = val + 1;
+	ok1(tdb_delete(tdb, key) == 0);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	v = val;
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val + 2;
+	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* Delete those two (val + 2 first, then val), so we are empty. */
+	ok1(tdb_delete(tdb, key) == 0);
+	v = val;
+	ok1(tdb_delete(tdb, key) == 0);
+
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	uint64_t seed = 16014841315512641303ULL;
+	union tdb_attribute clash_hattr
+		= { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+			      .fn = clash } };
+	union tdb_attribute fixed_hattr
+		= { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+			      .fn = fixedhash,
+			      .data = &seed } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Deleting these two keys gave trouble before. */
+	int vals[] = { 755, 837 };
+
+	clash_hattr.base.next = &tap_log_attr;	/* chain the tap logger too */
+	fixed_hattr.base.next = &tap_log_attr;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-13-delete.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Check start of hash table. */
+		test_val(tdb, 0);
+
+		/* Check end of hash table. */
+		test_val(tdb, -1ULL);
+
+		/* Check mixed bitpattern. */
+		test_val(tdb, 0x123456789ABCDEF0ULL);
+
+		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+				   && tdb->file->num_lockrecs == 0));
+		tdb_close(tdb);
+
+		/* Deleting these entries in the db gave problems. */
+		tdb = tdb_open("run-13-delete.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		ok1(store_records(tdb));
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) {
+			struct tdb_data key;
+
+			key.dptr = (unsigned char *)&vals[j];
+			key.dsize = sizeof(vals[j]);
+			ok1(tdb_delete(tdb, key) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+		}
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-14-exists.c b/lib/tdb2/test/run-14-exists.c
new file mode 100644
index 0000000000..f264a6f2c9
--- /dev/null
+++ b/lib/tdb2/test/run-14-exists.c
@@ -0,0 +1,57 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+/* Check tdb_exists() before and after each of 1000 stores and deletes. */
+static bool test_records(struct tdb_context *tdb)
+{
+	/* Key and value both alias n, so bumping n selects the next record. */
+	int n;
+	struct tdb_data k = { (unsigned char *)&n, sizeof(n) };
+	struct tdb_data v = { (unsigned char *)&n, sizeof(n) };
+
+	/* Insert: absent before the store, present after it. */
+	for (n = 0; n < 1000; n++) {
+		if (tdb_exists(tdb, k)
+		    || tdb_store(tdb, k, v, TDB_REPLACE) != 0
+		    || !tdb_exists(tdb, k))
+			return false;
+	}
+
+	/* Remove: present before the delete, absent after it. */
+	n = 0;
+	while (n < 1000) {
+		if (!tdb_exists(tdb, k) || tdb_delete(tdb, k) != 0
+		    || tdb_exists(tdb, k))
+			return false;
+		n++;
+	}
+	return true;
+}
+/* Run test_records() once for every open-flag combination. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-14-exists.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue;	/* Open failed: nothing to test or close. */
+		ok1(test_records(tdb));
+		tdb_close(tdb);
+	}
+	ok1(tap_log_messages == 0);	/* Nothing may have been logged. */
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-15-append.c b/lib/tdb2/test/run-15-append.c
new file mode 100644
index 0000000000..d2f9ec6598
--- /dev/null
+++ b/lib/tdb2/test/run-15-append.c
@@ -0,0 +1,135 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <ccan/ilog/ilog.h>
+#include "logging.h"
+
+#define MAX_SIZE 13100	/* Bytes in the pattern buffer; upper bound on record size. */
+#define SIZE_STEP 131	/* The record grows by this many bytes per iteration. */
+
+static tdb_off_t tdb_offset(struct tdb_context *tdb, struct tdb_data key)
+{
+	tdb_off_t off;
+	struct tdb_used_record rec;
+	struct hash_info h;
+
+	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off))
+		return 0;
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	return off;
+}
+/* Grow one record step by step and check how often it has to move. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, moves;
+	struct tdb_context *tdb;
+	unsigned char *buffer = malloc(MAX_SIZE);
+	tdb_off_t oldoff = 0, newoff;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data;
+
+	if (!buffer)
+		abort();	/* Nothing can be tested without the pattern buffer. */
+	for (i = 0; i < MAX_SIZE; i++)
+		buffer[i] = i;	/* Truncated to a byte: repeating 0..255 pattern. */
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7)
+		   + 1);
+	/* Using tdb_store: replace the record with a longer one each time. */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-append.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		moves = 0;
+		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
+			data.dptr = buffer;
+			data.dsize = j;
+			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+			ok1(data.dsize == j);
+			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+			free(data.dptr);
+			newoff = tdb_offset(tdb, key);
+			if (newoff != oldoff)
+				moves++;
+			oldoff = newoff;
+		}
+		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+				   && tdb->file->num_lockrecs == 0));
+		/* We should increase by 50% each time... */
+		ok(moves <= ilog64(j / SIZE_STEP)*2, "Moved %u times", moves);
+		tdb_close(tdb);
+	}
+
+	/* Using tdb_append: add only the bytes beyond the previous length. */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		size_t prev_len = 0;
+		tdb = tdb_open("run-append.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		moves = 0;
+		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
+			data.dptr = buffer + prev_len;
+			data.dsize = j - prev_len;
+			ok1(tdb_append(tdb, key, data) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+			ok1(data.dsize == j);
+			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+			free(data.dptr);
+			prev_len = data.dsize;
+			newoff = tdb_offset(tdb, key);
+			if (newoff != oldoff)
+				moves++;
+			oldoff = newoff;
+		}
+		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+				   && tdb->file->num_lockrecs == 0));
+		/* We should increase by 50% each time... */
+		ok(moves <= ilog64(j / SIZE_STEP)*2, "Moved %u times", moves);
+		tdb_close(tdb);
+	}
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-append.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Huge initial store: append MAX_SIZE bytes to a fresh database. */
+		data.dptr = buffer;
+		data.dsize = MAX_SIZE;
+		ok1(tdb_append(tdb, key, data) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+		ok1(data.dsize == MAX_SIZE);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+				   && tdb->file->num_lockrecs == 0));
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	free(buffer);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-16-wipe_all.c b/lib/tdb2/test/run-16-wipe_all.c
new file mode 100644
index 0000000000..d9c5128e0b
--- /dev/null
+++ b/lib/tdb2/test/run-16-wipe_all.c
@@ -0,0 +1,50 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static bool add_records(struct tdb_context *tdb)
+{
+	/* Key and value both alias n: record n maps the int n to itself. */
+	int n = 0;
+	struct tdb_data k = { (unsigned char *)&n, sizeof(n) };
+	struct tdb_data v = { (unsigned char *)&n, sizeof(n) };
+
+	while (n < 1000 && tdb_store(tdb, k, v, TDB_REPLACE) == 0)
+		n++;
+	/* Stopping short of 1000 means one of the stores failed. */
+	return n == 1000;
+}
+
+/* Fill, wipe and re-scan a database under every open-flag combination. */
+int main(int argc, char *argv[])
+{
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_context *tdb;
+	struct tdb_data key;
+	unsigned int f;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
+	for (f = 0; f < sizeof(flags) / sizeof(flags[0]); f++) {
+		tdb = tdb_open("run-16-wipe_all.tdb", flags[f],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue;
+		ok1(add_records(tdb));
+		ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
+		/* After the wipe there must be no first key to find. */
+		ok1(tdb_firstkey(tdb, &key) == TDB_ERR_NOEXIST);
+		tdb_close(tdb);
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-20-growhash.c b/lib/tdb2/test/run-20-growhash.c
new file mode 100644
index 0000000000..22a88c4504
--- /dev/null
+++ b/lib/tdb2/test/run-20-growhash.c
@@ -0,0 +1,144 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+/* Test hash: reads the key as a uint64_t and returns it unchanged. */
+static uint64_t myhash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	return *(const uint64_t *)key;
+}
+/* OR new into *val so it ends new_bits past the *done bits already used. */
+static void add_bits(uint64_t *val, unsigned new, unsigned new_bits,
+		     unsigned *done)
+{
+	unsigned shift = 64 - (*done += new_bits);
+	*val = *val | ((uint64_t)new << shift);
+}
+/* Pack group/bucket numbers into a 64-bit value, top level first. */
+static uint64_t make_key(unsigned topgroup, unsigned topbucket,
+			 unsigned subgroup1, unsigned subbucket1,
+			 unsigned subgroup2, unsigned subbucket2)
+{
+	const unsigned group_bits = TDB_HASH_GROUP_BITS;
+	const unsigned top_bits = TDB_TOPLEVEL_HASH_BITS - group_bits;
+	const unsigned sub_bits = TDB_SUBLEVEL_HASH_BITS - group_bits;
+	unsigned used = 0;
+	uint64_t packed = 0;
+
+	add_bits(&packed, topgroup, top_bits, &used);
+	add_bits(&packed, topbucket, group_bits, &used);
+	add_bits(&packed, subgroup1, sub_bits, &used);
+	add_bits(&packed, subbucket1, group_bits, &used);
+	add_bits(&packed, subgroup2, sub_bits, &used);
+	add_bits(&packed, subbucket2, group_bits, &used);
+	return packed;
+}
+/* Check lookups report deeper hash levels as one group is overfilled. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	uint64_t kdata;
+	struct tdb_used_record rec;
+	struct tdb_data key = { (unsigned char *)&kdata, sizeof(kdata) };
+	struct tdb_data dbuf = { (unsigned char *)&kdata, sizeof(kdata) };
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.fn = myhash } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT,
+	};
+
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (9 + (20 + 2 * ((1 << TDB_HASH_GROUP_BITS) - 2))
+		      * (1 << TDB_HASH_GROUP_BITS)) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct hash_info h;
+		/* Use this test's own file name, not run-04-basichash's. */
+		tdb = tdb_open("run-20-growhash.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Fill a group: one key per top-level bucket of group 0. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			kdata = make_key(0, j, 0, 0, 0, 0);
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Check first still exists. */
+		kdata = make_key(0, 0, 0, 0, 0, 0);
+		ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL) != 0);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 0. */
+		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+		/* Entire group should be full! */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
+			ok1(h.group[j] != 0);
+
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_RDLCK) == 0);
+
+		/* Now, add one more to each should expand (that) bucket. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			unsigned int k;
+			kdata = make_key(0, j, 0, 1, 0, 0);
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
+			/* Should have created correct hash. */
+			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+			/* Should have moved to subhash */
+			ok1(h.group_start >= sizeof(struct tdb_header));
+			ok1(h.home_bucket == 1);
+			ok1(h.found_bucket == 1);
+			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+			    + TDB_SUBLEVEL_HASH_BITS);
+			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+					      F_RDLCK) == 0);
+
+			/* Keep adding, make it expand again. */
+			for (k = 2; k < (1 << TDB_HASH_GROUP_BITS); k++) {
+				kdata = make_key(0, j, 0, k, 0, 0);
+				ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+				ok1(tdb_check(tdb, NULL, NULL) == 0);
+			}
+
+			/* This should tip it over to sub-sub-hash. */
+			kdata = make_key(0, j, 0, 0, 0, 1);
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
+			/* Should have created correct hash. */
+			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+			/* Should have moved to sub-sub-hash (two sublevels used). */
+			ok1(h.group_start >= sizeof(struct tdb_header));
+			ok1(h.home_bucket == 1);
+			ok1(h.found_bucket == 1);
+			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+			    + TDB_SUBLEVEL_HASH_BITS + TDB_SUBLEVEL_HASH_BITS);
+			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+					      F_RDLCK) == 0);
+		}
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-21-parse_record.c b/lib/tdb2/test/run-21-parse_record.c
new file mode 100644
index 0000000000..773cdff4e0
--- /dev/null
+++ b/lib/tdb2/test/run-21-parse_record.c
@@ -0,0 +1,70 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static enum TDB_ERROR parse(TDB_DATA key, TDB_DATA data, TDB_DATA *expected)
+{
+	/* key is unused; only the stored value is compared. */
+	bool matches = tdb_deq(data, *expected);
+	return matches ? TDB_SUCCESS : TDB_ERR_EINVAL;
+}
+/* Parse callback that ignores its arguments and always returns 100. */
+static enum TDB_ERROR parse_err(TDB_DATA key, TDB_DATA data, void *unused)
+{
+	return 100;
+}
+
+static bool test_records(struct tdb_context *tdb)
+{
+	/* Key and value both alias n, so n selects the record in use. */
+	int n;
+	struct tdb_data k = { (unsigned char *)&n, sizeof(n) };
+	struct tdb_data v = { (unsigned char *)&n, sizeof(n) };
+
+	/* Populate keys 0..999, each mapping to itself. */
+	for (n = 0; n < 1000; n++)
+		if (tdb_store(tdb, k, v, TDB_REPLACE) != 0)
+			return false;
+
+	/* Every stored record must parse back to its own value. */
+	for (n = 0; n < 1000; n++)
+		if (tdb_parse_record(tdb, k, parse, &v) != TDB_SUCCESS)
+			return false;
+
+	/* n is now 1000, a key that was never stored. */
+	if (tdb_parse_record(tdb, k, parse, &v) != TDB_ERR_NOEXIST)
+		return false;
+
+	/* A callback's own return value (here 100, from parse_err) must be
+	 * what tdb_parse_record() hands back. */
+	n = 0;
+	return tdb_parse_record(tdb, k, parse_err, NULL) == 100;
+}
+/* Run test_records() once for every open-flag combination. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-21-parse_record.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue;	/* Open failed: nothing to test or close. */
+		ok1(test_records(tdb));
+		tdb_close(tdb);
+	}
+	ok1(tap_log_messages == 0);	/* Nothing may have been logged. */
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-25-hashoverload.c b/lib/tdb2/test/run-25-hashoverload.c
new file mode 100644
index 0000000000..83f549d6b2
--- /dev/null
+++ b/lib/tdb2/test/run-25-hashoverload.c
@@ -0,0 +1,121 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+/* Degenerate hash: returns 0 for every key, so all keys share one hash. */
+static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	return 0;
+}
+
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
+{
+	/* A non-NULL p asks for each visited record to be deleted;
+	 * otherwise the record is left alone and 0 is returned. */
+	return p ? tdb_delete(tdb, key) : 0;
+}
+/* Store, fetch, traverse and delete many records that all share hash 0. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
+	struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.fn = badhash } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT,
+	};
+	/* key and dbuf both alias j, so j selects the record in use. */
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(6883);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+
+		tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Fill a group: store (1 << TDB_HASH_GROUP_BITS) records. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now store one last value (key j, left by the loop): should form chain. */
+		ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Check we can find them all. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
+			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+			free(d.dptr);
+		}
+
+		/* Now add a *lot* more. */
+		for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
+		     j < (16 << TDB_HASH_GROUP_BITS);
+		     j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+			free(d.dptr);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Traverse through them: j now equals the number of records stored. */
+		ok1(tdb_traverse(tdb, trav, NULL) == j);
+
+		/* Empty the first chain-worth. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
+			ok1(tdb_delete(tdb, key) == 0);
+
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		/* The records that were not deleted must still be fetchable. */
+		for (j = (1 << TDB_HASH_GROUP_BITS);
+		     j < (16 << TDB_HASH_GROUP_BITS);
+		     j++) {
+			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+			free(d.dptr);
+		}
+
+		/* Traverse through them: one group's worth fewer than before. */
+		ok1(tdb_traverse(tdb, trav, NULL)
+		    == (15 << TDB_HASH_GROUP_BITS));
+
+		/* Re-add */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now try deleting as we go: a non-NULL p makes trav() delete. */
+		ok1(tdb_traverse(tdb, trav, trav)
+		    == (16 << TDB_HASH_GROUP_BITS));
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb_traverse(tdb, trav, NULL) == 0);
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-30-exhaust-before-expand.c b/lib/tdb2/test/run-30-exhaust-before-expand.c
new file mode 100644
index 0000000000..2386f85f26
--- /dev/null
+++ b/lib/tdb2/test/run-30-exhaust-before-expand.c
@@ -0,0 +1,79 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <err.h>
+#include "logging.h"
+/* True if no bucket of the free table at tdb->ftable_off holds an entry. */
+static bool empty_freetable(struct tdb_context *tdb)
+{
+	struct tdb_freetable table;
+	const unsigned int nbuckets
+		= sizeof(table.buckets) / sizeof(table.buckets[0]);
+	unsigned int b = 0;
+
+	/* Load the current free table; being unable to read it is fatal. */
+	if (tdb_read_convert(tdb, tdb->ftable_off, &table, sizeof(table)) != 0)
+		abort();
+	/* Walk past empty buckets; reaching the end means all were empty. */
+	while (b < nbuckets && !table.buckets[b])
+		b++;
+	return b == nbuckets;
+}
+
+/* Check the free table is used up before the database file grows. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		TDB_DATA k;
+		uint64_t size;
+		bool was_empty = false;
+
+		k.dptr = (void *)&j;	/* Key and value both alias j. */
+		k.dsize = sizeof(j);
+
+		tdb = tdb_open("run-30-exhaust-before-expand.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		ok1(empty_freetable(tdb));
+		/* Need some hash lock for expand. */
+		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
+		/* Create some free space. */
+		ok1(tdb_expand(tdb, 1) == 0);
+		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(!empty_freetable(tdb));
+
+		size = tdb->file->map_size;
+		/* Insert minimal-length records until we expand. */
+		for (j = 0; tdb->file->map_size == size; j++) {
+			was_empty = empty_freetable(tdb);
+			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
+				err(1, "Failed to store record %u", j);
+		}
+
+		/* Would have been empty before expansion, but no longer. */
+		ok1(was_empty);
+		ok1(!empty_freetable(tdb));
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-50-multiple-freelists.c b/lib/tdb2/test/run-50-multiple-freelists.c
new file mode 100644
index 0000000000..7a48c3e0ee
--- /dev/null
+++ b/lib/tdb2/test/run-50-multiple-freelists.c
@@ -0,0 +1,71 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/tdb2/transaction.c>
+#include "logging.h"
+#include "layout.h"
+/* Exercise get_free() on a hand-built layout that has three free tables. */
+int main(int argc, char *argv[])
+{
+	tdb_off_t off;
+	struct tdb_context *tdb;
+	struct tdb_layout *layout;
+	TDB_DATA key, data;
+
+	plan_tests(11);
+	key = tdb_mkdata("Hello", 5);
+	data = tdb_mkdata("world", 5);
+
+	/* Create a TDB with three free tables (layout elements 0, 1 and 2). */
+	layout = new_tdb_layout(NULL);
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_free(layout, 80, 0);
+	/* A used record after each free record prevents coalescing. */
+	tdb_layout_add_used(layout, key, data, 6);
+	tdb_layout_add_free(layout, 160, 1);
+	key.dsize--;
+	tdb_layout_add_used(layout, key, data, 7);
+	tdb_layout_add_free(layout, 320, 2);
+	key.dsize--;
+	tdb_layout_add_used(layout, key, data, 8);
+	tdb_layout_add_free(layout, 40, 0);
+	tdb = tdb_layout_get(layout);
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+	/* Expect the 80-byte free record (elem[3]) and free table elem[0]. */
+	off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[3].base.off);
+	ok1(tdb->ftable_off == layout->elem[0].base.off);
+	/* Expect the 160-byte free record (elem[5]) and free table elem[1]. */
+	off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[5].base.off);
+	ok1(tdb->ftable_off == layout->elem[1].base.off);
+	/* Expect the 320-byte free record (elem[7]) and free table elem[2]. */
+	off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[7].base.off);
+	ok1(tdb->ftable_off == layout->elem[2].base.off);
+	/* Expect the 40-byte free record (elem[9]) and free table elem[0]. */
+	off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[9].base.off);
+	ok1(tdb->ftable_off == layout->elem[0].base.off);
+
+	/* Now we fail: every free record has been handed out, so expect 0. */
+	off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
+	ok1(off == 0);
+
+	tdb_close(tdb);
+	tdb_layout_free(layout);
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-55-transaction.c b/lib/tdb2/test/run-55-transaction.c
new file mode 100644
index 0000000000..1650e40e1f
--- /dev/null
+++ b/lib/tdb2/test/run-55-transaction.c
@@ -0,0 +1,75 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+/* Check a cancelled transaction stores nothing and a committed one does. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	unsigned char *buffer = malloc(1000);
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data;
+
+	if (!buffer)
+		abort();	/* Nothing can be tested without the pattern buffer. */
+	for (i = 0; i < 1000; i++)
+		buffer[i] = i;	/* Truncated to a byte: repeating 0..255 pattern. */
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-55-transaction.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		ok1(tdb_transaction_start(tdb) == 0);
+		data.dptr = buffer;
+		data.dsize = 1000;
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+		ok1(data.dsize == 1000);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+
+		/* Cancelling a transaction means no store */
+		tdb_transaction_cancel(tdb);
+		ok1(tdb->file->allrecord_lock.count == 0
+		    && tdb->file->num_lockrecs == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST);
+
+		/* Commit the transaction. */
+		ok1(tdb_transaction_start(tdb) == 0);
+		data.dptr = buffer;
+		data.dsize = 1000;
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+		ok1(data.dsize == 1000);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+		ok1(tdb_transaction_commit(tdb) == 0);
+		ok1(tdb->file->allrecord_lock.count == 0
+		    && tdb->file->num_lockrecs == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+		ok1(data.dsize == 1000);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	free(buffer);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-56-open-during-transaction.c b/lib/tdb2/test/run-56-open-during-transaction.c
new file mode 100644
index 0000000000..96107d637e
--- /dev/null
+++ b/lib/tdb2/test/run-56-open-during-transaction.c
@@ -0,0 +1,175 @@
+#include "config.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+/* Checking wrappers; defined further down, after the tdb2 sources. */
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+/* Redirect these calls, as made by the sources included below. */
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include "external-agent.h"
+#include "logging.h"
+/* State shared by main() and the checking wrappers below. */
+static struct agent *agent;	/* Set by main() from prepare_external_agent(). */
+static bool opened;		/* True while main() wants file checks to run. */
+static int errors = 0;		/* Bumped by check_file_intact() on each problem. */
+#define TEST_DBNAME "run-56-open-during-transaction.tdb"
+/* Past this point the plain names are no longer redirected. */
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+static bool is_same(const char *snapshot, const char *latest, off_t len)
+{
+	/*
+	 * Byte-wise comparison of the first len bytes of both buffers.
+	 * A non-positive len compares nothing and counts as equal; the
+	 * explicit check also keeps a negative len from being converted
+	 * to a huge size_t.
+	 */
+	return len <= 0 || memcmp(snapshot, latest, (size_t)len) == 0;
+}
+
+static bool compare_file(int fd, const char *snapshot, off_t snapshot_len)
+{
+	/* Ask for one byte more than the snapshot holds, so the amount
+	 * actually read doubles as a check of the file's length. */
+	char *now = malloc(snapshot_len+1);
+	bool unchanged = false;
+
+	if (pread(fd, now, snapshot_len+1, 0) == snapshot_len)
+		unchanged = is_same(snapshot, now, snapshot_len);
+	free(now);
+	return unchanged;
+}
+/* Snapshot fd, let the agent open the db, and verify nothing changed. */
+static void check_file_intact(int fd)
+{
+	enum agent_return ret;
+	struct stat st;
+	char *contents;
+
+	fstat(fd, &st);
+	contents = malloc(st.st_size);
+	if (pread(fd, contents, st.st_size, 0) != st.st_size) {
+		diag("Read fail");
+		errors++;
+		free(contents);	/* Don't leak the snapshot on this early exit. */
+		return;
+	}
+
+	/* Ask agent to open file. */
+	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+
+	/* It's OK to open it, but it must not have changed! */
+	if (!compare_file(fd, contents, st.st_size)) {
+		diag("Agent changed file after opening %s",
+		     agent_return_name(ret));
+		errors++;
+	}
+
+	if (ret == SUCCESS) {
+		ret = external_agent_operation(agent, CLOSE, NULL);
+		if (ret != SUCCESS) {
+			diag("Agent failed to close tdb: %s",
+			     agent_return_name(ret));
+			errors++;
+		}
+	} else if (ret != WOULD_HAVE_BLOCKED) {
+		diag("Agent opening file gave %s",
+		     agent_return_name(ret));
+		errors++;
+	}
+	free(contents);
+}
+/* Installed by main() as unlock_callback: check the file if armed. */
+static void after_unlock(int fd)
+{
+	if (opened)
+		check_file_intact(fd);
+}
+/* pwrite() stand-in for the tdb2 code: check the file first, if armed. */
+static ssize_t pwrite_check(int fd,
+			    const void *buf, size_t count, off_t offset)
+{
+	if (opened)
+		check_file_intact(fd);
+	/* pwrite was #undef'd above, so this is not the wrapper itself. */
+	return pwrite(fd, buf, count, offset);
+}
+/* write() stand-in for the tdb2 code: check the file first, if armed. */
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+	if (opened)
+		check_file_intact(fd);
+	/* write was #undef'd above, so this is not the wrapper itself. */
+	return write(fd, buf, count);
+}
+/* ftruncate() stand-in for the tdb2 code: check the file first, if armed. */
+static int ftruncate_check(int fd, off_t length)
+{
+	if (opened)
+		check_file_intact(fd);
+	/* ftruncate was #undef'd above, so this is not the wrapper itself. */
+	return ftruncate(fd, length);
+
+}
+/* Have an external agent try to open the db at each step of a commit. */
+int main(int argc, char *argv[])
+{
+	const int flags[] = { TDB_DEFAULT,
+			      TDB_NOMMAP,
+			      TDB_CONVERT,
+			      TDB_CONVERT | TDB_NOMMAP };
+	unsigned int i;
+	struct tdb_context *tdb;
+	TDB_DATA key, data;
+
+	plan_tests(20);
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+
+	unlock_callback = after_unlock;
+	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
+		diag("Test with %s and %s\n",
+		     (flags[i] & TDB_CONVERT) ? "CONVERT" : "DEFAULT",
+		     (flags[i] & TDB_NOMMAP) ? "no mmap" : "mmap");
+		unlink(TEST_DBNAME);
+		tdb = tdb_open(TEST_DBNAME, flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;	/* Already reported; nothing to exercise. */
+		opened = true;		/* Arm the checking wrappers. */
+		ok1(tdb_transaction_start(tdb) == 0);
+		key = tdb_mkdata("hi", strlen("hi"));
+		data = tdb_mkdata("world", strlen("world"));
+
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		ok1(tdb_transaction_commit(tdb) == 0);
+		ok(!errors, "We had %d open errors", errors);
+		opened = false;		/* Disarm before closing. */
+		tdb_close(tdb);
+	}
+
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-57-die-during-transaction.c b/lib/tdb2/test/run-57-die-during-transaction.c
new file mode 100644
index 0000000000..84f01eb21a
--- /dev/null
+++ b/lib/tdb2/test/run-57-die-during-transaction.c
@@ -0,0 +1,275 @@
+#include "config.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <assert.h>
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+/* There's a malloc inside transaction_setup_recovery, and valgrind complains
+ * when we longjmp and leak it. */
+#define MAX_ALLOCATIONS 200
+static void *allocated[MAX_ALLOCATIONS];
+
+static void *malloc_noleak(size_t len)
+{
+	unsigned int slot = 0;
+
+	/* Skip occupied slots; a NULL entry is free to record this block. */
+	while (slot < MAX_ALLOCATIONS && allocated[slot])
+		slot++;
+	if (slot < MAX_ALLOCATIONS)
+		return allocated[slot] = malloc(len);
+	diag("Too many allocations!");
+	abort();
+}
+
+static void free_noleak(void *p)
+{
+	unsigned int slot = 0;
+
+	/* Blocks obtained via realloc are never recorded, so the search
+	 * may legitimately come up empty; that is not an error. */
+	while (slot < MAX_ALLOCATIONS && allocated[slot] != p)
+		slot++;
+	if (slot < MAX_ALLOCATIONS)
+		allocated[slot] = NULL;
+
+	free(p);
+}
+
+static void free_all(void)
+{
+	void **slot;
+
+	for (slot = allocated; slot < allocated + MAX_ALLOCATIONS; slot++) {
+		free(*slot);	/* free(NULL) is a no-op for empty slots */
+		*slot = NULL;
+	}
+}
+
+#define malloc malloc_noleak
+#define free free_noleak
+
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#undef malloc
+#undef free
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include <setjmp.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static bool in_transaction;
+static int target, current;
+static jmp_buf jmpbuf;
+#define TEST_DBNAME "run-57-die-during-transaction.tdb"
+#define KEY_STRING "helloworld"
+
+static void maybe_die(int fd)	/* fd is not used */
+{
+	if (in_transaction && current++ == target) {	/* counts only in-transaction steps */
+		longjmp(jmpbuf, 1);	/* lands in the setjmp() branch of test_death() */
+	}
+}
+
+static ssize_t pwrite_check(int fd,
+			    const void *buf, size_t count, off_t offset)
+{
+	ssize_t written;
+
+	/* Possible death point before anything reaches the file. */
+	maybe_die(fd);
+	written = pwrite(fd, buf, count, offset);
+
+	/* A second death point, but only once the whole write landed. */
+	if (written == count)
+		maybe_die(fd);
+	return written;
+}
+
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+	ssize_t written;
+
+	/* Possible death point before anything reaches the file. */
+	maybe_die(fd);
+	written = write(fd, buf, count);
+
+	/* A second death point, but only once the whole write landed. */
+	if (written == count)
+		maybe_die(fd);
+	return written;
+}
+
+static int ftruncate_check(int fd, off_t length)
+{
+	int rc;
+
+	/* Death points sit on both sides of the resize; unlike the write
+	 * wrappers, the trailing one is reached even if ftruncate() fails. */
+	maybe_die(fd);
+	rc = ftruncate(fd, length);
+	maybe_die(fd);
+	return rc;
+}
+
+static bool test_death(enum operation op, struct agent *agent)
+{
+	struct tdb_context *tdb = NULL;
+	TDB_DATA key;
+	enum agent_return ret;
+	int needed_recovery = 0;
+	/* Die one step later on each rerun until the transaction completes. */
+	current = target = 0;
+reset:
+	unlink(TEST_DBNAME);
+	tdb = tdb_open(TEST_DBNAME, TDB_NOMMAP,
+		       O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr);
+	if (!tdb) {
+		diag("Failed opening TDB: %s", strerror(errno));
+		return false;
+	}
+
+	if (setjmp(jmpbuf) != 0) {
+		/* maybe_die() jumped here: we're partway through.  Simulate our death. */
+		close(tdb->file->fd);
+		forget_locking();
+		in_transaction = false;
+
+		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+		if (ret == SUCCESS)
+			needed_recovery++;
+		else if (ret != FAILED) {
+			diag("Step %u agent NEEDS_RECOVERY = %s", current,
+			     agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, op, KEY_STRING);
+		if (ret != SUCCESS) {
+			diag("Step %u op %s failed = %s", current,
+			     operation_name(op),
+			     agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+		if (ret != FAILED) {
+			diag("Still needs recovery after step %u = %s",
+			     current, agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, CHECK, "");
+		if (ret != SUCCESS) {
+			diag("Step %u check failed = %s", current,
+			     agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, CLOSE, "");
+		if (ret != SUCCESS) {
+			diag("Step %u close failed = %s", current,
+			     agent_return_name(ret));
+			return false;
+		}
+
+		/* Suppress logging as this tries to use closed fd. */
+		suppress_logging = true;
+		suppress_lockcheck = true;
+		tdb_close(tdb);
+		suppress_logging = false;
+		suppress_lockcheck = false;
+		target++;	/* the next run dies one step later */
+		current = 0;
+		free_all();	/* release blocks the aborted run left recorded */
+		goto reset;
+	}
+
+	/* Put key for agent to fetch. */
+	key = tdb_mkdata(KEY_STRING, strlen(KEY_STRING));
+	if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
+		return false;
+
+	/* This is the key we insert in transaction. */
+	key.dsize--;
+
+	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+	if (ret != SUCCESS)
+		errx(1, "Agent failed to open: %s", agent_return_name(ret));
+
+	ret = external_agent_operation(agent, FETCH, KEY_STRING);
+	if (ret != SUCCESS)
+		errx(1, "Agent failed find key: %s", agent_return_name(ret));
+
+	in_transaction = true;	/* from here on maybe_die() counts steps */
+	if (tdb_transaction_start(tdb) != 0)
+		return false;
+
+	if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
+		return false;
+
+	if (tdb_transaction_commit(tdb) != 0)
+		return false;
+
+	in_transaction = false;
+
+	/* We made it! */
+	diag("Completed %u runs", current);
+	tdb_close(tdb);
+	ret = external_agent_operation(agent, CLOSE, "");
+	if (ret != SUCCESS) {
+		diag("Step %u close failed = %s", current,
+		     agent_return_name(ret));
+		return false;
+	}
+
+	ok1(needed_recovery);	/* the agent reported NEEDS_RECOVERY at least once */
+	ok1(locking_errors == 0);
+	ok1(forget_locking() == 0);
+	locking_errors = 0;
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
+	struct agent *agent;
+	size_t n;
+
+	unlock_callback = maybe_die;
+	plan_tests(12);
+
+	agent = prepare_external_agent();
+	if (agent == NULL)
+		err(1, "preparing agent");
+
+	for (n = 0; n < sizeof(ops)/sizeof(ops[0]); n++) {
+		diag("Testing %s after death", operation_name(ops[n]));
+		ok1(test_death(ops[n], agent));
+	}
+
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-64-bit-tdb.c b/lib/tdb2/test/run-64-bit-tdb.c
new file mode 100644
index 0000000000..78dadca016
--- /dev/null
+++ b/lib/tdb2/test/run-64-bit-tdb.c
@@ -0,0 +1,80 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Offsets past 4G need an off_t wider than 32 bits; else just pass. */
+	if (sizeof(off_t) <= 4) {
+		plan_tests(1);
+		pass("No 64 bit off_t");
+		return exit_status();
+	}
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		off_t old_size;
+		TDB_DATA k, d;
+		struct hash_info h;
+		struct tdb_used_record rec;
+		tdb_off_t off;
+
+		tdb = tdb_open("run-64-bit-tdb.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		old_size = tdb->file->map_size;
+
+		/* This makes a sparse file of 0xFFFFFFF0 bytes (just under 4G). */
+		ok1(ftruncate(tdb->file->fd, 0xFFFFFFF0) == 0);
+		ok1(add_free_record(tdb, old_size, 0xFFFFFFF0 - old_size,
+				    TDB_LOCK_WAIT, false) == TDB_SUCCESS);
+
+		/* Now add a little record past the 4G barrier. */
+		ok1(tdb_expand_file(tdb, 100) == TDB_SUCCESS);
+		ok1(add_free_record(tdb, 0xFFFFFFF0, 100, TDB_LOCK_WAIT, false)
+		    == TDB_SUCCESS);
+
+		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
+
+		/* Test allocation path (sizes 4 and 5 include the NUL bytes). */
+		k = tdb_mkdata("key", 4);
+		d = tdb_mkdata("data", 5);
+		ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
+
+		/* Make sure it put it at end as we expected. */
+		off = find_and_lock(tdb, k, F_RDLCK, &h, &rec, NULL);
+		ok1(off >= 0xFFFFFFF0);
+		tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+
+		ok1(tdb_fetch(tdb, k, &d) == 0);
+		ok1(d.dsize == 5);
+		ok1(strcmp((char *)d.dptr, "data") == 0);
+		free(d.dptr);	/* NOTE(review): unguarded; confirm d is heap data if fetch fails */
+
+		ok1(tdb_delete(tdb, k) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
+
+		tdb_close(tdb);
+	}
+
+	/* We might get messages about mmap failing, so don't test
+	 * tap_log_messages */
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-80-tdb_fd.c b/lib/tdb2/test/run-80-tdb_fd.c
new file mode 100644
index 0000000000..e8b2fae2dd
--- /dev/null
+++ b/lib/tdb2/test/run-80-tdb_fd.c
@@ -0,0 +1,35 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Scratch file is named after this test so O_TRUNC hits no other test. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-80-tdb_fd.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue;
+
+		if (flags[i] & TDB_INTERNAL)
+			ok1(tdb_fd(tdb) == -1);	/* expected: no descriptor */
+		else
+			ok1(tdb_fd(tdb) > 2);	/* expected: above stdin/stdout/stderr */
+		tdb_close(tdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-81-seqnum.c b/lib/tdb2/test/run-81-seqnum.c
new file mode 100644
index 0000000000..6e8b2698b6
--- /dev/null
+++ b/lib/tdb2/test/run-81-seqnum.c
@@ -0,0 +1,71 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Scratch file is named after this test so O_TRUNC hits no other test. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 4 * 13);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-81-seqnum.tdb", flags[i]|TDB_SEQNUM,
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue;
+
+		ok1(tdb_get_seqnum(tdb) == 0);
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		ok1(tdb_get_seqnum(tdb) == 1);
+		/* Fetch doesn't change seqnum */
+		if (ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
+			free(d.dptr);
+		ok1(tdb_get_seqnum(tdb) == 1);
+		ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
+		ok1(tdb_get_seqnum(tdb) == 2);
+
+		ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
+		ok1(tdb_get_seqnum(tdb) == 3);
+		/* Empty append works */
+		ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
+		ok1(tdb_get_seqnum(tdb) == 4);
+
+		ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
+		ok1(tdb_get_seqnum(tdb) == 5);
+
+		if (!(flags[i] & TDB_INTERNAL)) {	/* 13 transaction checks, file-backed only */
+			ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
+			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+			ok1(tdb_get_seqnum(tdb) == 6);
+			ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
+			ok1(tdb_get_seqnum(tdb) == 7);
+			ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
+			ok1(tdb_get_seqnum(tdb) == 8);
+			ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS);
+			ok1(tdb_get_seqnum(tdb) == 8);
+
+			ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
+			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+			ok1(tdb_get_seqnum(tdb) == 9);
+			tdb_transaction_cancel(tdb);
+			ok1(tdb_get_seqnum(tdb) == 8);	/* cancel is expected to undo the bump */
+		}
+		tdb_close(tdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-82-lockattr.c b/lib/tdb2/test/run-82-lockattr.c
new file mode 100644
index 0000000000..bfc2653222
--- /dev/null
+++ b/lib/tdb2/test/run-82-lockattr.c
@@ -0,0 +1,263 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		  void *_err)
+{
+	const int *forced_errno = _err;
+	const int cmd = waitflag ? F_SETLKW : F_SETLK;
+	struct flock fl;
+	int ret;
+
+	/* A nonzero value behind _err makes every lock attempt fail. */
+	if (*forced_errno != 0) {
+		errno = *forced_errno;
+		return -1;
+	}
+
+	for (;;) {
+		/* Refill the request on every attempt, retrying on EINTR. */
+		fl.l_type = rw;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+
+		ret = fcntl(fd, cmd, &fl);
+		if (ret == 0 || errno != EINTR)
+			return ret;
+	}
+}
+
+static int myunlock(int fd, int rw, off_t off, off_t len, void *_err)
+{
+	const int *forced_errno = _err;
+	struct flock fl;
+	int ret;
+
+	/* A nonzero value behind _err makes every unlock attempt fail. */
+	if (*forced_errno != 0) {
+		errno = *forced_errno;
+		return -1;
+	}
+
+	for (;;) {
+		fl.l_type = F_UNLCK;	/* rw plays no part in the request */
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+		ret = fcntl(fd, F_SETLKW, &fl);
+		if (ret == 0 || errno != EINTR)
+			return ret;
+	}
+}
+
+static int trav_err;	/* value trav() writes through its err argument */
+static int trav(struct tdb_context *tdb, TDB_DATA k, TDB_DATA d, int *err)
+{
+	*err = trav_err;	/* main() passes &lock_err, which mylock/myunlock read */
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	union tdb_attribute lock_attr;
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+	int lock_err;
+	/* mylock/myunlock fail with errno = lock_err whenever it is nonzero. */
+	lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+	lock_attr.base.next = &tap_log_attr;
+	lock_attr.flock.lock = mylock;
+	lock_attr.flock.unlock = myunlock;
+	lock_attr.flock.data = &lock_err;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 80);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct tdb_data d;
+
+		/* Nonblocking open; expect no error message. */
+		lock_err = EAGAIN;
+		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		ok(errno == lock_err, "Errno is %u", errno);
+		ok1(!tdb);
+		ok1(tap_log_messages == 0);
+
+		lock_err = EINTR;
+		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		ok(errno == lock_err, "Errno is %u", errno);
+		ok1(!tdb);
+		ok1(tap_log_messages == 0);
+
+		/* Forced fail open. */
+		lock_err = ENOMEM;
+		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		ok1(errno == lock_err);
+		ok1(!tdb);
+		ok1(tap_log_messages == 1);	/* unlike EAGAIN/EINTR, ENOMEM is expected to log */
+		tap_log_messages = 0;
+
+		lock_err = 0;
+		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		if (!ok1(tdb))
+			continue;
+		ok1(tap_log_messages == 0);
+
+		/* Nonblocking store. */
+		lock_err = EAGAIN;
+		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking fetch. */
+		lock_err = EAGAIN;
+		ok1(!tdb_exists(tdb, key));
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(!tdb_exists(tdb, key));
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(!tdb_exists(tdb, key));
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking delete. */
+		lock_err = EAGAIN;
+		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking locks. */
+		lock_err = EAGAIN;
+		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
+		/* This actually does divide and conquer, so only check for "some". */
+		ok1(tap_log_messages > 0);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages > 0);
+		tap_log_messages = 0;
+
+		/* Nonblocking traverse; trav() sets lock_err partway through. */
+		lock_err = 0;
+		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+		trav_err = EAGAIN;
+		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		trav_err = EINTR;
+		lock_err = 0;
+		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		trav_err = ENOMEM;
+		lock_err = 0;
+		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking transactions. */
+		lock_err = EAGAIN;
+		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking transaction prepare. */
+		lock_err = 0;
+		ok1(tdb_transaction_start(tdb) == 0);
+		ok1(tdb_delete(tdb, key) == 0);
+
+		lock_err = EAGAIN;
+		ok1(tdb_transaction_prepare_commit(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+
+		lock_err = 0;	/* locking works again: the retry must succeed */
+		ok1(tdb_transaction_prepare_commit(tdb) == 0);
+		ok1(tdb_transaction_commit(tdb) == 0);
+
+		/* And the transaction was committed, right? */
+		ok1(!tdb_exists(tdb, key));
+		tdb_close(tdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-83-openhook.c b/lib/tdb2/test/run-83-openhook.c
new file mode 100644
index 0000000000..320be7d4da
--- /dev/null
+++ b/lib/tdb2/test/run-83-openhook.c
@@ -0,0 +1,98 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static enum TDB_ERROR clear_if_first(int fd, void *arg)
+{
+/* Every opener keeps a lock on byte 63, so we can tell if anyone holds it. */
+	struct flock fl;
+	/* main() passes this function itself as the hook data; verify it. */
+	if (arg != clear_if_first)
+		return TDB_ERR_CORRUPT;
+
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_start = 63;
+	fl.l_len = 1;
+	/* Non-blocking write-lock attempt on that single byte. */
+	if (fcntl(fd, F_SETLK, &fl) == 0) {
+		/* We must be first ones to open it! */
+		diag("truncating file!");
+		if (ftruncate(fd, 0) != 0) {
+			return TDB_ERR_IO;
+		}
+	}
+	fl.l_type = F_RDLCK;	/* now take (or convert to) a read lock, waiting if needed */
+	if (fcntl(fd, F_SETLKW, &fl) != 0) {
+		return TDB_ERR_IO;
+	}
+	return TDB_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	struct agent *agent;
+	union tdb_attribute cif;
+	struct tdb_data key = tdb_mkdata("key", 3);
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+
+	cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
+	cif.openhook.base.next = &tap_log_attr;
+	cif.openhook.fn = clear_if_first;
+	cif.openhook.data = clear_if_first;
+
+	agent = prepare_external_agent();
+	if (!agent)	/* every agent operation below needs a live helper */
+		err(1, "preparing agent");
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 13);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		/* Create it */
+		tdb = tdb_open("run-83-openhook.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
+		ok1(tdb);
+		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
+		tdb_close(tdb);
+
+		/* Now, open with CIF, should clear it. */
+		tdb = tdb_open("run-83-openhook.tdb", flags[i],
+			       O_RDWR, 0, &cif);
+		ok1(tdb);
+		ok1(!tdb_exists(tdb, key));
+		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
+
+		/* Agent should not clear it, since it's still open. */
+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
+					     "run-83-openhook.tdb") == SUCCESS);
+		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
+
+		/* Still exists for us too. */
+		ok1(tdb_exists(tdb, key));
+
+		/* Close it, now agent should clear it. */
+		tdb_close(tdb);
+
+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
+					     "run-83-openhook.tdb") == SUCCESS);
+		ok1(external_agent_operation(agent, FETCH, "key") == FAILED);
+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
+		ok1(tap_log_messages == 0);
+	}
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-90-get-set-attributes.c b/lib/tdb2/test/run-90-get-set-attributes.c
new file mode 100644
index 0000000000..159d8a01ea
--- /dev/null
+++ b/lib/tdb2/test/run-90-get-set-attributes.c
@@ -0,0 +1,165 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		  void *unused)
+{
+	return 0;	/* stub: ignores every argument and reports success */
+}
+
+static int myunlock(int fd, int rw, off_t off, off_t len, void *unused)
+{
+	return 0;	/* stub: ignores every argument and reports success */
+}
+
+static uint64_t hash_fn(const void *key, size_t len, uint64_t seed,
+			void *priv)
+{
+	return 0;	/* stub: same hash for every key; only its identity is tested */
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	union tdb_attribute seed_attr;
+	union tdb_attribute hash_attr;
+	union tdb_attribute lock_attr;
+	/* Chain used by the second open: hash -> seed -> flock -> tap log. */
+	hash_attr.base.attr = TDB_ATTRIBUTE_HASH;
+	hash_attr.base.next = &seed_attr;
+	hash_attr.hash.fn = hash_fn;
+	hash_attr.hash.data = &hash_attr;
+
+	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
+	seed_attr.base.next = &lock_attr;
+	seed_attr.seed.seed = 100;
+
+	lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+	lock_attr.base.next = &tap_log_attr;
+	lock_attr.flock.lock = mylock;
+	lock_attr.flock.unlock = myunlock;
+	lock_attr.flock.data = &lock_attr;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 50);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		union tdb_attribute attr;
+
+		/* First open with no attributes. */
+		tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
+		ok1(tdb);
+
+		/* Get log on no attributes will fail */
+		attr.base.attr = TDB_ATTRIBUTE_LOG;
+		ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_NOEXIST);
+		/* These always work, reporting the built-in defaults. */
+		attr.base.attr = TDB_ATTRIBUTE_HASH;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
+		ok1(attr.hash.fn == jenkins_hash);
+		attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
+		ok1(attr.flock.lock == tdb_fcntl_lock);
+		ok1(attr.flock.unlock == tdb_fcntl_unlock);
+		attr.base.attr = TDB_ATTRIBUTE_SEED;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
+		/* A zero seed is possible, just astronomically unlikely. */
+		ok1(attr.seed.seed != 0);
+
+		/* Unset attributes. */
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+
+		/* Set them. */
+		ok1(tdb_set_attribute(tdb, &tap_log_attr) == 0);
+		ok1(tdb_set_attribute(tdb, &lock_attr) == 0);
+		/* These should fail, logging one message each. */
+		ok1(tdb_set_attribute(tdb, &seed_attr) == TDB_ERR_EINVAL);
+		ok1(tap_log_messages == 1);
+		ok1(tdb_set_attribute(tdb, &hash_attr) == TDB_ERR_EINVAL);
+		ok1(tap_log_messages == 2);
+		tap_log_messages = 0;
+
+		/* Getting them should work as expected. */
+		attr.base.attr = TDB_ATTRIBUTE_LOG;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
+		ok1(attr.log.fn == tap_log_attr.log.fn);
+		ok1(attr.log.data == tap_log_attr.log.data);
+
+		attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
+		ok1(attr.flock.lock == mylock);
+		ok1(attr.flock.unlock == myunlock);
+		ok1(attr.flock.data == &lock_attr);
+
+		/* Unset them again. */
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+		ok1(tap_log_messages == 0);
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
+		ok1(tap_log_messages == 0);
+
+		tdb_close(tdb);
+		ok1(tap_log_messages == 0);
+
+		/* Now open with all attributes. */
+		tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hash_attr);
+		ok1(tdb);
+
+		/* Get will succeed */
+		attr.base.attr = TDB_ATTRIBUTE_LOG;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
+		ok1(attr.log.fn == tap_log_attr.log.fn);
+		ok1(attr.log.data == tap_log_attr.log.data);
+
+		attr.base.attr = TDB_ATTRIBUTE_HASH;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
+		ok1(attr.hash.fn == hash_fn);
+		ok1(attr.hash.data == &hash_attr);
+
+		attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
+		ok1(attr.flock.lock == mylock);
+		ok1(attr.flock.unlock == myunlock);
+		ok1(attr.flock.data == &lock_attr);
+
+		attr.base.attr = TDB_ATTRIBUTE_SEED;
+		ok1(tdb_get_attribute(tdb, &attr) == 0);
+		ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
+		ok1(attr.seed.seed == seed_attr.seed.seed);
+
+		/* Unset attributes; hash and seed are each expected to log. */
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_HASH);
+		ok1(tap_log_messages == 1);
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_SEED);
+		ok1(tap_log_messages == 2);
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
+		ok1(tap_log_messages == 2);
+		tap_log_messages = 0;
+
+		tdb_close(tdb);
+
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-91-get-stats.c b/lib/tdb2/test/run-91-get-stats.c
new file mode 100644
index 0000000000..795dfd6602
--- /dev/null
+++ b/lib/tdb2/test/run-91-get-stats.c
@@ -0,0 +1,59 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		union tdb_attribute *attr;
+		struct tdb_data key = tdb_mkdata("key", 3);
+
+		tdb = tdb_open("run-91-get-stats.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
+
+		/* Use malloc so valgrind will catch overruns. */
+		attr = malloc(sizeof *attr);	/* NOTE(review): result is not checked */
+		attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
+		attr->stats.size = sizeof(*attr);
+
+		ok1(tdb_get_attribute(tdb, attr) == 0);
+		ok1(attr->stats.size == sizeof(*attr));
+		ok1(attr->stats.allocs > 0);
+		ok1(attr->stats.expands > 0);
+		ok1(attr->stats.locks > 0);
+		free(attr);
+
+		/* Try short one: the buffer ends right after the allocs field. */
+		attr = malloc(offsetof(struct tdb_attribute_stats, allocs)
+			      + sizeof(attr->stats.allocs));
+		attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
+		attr->stats.size = offsetof(struct tdb_attribute_stats, allocs)
+			+ sizeof(attr->stats.allocs);
+		ok1(tdb_get_attribute(tdb, attr) == 0);
+		ok1(attr->stats.size == sizeof(*attr));	/* size is expected back as the full size */
+		ok1(attr->stats.allocs > 0);
+		free(attr);
+		ok1(tap_log_messages == 0);
+
+		tdb_close(tdb);
+
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-add-remove-flags.c b/lib/tdb2/test/run-add-remove-flags.c
new file mode 100644
index 0000000000..1dc8463662
--- /dev/null
+++ b/lib/tdb2/test/run-add-remove-flags.c
@@ -0,0 +1,93 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Expect one log message per flag change on TDB_INTERNAL, none otherwise. */
+	plan_tests(87);	/* 2 internal x 9 + 4 file-backed x 17 + 1 final check */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-add-remove-flags.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		ok1(tdb_get_flags(tdb) == tdb->flags);
+		tap_log_messages = 0;
+		tdb_add_flag(tdb, TDB_NOLOCK);
+		if (flags[i] & TDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(tdb_get_flags(tdb) & TDB_NOLOCK);
+		}
+
+		tap_log_messages = 0;
+		tdb_add_flag(tdb, TDB_NOMMAP);
+		if (flags[i] & TDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(tdb_get_flags(tdb) & TDB_NOMMAP);
+			ok1(tdb->file->map_ptr == NULL);	/* mapping must be gone */
+		}
+
+		tap_log_messages = 0;
+		tdb_add_flag(tdb, TDB_NOSYNC);
+		if (flags[i] & TDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(tdb_get_flags(tdb) & TDB_NOSYNC);
+		}
+
+		ok1(tdb_get_flags(tdb) == tdb->flags);
+
+		tap_log_messages = 0;
+		tdb_remove_flag(tdb, TDB_NOLOCK);
+		if (flags[i] & TDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(!(tdb_get_flags(tdb) & TDB_NOLOCK));
+		}
+
+		tap_log_messages = 0;
+		tdb_remove_flag(tdb, TDB_NOMMAP);
+		if (flags[i] & TDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(!(tdb_get_flags(tdb) & TDB_NOMMAP));
+			ok1(tdb->file->map_ptr != NULL);	/* mapping must be back */
+		}
+
+		tap_log_messages = 0;
+		tdb_remove_flag(tdb, TDB_NOSYNC);
+		if (flags[i] & TDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(!(tdb_get_flags(tdb) & TDB_NOSYNC));
+		}
+
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-check-callback.c b/lib/tdb2/test/run-check-callback.c
new file mode 100644
index 0000000000..1e87436717
--- /dev/null
+++ b/lib/tdb2/test/run-check-callback.c
@@ -0,0 +1,90 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+static bool store_records(struct tdb_context *tdb)
+{
+	int n = 0;
+	struct tdb_data rec = { (unsigned char *)&n, sizeof(n) };
+
+	/* Key and value both alias n, so each record stores n under key n. */
+	while (n < NUM_RECORDS && tdb_store(tdb, rec, rec, TDB_REPLACE) == 0)
+		n++;
+	/* Stopping short of NUM_RECORDS means a store failed. */
+	return n == NUM_RECORDS;
+}
+
+static enum TDB_ERROR check(struct tdb_data key,
+			    struct tdb_data data,
+			    bool *array) /* "seen" flags, indexed by record value */
+{
+	int val;
+
+	if (key.dsize != sizeof(val)) {
+		diag("Wrong key size: %u\n", (unsigned int)key.dsize); /* cast: %u needs unsigned int */
+		return TDB_ERR_CORRUPT;
+	}
+
+	if (key.dsize != data.dsize
+	    || memcmp(key.dptr, data.dptr, sizeof(val)) != 0) {
+		diag("Key and data differ\n");
+		return TDB_ERR_CORRUPT;
+	}
+
+	memcpy(&val, key.dptr, sizeof(val)); /* dptr may be unaligned; don't cast */
+	if (val >= NUM_RECORDS || val < 0) {
+		diag("check value %i\n", val);
+		return TDB_ERR_CORRUPT;
+	}
+
+	if (array[val]) { /* each value must be reported exactly once */
+		diag("Value %i already seen\n", val);
+		return TDB_ERR_CORRUPT;
+	}
+
+	array[val] = true;
+	return TDB_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1); /* 4 ok1() per flag set + final log check */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		bool array[NUM_RECORDS]; /* array[v] is set by check() when it sees value v */
+
+		tdb = tdb_open("run-check-callback.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue; /* NOTE(review): skips 3 planned tests for this flag set */
+
+		ok1(store_records(tdb));
+		for (j = 0; j < NUM_RECORDS; j++)
+			array[j] = false;
+		ok1(tdb_check(tdb, check, array) == TDB_SUCCESS);
+		for (j = 0; j < NUM_RECORDS; j++) /* find first value check() never saw */
+			if (!array[j])
+				break;
+		ok1(j == NUM_RECORDS);
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-expand-in-transaction.c b/lib/tdb2/test/run-expand-in-transaction.c
new file mode 100644
index 0000000000..49ba03c924
--- /dev/null
+++ b/lib/tdb2/test/run-expand-in-transaction.c
@@ -0,0 +1,45 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT,
+			TDB_CONVERT|TDB_NOSYNC,
+			TDB_NOMMAP|TDB_CONVERT|TDB_NOSYNC };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); /* 7 ok1() per flag set + final log check */
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		size_t size; /* map_size as it was before the transaction */
+		tdb = tdb_open("run-expand-in-transaction.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue; /* NOTE(review): skips 6 planned tests for this flag set */
+
+		size = tdb->file->map_size;
+		ok1(tdb_transaction_start(tdb) == 0);
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		ok1(tdb->file->map_size > size); /* expected to have grown inside the transaction */
+		ok1(tdb_transaction_commit(tdb) == 0);
+		ok1(tdb->file->map_size > size); /* ...and to stay grown after commit */
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-features.c b/lib/tdb2/test/run-features.c
new file mode 100644
index 0000000000..6d82dc308c
--- /dev/null
+++ b/lib/tdb2/test/run-features.c
@@ -0,0 +1,70 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/summary.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; /* aliases j */
+	struct tdb_data data = { (unsigned char *)&j, sizeof(j) }; /* aliases j */
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1); /* 8 ok1() per flag set + final log check */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		uint64_t features;
+		tdb = tdb_open("run-features.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Put some stuff in there: 100 records mapping j -> j. */
+		for (j = 0; j < 100; j++) {
+			if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+				fail("Storing in tdb");
+		}
+
+		/* Mess with features fields in hdr: every bit outside the mask, with bit 0 flipped. */
+		features = (~TDB_FEATURE_MASK ^ 1);
+		ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
+						    features_used),
+				      &features, sizeof(features)) == 0);
+		ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
+						    features_offered),
+				      &features, sizeof(features)) == 0);
+		tdb_close(tdb);
+
+		tdb = tdb_open("run-features.tdb", flags[i], O_RDWR, 0,
+			       &tap_log_attr); /* reopen the existing file: no O_CREAT/O_TRUNC */
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Should not have changed features offered. */
+		ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
+						   features_offered),
+				     &features, sizeof(features)) == 0);
+		ok1(features == (~TDB_FEATURE_MASK ^ 1));
+
+		/* Should have cleared unknown bits in features_used. */
+		ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
+						   features_used),
+				     &features, sizeof(features)) == 0);
+		ok1(features == (1 & TDB_FEATURE_MASK));
+
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-firstkey-nextkey.c b/lib/tdb2/test/run-firstkey-nextkey.c
new file mode 100644
index 0000000000..65a6090a96
--- /dev/null
+++ b/lib/tdb2/test/run-firstkey-nextkey.c
@@ -0,0 +1,162 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+static bool store_records(struct tdb_context *tdb)
+{
+	int n = 0;
+	struct tdb_data rec = { (unsigned char *)&n, sizeof(n) };
+
+	/* Key and value both alias n, so each record stores n under key n. */
+	while (n < NUM_RECORDS && tdb_store(tdb, rec, rec, TDB_REPLACE) == 0)
+		n++;
+	/* Stopping short of NUM_RECORDS means a store failed. */
+	return n == NUM_RECORDS;
+}
+
+struct trav_data {
+	unsigned int records[NUM_RECORDS]; /* values in the order trav() received them */
+	unsigned int calls; /* number of entries of records[] filled so far */
+};
+
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
+{
+	struct trav_data *td = p; /* caller's log of values, in visit order */
+	/* Never overrun records[] or copy a value of the wrong width. */
+	if (td->calls >= NUM_RECORDS || dbuf.dsize != sizeof(td->records[0]))
+		return -1; /* NOTE(review): nonzero presumably aborts the traverse */
+	memcpy(&td->records[td->calls++], dbuf.dptr, sizeof(td->records[0]));
+	return 0;
+}
+
+/* tdb_nextkey frees the dptr it is given, so hand it a heap copy of the key. */
+static TDB_DATA dup_key(TDB_DATA key)
+{
+	TDB_DATA copy = { malloc(key.dsize), key.dsize };
+
+	memcpy(copy.dptr, key.dptr, key.dsize);
+	return copy;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	int num;
+	struct trav_data td;
+	TDB_DATA k;
+	struct tdb_context *tdb;
+	union tdb_attribute seed_attr;
+	enum TDB_ERROR ecode;
+
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	seed_attr.base.attr = TDB_ATTRIBUTE_SEED; /* fixed seed, chained before logging */
+	seed_attr.base.next = &tap_log_attr;
+	seed_attr.seed.seed = 6334326220117065685ULL;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1); /* 3 loops of 3 ok1() + 22 fixed */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-firstkey-nextkey.tdb", flags[i], /* own file, not run-traverse's */
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &seed_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		ok1(tdb_firstkey(tdb, &k) == TDB_ERR_NOEXIST);
+
+		/* One entry... */
+		k.dptr = (unsigned char *)&num;
+		k.dsize = sizeof(num);
+		num = 0;
+		ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
+		ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
+		ok1(k.dsize == sizeof(num));
+		ok1(memcmp(k.dptr, &num, sizeof(num)) == 0);
+		ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
+
+		/* Two entries. */
+		k.dptr = (unsigned char *)&num;
+		k.dsize = sizeof(num);
+		num = 1;
+		ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
+		ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
+		ok1(k.dsize == sizeof(num));
+		memcpy(&num, k.dptr, sizeof(num));
+		ok1(num == 0 || num == 1);
+		ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
+		ok1(k.dsize == sizeof(j));
+		memcpy(&j, k.dptr, sizeof(j));
+		ok1(j == 0 || j == 1);
+		ok1(j != num); /* the two keys must come back in some order, once each */
+		ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
+
+		/* Clean up. */
+		k.dptr = (unsigned char *)&num;
+		k.dsize = sizeof(num);
+		num = 0;
+		ok1(tdb_delete(tdb, k) == 0);
+		num = 1;
+		ok1(tdb_delete(tdb, k) == 0);
+
+		/* Now lots of records. */
+		ok1(store_records(tdb));
+		td.calls = 0;
+
+		num = tdb_traverse(tdb, trav, &td);
+		ok1(num == NUM_RECORDS);
+		ok1(td.calls == NUM_RECORDS);
+
+		/* Simple loop should match tdb_traverse */
+		for (j = 0, ecode = tdb_firstkey(tdb, &k); j < td.calls; j++) {
+			int val;
+
+			ok1(ecode == TDB_SUCCESS);
+			ok1(k.dsize == sizeof(val));
+			memcpy(&val, k.dptr, k.dsize);
+			ok1(td.records[j] == val);
+			ecode = tdb_nextkey(tdb, &k);
+		}
+
+		/* But arbitrary orderings should work too. */
+		for (j = td.calls-1; j > 0; j--) {
+			k.dptr = (unsigned char *)&td.records[j-1];
+			k.dsize = sizeof(td.records[j-1]);
+			k = dup_key(k); /* tdb_nextkey frees the dptr it is given */
+			ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
+			ok1(k.dsize == sizeof(td.records[j]));
+			ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0);
+			free(k.dptr);
+		}
+
+		/* Even delete should work. */
+		for (j = 0, ecode = tdb_firstkey(tdb, &k);
+		     ecode != TDB_ERR_NOEXIST;
+		     j++) {
+			ok1(ecode == TDB_SUCCESS);
+			ok1(k.dsize == sizeof(int)); /* keys were stored as int by store_records() */
+			ok1(tdb_delete(tdb, k) == 0);
+			ecode = tdb_nextkey(tdb, &k);
+		}
+
+		diag("delete using first/nextkey gave %u of %u records",
+		     j, NUM_RECORDS);
+		ok1(j == NUM_RECORDS);
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-fork-test.c b/lib/tdb2/test/run-fork-test.c
new file mode 100644
index 0000000000..e9813e0a0f
--- /dev/null
+++ b/lib/tdb2/test/run-fork-test.c
@@ -0,0 +1,180 @@
+/* Test forking while holding lock.
+ *
+ * There are only five ways to do this currently:
+ * (1) grab a tdb_chainlock, then fork.
+ * (2) grab a tdb_lockall, then fork.
+ * (3) grab a tdb_lockall_read, then fork.
+ * (4) start a transaction, then fork.
+ * (5) fork from inside a tdb_parse() callback.
+ *
+ * Note that we don't hold a lock across tdb_traverse callbacks, so
+ * that doesn't matter.
+ */
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "logging.h"
+
+static enum TDB_ERROR fork_in_parse(TDB_DATA key, TDB_DATA data,
+				    struct tdb_context *tdb)
+{
+	int status; /* only meaningful once wait() has succeeded */
+
+	if (fork() == 0) {
+		/* We expect this to fail. */
+		if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+			exit(1);
+
+		if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+			exit(1);
+
+		if (tap_log_messages != 2) /* one log message per refused operation */
+			exit(2);
+
+		tdb_close(tdb);
+		if (tap_log_messages != 2) /* closing must not log anything further */
+			exit(3);
+		exit(0);
+	}
+	ok1(wait(&status) != -1 /* fails if fork() did; status is unset then */
+	    && WIFEXITED(status) && WEXITSTATUS(status) == 0);
+	return TDB_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		int status; /* only meaningful once wait() has succeeded */
+
+		tap_log_messages = 0;
+
+		tdb = tdb_open("run-fork-test.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue;
+
+		/* Put a record in here. */
+		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_SUCCESS);
+
+		ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			tdb_chainunlock(tdb, key);
+			if (tap_log_messages != 3)
+				return 3;
+			tdb_close(tdb);
+			if (tap_log_messages != 3)
+				return 4;
+			return 0;
+		}
+		ok1(wait(&status) != -1 /* fails if fork() did; status is unset then */
+		    && WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		tdb_chainunlock(tdb, key);
+
+		ok1(tdb_lockall(tdb) == TDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			tdb_unlockall(tdb);
+			if (tap_log_messages != 2)
+				return 3;
+			tdb_close(tdb);
+			if (tap_log_messages != 2)
+				return 4;
+			return 0;
+		}
+		ok1(wait(&status) != -1
+		    && WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		tdb_unlockall(tdb);
+
+		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			/* This would always fail anyway... */
+			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			tdb_unlockall_read(tdb);
+			if (tap_log_messages != 2)
+				return 3;
+			tdb_close(tdb);
+			if (tap_log_messages != 2)
+				return 4;
+			return 0;
+		}
+		ok1(wait(&status) != -1
+		    && WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		tdb_unlockall_read(tdb);
+
+		ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
+		/* If the transaction is empty, a no-op "commit" succeeds. */
+		ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			if (tdb_transaction_commit(tdb) != TDB_ERR_LOCK)
+				return 3;
+
+			tdb_close(tdb);
+			if (tap_log_messages < 3)
+				return 4;
+			return 0;
+		}
+		ok1(wait(&status) != -1
+		    && WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		tdb_transaction_cancel(tdb);
+
+		ok1(tdb_parse_record(tdb, key, fork_in_parse, tdb)
+		    == TDB_SUCCESS);
+		tdb_close(tdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-lockall.c b/lib/tdb2/test/run-lockall.c
new file mode 100644
index 0000000000..4aedf59743
--- /dev/null
+++ b/lib/tdb2/test/run-lockall.c
@@ -0,0 +1,80 @@
+#include "config.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+
+#define fcntl fcntl_with_lockcheck
+
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include "external-agent.h"
+#include "logging.h"
+
+#define TEST_DBNAME "run-lockall.tdb"
+
+#undef fcntl
+
+int main(int argc, char *argv[])
+{
+	struct agent *agent;
+	const int flags[] = { TDB_DEFAULT,
+			      TDB_NOMMAP,
+			      TDB_CONVERT,
+			      TDB_CONVERT | TDB_NOMMAP };
+	unsigned int i; /* unsigned: compared against a sizeof expression */
+
+	plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1); /* 13 ok1() per flag set + final log check */
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+
+	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
+		enum agent_return ret;
+		struct tdb_context *tdb;
+
+		tdb = tdb_open(TEST_DBNAME, flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb)) /* don't hand a NULL tdb to the calls below */
+			continue;
+		ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+		ok1(ret == SUCCESS);
+
+		ok1(tdb_lockall(tdb) == TDB_SUCCESS);
+		ok1(external_agent_operation(agent, STORE, "key")
+		    == WOULD_HAVE_BLOCKED);
+		ok1(external_agent_operation(agent, FETCH, "key")
+		    == WOULD_HAVE_BLOCKED);
+		/* Test nesting. */
+		ok1(tdb_lockall(tdb) == TDB_SUCCESS);
+		tdb_unlockall(tdb);
+		tdb_unlockall(tdb);
+
+		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
+
+		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
+		ok1(external_agent_operation(agent, STORE, "key")
+		    == WOULD_HAVE_BLOCKED);
+		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
+		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS); /* nested read lock */
+		tdb_unlockall_read(tdb);
+		tdb_unlockall_read(tdb);
+
+		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
+		ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS);
+		tdb_close(tdb);
+	}
+
+	free_external_agent(agent);
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-locktimeout.c b/lib/tdb2/test/run-locktimeout.c
new file mode 100644
index 0000000000..bb5b5db29b
--- /dev/null
+++ b/lib/tdb2/test/run-locktimeout.c
@@ -0,0 +1,192 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+#include "external-agent.h"
+
+#undef alarm
+#define alarm fast_alarm
+
+/* alarm() stand-in that counts in milliseconds to speed the test up; always returns 0. */
+static unsigned int fast_alarm(unsigned int milli_seconds)
+{
+	struct itimerval it;
+
+	it.it_interval.tv_sec = it.it_interval.tv_usec = 0; /* one-shot, no reload */
+	it.it_value.tv_sec = milli_seconds / 1000;
+	it.it_value.tv_usec = (milli_seconds % 1000) * 1000; /* sub-second part only: must be < 1000000 */
+	setitimer(ITIMER_REAL, &it, NULL);
+	return 0;
+}
+
+#define CatchSignal(sig, handler) signal((sig), (handler))
+
+static void do_nothing(int signum) /* empty signal handler; main() installs it for SIGUSR1 */
+{
+}
+
+/* This example code is taken from SAMBA, so try not to change it. */
+static struct flock flock_struct;
+
+/* Return a value which is none of v1, v2 or v3 (used to build an invalid flock field). */
+static inline short int invalid_value(short int v1, short int v2, short int v3)
+{
+	short int try = (v1+v2+v3)^((v1+v2+v3) << 16); /* starting guess derived from the inputs */
+	while (try == v1 || try == v2 || try == v3) /* step past any collision */
+		try++;
+	return try;
+}
+
+/* SIGALRM handler: we invalidate in as many ways as we can, so the OS rejects it */
+static void invalidate_flock_struct(int signum)
+{
+	flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK);
+	flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END);
+	flock_struct.l_start = -1;
+	/* A large negative; timeout_lock() compares l_len to detect that we ran. */
+	flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1);
+}
+
+static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
+			void *_timeout) /* points at an unsigned int timeout */
+{
+	int ret, saved_errno = errno; /* errno is restored unless the lock fails */
+	unsigned int timeout = *(unsigned int *)_timeout;
+
+	flock_struct.l_type = rw;
+	flock_struct.l_whence = SEEK_SET;
+	flock_struct.l_start = off;
+	flock_struct.l_len = len;
+
+	CatchSignal(SIGALRM, invalidate_flock_struct);
+	alarm(timeout); /* alarm is #defined to fast_alarm in this file */
+
+	for (;;) {
+		if (waitflag)
+			ret = fcntl(fd, F_SETLKW, &flock_struct);
+		else
+			ret = fcntl(fd, F_SETLK, &flock_struct);
+
+		if (ret == 0)
+			break;
+
+		/* Not signalled?  Something else went wrong. */
+		if (flock_struct.l_len == len) { /* handler has not overwritten l_len */
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			saved_errno = errno;
+			break;
+		} else {
+			saved_errno = EINTR; /* SIGALRM handler ran: report the timeout as EINTR */
+			break;
+		}
+	}
+
+	alarm(0); /* cancel any pending timer */
+	errno = saved_errno;
+	return ret;
+}
+
+static int tdb_chainlock_with_timeout_internal(struct tdb_context *tdb,
+					       TDB_DATA key,
+					       unsigned int timeout, /* 0 means use the default locking */
+					       int rw_type) /* F_RDLCK for a read lock, else write */
+{
+	union tdb_attribute locking;
+	enum TDB_ERROR ecode;
+
+	if (timeout) {
+		locking.base.attr = TDB_ATTRIBUTE_FLOCK;
+		ecode = tdb_get_attribute(tdb, &locking); /* fetch the current flock attribute first */
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+
+		/* Replace locking function with our own. */
+		locking.flock.data = &timeout; /* address of a local: must be unset before returning */
+		locking.flock.lock = timeout_lock;
+
+		ecode = tdb_set_attribute(tdb, &locking);
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+	}
+	if (rw_type == F_RDLCK)
+		ecode = tdb_chainlock_read(tdb, key);
+	else
+		ecode = tdb_chainlock(tdb, key);
+
+	if (timeout) {
+		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+	}
+	return ecode;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	TDB_DATA key = tdb_mkdata("hello", 5);
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	struct agent *agent;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15); /* 15 ok1() per flag set */
+
+	agent = prepare_external_agent(); /* NOTE(review): result is not checked for NULL here */
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		enum TDB_ERROR ecode;
+		tdb = tdb_open("run-locktimeout.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			break;
+
+		/* Simple cases: should succeed (timeout argument is in milliseconds). */
+		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+							    F_RDLCK);
+		ok1(ecode == TDB_SUCCESS);
+		ok1(tap_log_messages == 0);
+
+		tdb_chainunlock_read(tdb, key);
+		ok1(tap_log_messages == 0);
+
+		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+							    F_WRLCK);
+		ok1(ecode == TDB_SUCCESS);
+		ok1(tap_log_messages == 0);
+
+		tdb_chainunlock(tdb, key);
+		ok1(tap_log_messages == 0);
+
+		/* OK, get agent to start transaction, then we should time out. */
+		ok1(external_agent_operation(agent, OPEN, "run-locktimeout.tdb")
+		    == SUCCESS);
+		ok1(external_agent_operation(agent, TRANSACTION_START, "")
+		    == SUCCESS);
+		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+							    F_WRLCK);
+		ok1(ecode == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+
+		/* Even if we get a different signal, should be fine. */
+		CatchSignal(SIGUSR1, do_nothing);
+		external_agent_operation(agent, SEND_SIGNAL, "");
+		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+							    F_WRLCK);
+		ok1(ecode == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+
+		ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "")
+		    == SUCCESS);
+		ok1(external_agent_operation(agent, CLOSE, "")
+		    == SUCCESS);
+		tdb_close(tdb);
+	}
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-missing-entries.c b/lib/tdb2/test/run-missing-entries.c
new file mode 100644
index 0000000000..e99572f64c
--- /dev/null
+++ b/lib/tdb2/test/run-missing-entries.c
@@ -0,0 +1,48 @@
+/* Another test revealed that we lost an entry.  This reproduces it. */
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1189
+
+/* Hash with the seed the failure was originally seen on; the seed passed in is ignored. */
+static uint64_t failhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	const uint64_t failing_seed = 699537674708983027ULL;
+	return hash64_stable((const unsigned char *)key, len, failing_seed);
+}
+
+int main(int argc, char *argv[])
+{
+	int i;
+	struct tdb_context *tdb;
+	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; /* aliases i */
+	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; /* aliases i */
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.fn = failhash } };
+
+	hattr.base.next = &tap_log_attr; /* chain the logging attribute after the hash one */
+	plan_tests(1 + 2 * NUM_RECORDS + 1); /* open + (store, check) per record + log check */
+
+	tdb = tdb_open("run-missing-entries.tdb", TDB_INTERNAL,
+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+	ok1(tdb);
+	if (tdb) {
+		for (i = 0; i < NUM_RECORDS; i++) {
+			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0); /* re-check after every single store */
+		}
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-open-multiple-times.c b/lib/tdb2/test/run-open-multiple-times.c
new file mode 100644
index 0000000000..240828df16
--- /dev/null
+++ b/lib/tdb2/test/run-open-multiple-times.c
@@ -0,0 +1,84 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb, *tdb2;
+	struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; /* aliases the loop index */
+	struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; /* aliases the loop index */
+	struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 28);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+		tdb2 = tdb_open("run-open-multiple-times.tdb", flags[i],
+				O_RDWR|O_CREAT, 0600, &tap_log_attr); /* NOTE(review): tdb2 is not checked for NULL */
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb_check(tdb2, NULL, NULL) == 0);
+
+		/* Store in one, fetch in the other. */
+		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+		ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
+		ok1(tdb_deq(d, data));
+		free(d.dptr);
+
+		/* Vice versa, with delete. */
+		ok1(tdb_delete(tdb2, key) == 0);
+		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST);
+
+		/* OK, now close first one, check second still good. */
+		ok1(tdb_close(tdb) == 0);
+
+		ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == 0);
+		ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
+		ok1(tdb_deq(d, data));
+		free(d.dptr);
+
+		/* Reopen */
+		tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
+			       O_RDWR|O_CREAT, 0600, &tap_log_attr);
+		ok1(tdb);
+
+		ok1(tdb_transaction_start(tdb2) == 0);
+
+		/* Anything in the other one should fail, logging one message each. */
+		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 2);
+		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 3);
+		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+		ok1(tap_log_messages == 4);
+
+		/* Transaction should work as normal. */
+		ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == TDB_SUCCESS);
+
+		/* Now... try closing with locks held. */
+		ok1(tdb_close(tdb2) == 0);
+
+		ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+		ok1(tdb_deq(d, data));
+		free(d.dptr);
+		ok1(tdb_close(tdb) == 0);
+		ok1(tap_log_messages == 4);
+		tap_log_messages = 0; /* reset the counter for the next flag set */
+	}
+
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-record-expand.c b/lib/tdb2/test/run-record-expand.c
new file mode 100644
index 0000000000..109a099278
--- /dev/null
+++ b/lib/tdb2/test/run-record-expand.c
@@ -0,0 +1,53 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define MAX_SIZE 10000
+#define SIZE_STEP 131
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data;
+
+	data.dptr = malloc(MAX_SIZE); /* NOTE(review): allocation result is not checked */
+	memset(data.dptr, 0x24, MAX_SIZE);
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1); /* 3 fixed + 2 per size step, + log check */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-record-expand.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		data.dsize = 0; /* start from an empty value, then grow it in place */
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		for (data.dsize = 0;
+		     data.dsize < MAX_SIZE;
+		     data.dsize += SIZE_STEP) {
+			memset(data.dptr, data.dsize, data.dsize); /* fill byte is dsize converted to unsigned char */
+			ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+		}
+		tdb_close(tdb);
+	}
+	ok1(tap_log_messages == 0);
+	free(data.dptr);
+
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-remap-in-read_traverse.c b/lib/tdb2/test/run-remap-in-read_traverse.c
new file mode 100644
index 0000000000..d784ca3407
--- /dev/null
+++ b/lib/tdb2/test/run-remap-in-read_traverse.c
@@ -0,0 +1,65 @@
+/* We had a bug where we marked the tdb read-only for a tdb_traverse_read.
+ * If we then expanded the tdb, we would remap read-only, and later SEGV. */
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static bool file_larger(int fd, tdb_len_t size)
+{
+	struct stat st;
+
+	/* True once the file size differs from size; an fstat() failure also ends the wait. */
+	return fstat(fd, &st) != 0 || st.st_size != size;
+}
+
+static unsigned add_records_to_grow(struct agent *agent, int fd, tdb_len_t size)
+{
+	unsigned int i; /* records stored so far; also used as the record name */
+
+	for (i = 0; !file_larger(fd, size); i++) {
+		char data[20];
+		snprintf(data, sizeof(data), "%u", i); /* %u: i is unsigned */
+		if (external_agent_operation(agent, STORE, data) != SUCCESS)
+			return 0; /* agent could not store: report nothing added */
+	}
+	diag("Added %u records to grow file", i);
+	return i;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct agent *agent;
+	struct tdb_context *tdb;
+	struct tdb_data d = tdb_mkdata("hello", 5);
+	const char filename[] = "run-remap-in-read_traverse.tdb";
+
+	plan_tests(4);
+
+	agent = prepare_external_agent(); /* NOTE(review): result is not checked for NULL */
+
+	tdb = tdb_open(filename, TDB_DEFAULT,
+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); /* NOTE(review): tdb is dereferenced below unchecked */
+
+	ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS);
+	i = add_records_to_grow(agent, tdb->file->fd, tdb->file->map_size); /* agent stores until the file size changes */
+
+	/* Do a traverse; it should visit every record the agent added. */
+	ok1(tdb_traverse(tdb, NULL, NULL) == i);
+
+	/* Now store something! */
+	ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);
+	ok1(tap_log_messages == 0);
+	tdb_close(tdb);
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-seed.c b/lib/tdb2/test/run-seed.c
new file mode 100644
index 0000000000..a9b370b6e5
--- /dev/null
+++ b/lib/tdb2/test/run-seed.c
@@ -0,0 +1,67 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static int log_count = 0;
+
+/* Normally we get a log when setting random seed; this callback just counts log calls. */
+static void my_log_fn(struct tdb_context *tdb,
+		      enum tdb_log_level level,
+		      const char *message, void *priv)
+{
+	log_count++; /* main() asserts this stays 0 when an explicit seed is supplied */
+}
+
+static union tdb_attribute log_attr = { /* log attribute that routes messages to my_log_fn */
+	.log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
+		 .fn = my_log_fn }
+};
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	union tdb_attribute attr;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+
+	attr.seed.base.attr = TDB_ATTRIBUTE_SEED; /* explicit seed, chained before log_attr */
+	attr.seed.base.next = &log_attr;
+	attr.seed.seed = 42;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3); /* 4 per flag set + 3 for each of the 4 on-disk sets */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct tdb_header hdr;
+		int fd;
+		tdb = tdb_open("run-seed.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb->hash_seed == 42);
+		ok1(log_count == 0);
+		tdb_close(tdb);
+
+		if (flags[i] & TDB_INTERNAL)
+			continue; /* the header is only read back from a file for non-internal sets */
+
+		fd = open("run-seed.tdb", O_RDONLY);
+		ok1(fd >= 0);
+		ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
+		if (flags[i] & TDB_CONVERT)
+			ok1(bswap_64(hdr.hash_seed) == 42); /* TDB_CONVERT: seed is byte-swapped in the file */
+		else
+			ok1(hdr.hash_seed == 42);
+		close(fd);
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-simple-delete.c b/lib/tdb2/test/run-simple-delete.c
new file mode 100644
index 0000000000..d06bf2d2bd
--- /dev/null
+++ b/lib/tdb2/test/run-simple-delete.c
@@ -0,0 +1,42 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/*
+ * For each flag combination: deleting a missing key must report
+ * TDB_ERR_NOEXIST, an insert followed by a delete must succeed, and
+ * tdb_check() must pass after every step.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+
+	/* 7 checks per flag set, plus the final log-message check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-simple-delete.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (tdb) {
+			/* Delete should fail. */
+			ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			/* Insert should succeed. */
+			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			/* Delete should now work. */
+			ok1(tdb_delete(tdb, key) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			tdb_close(tdb);
+		}
+	}
+	/* Nothing in the run should have produced a log message. */
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-summary.c b/lib/tdb2/test/run-summary.c
new file mode 100644
index 0000000000..c92e759373
--- /dev/null
+++ b/lib/tdb2/test/run-summary.c
@@ -0,0 +1,60 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/summary.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/*
+ * Store 500 records with varying data sizes under each flag combination,
+ * then check the text returned by tdb_summary() both without and with
+ * TDB_SUMMARY_HISTOGRAMS.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Key and data both point at the loop counter j. */
+	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
+	struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
+	char *summary;
+
+	/* Per flag set: 1 open check plus 5 checks for each of the two
+	 * summary passes; then the final log-message check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-summary.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Put some stuff in there. */
+		for (j = 0; j < 500; j++) {
+			/* Make sure padding varies so we get some graphs! */
+			data.dsize = j % (sizeof(j) + 1);
+			if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+				fail("Storing in tdb");
+		}
+
+		/* Two passes: j == 0 (no flags), then
+		 * j == TDB_SUMMARY_HISTOGRAMS. */
+		for (j = 0;
+		     j <= TDB_SUMMARY_HISTOGRAMS;
+		     j += TDB_SUMMARY_HISTOGRAMS) {
+			ok1(tdb_summary(tdb, j, &summary) == TDB_SUCCESS);
+			ok1(strstr(summary, "Number of records: 500\n"));
+			ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n"));
+			ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n"));
+			/* '|' and '*' are expected only in the histogram
+			 * pass. */
+			if (j == TDB_SUMMARY_HISTOGRAMS)
+				ok1(strstr(summary, "|")
+				    && strstr(summary, "*"));
+			else
+				ok1(!strstr(summary, "|")
+				    && !strstr(summary, "*"));
+			free(summary);
+		}
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-tdb_errorstr.c b/lib/tdb2/test/run-tdb_errorstr.c
new file mode 100644
index 0000000000..27bdfcd67c
--- /dev/null
+++ b/lib/tdb2/test/run-tdb_errorstr.c
@@ -0,0 +1,59 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+
+/*
+ * Check the string tdb_errorstr() returns for every error code from
+ * TDB_SUCCESS down to TDB_ERR_RDONLY, and for the out-of-range code left
+ * in err once the loop has stepped past TDB_ERR_RDONLY.
+ */
+int main(int argc, char *argv[])
+{
+	enum TDB_ERROR err;
+	/* One check per code visited by the loop (this assumes every value
+	 * in the range hits a case below), plus the final invalid-code
+	 * check. */
+	plan_tests(TDB_ERR_RDONLY*-1 + 2);
+
+	for (err = TDB_SUCCESS; err >= TDB_ERR_RDONLY; err--) {
+		switch (err) {
+		case TDB_SUCCESS:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "Success"));
+			break;
+		case TDB_ERR_IO:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "IO Error"));
+			break;
+		case TDB_ERR_LOCK:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "Locking error"));
+			break;
+		case TDB_ERR_OOM:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "Out of memory"));
+			break;
+		case TDB_ERR_EXISTS:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "Record exists"));
+			break;
+		case TDB_ERR_EINVAL:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "Invalid parameter"));
+			break;
+		case TDB_ERR_NOEXIST:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "Record does not exist"));
+			break;
+		case TDB_ERR_RDONLY:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "write not permitted"));
+			break;
+		case TDB_ERR_CORRUPT:
+			ok1(!strcmp(tdb_errorstr(err),
+				    "Corrupt database"));
+			break;
+		}
+	}
+	/* err is now one step below TDB_ERR_RDONLY. */
+	ok1(!strcmp(tdb_errorstr(err), "Invalid error code"));
+
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-traverse.c b/lib/tdb2/test/run-traverse.c
new file mode 100644
index 0000000000..f973d95d0f
--- /dev/null
+++ b/lib/tdb2/test/run-traverse.c
@@ -0,0 +1,211 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+/* Hash callback that ignores the seed tdb passes in and instead uses the
+ * fixed seed that p points to: the same seed we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	const uint64_t *fixed_seed = p;
+	const unsigned char *bytes = key;
+
+	return hash64_stable(bytes, len, *fixed_seed);
+}
+
+/* Store NUM_RECORDS records whose key and data are both the record number.
+ * Returns false as soon as one tdb_store() fails, true otherwise. */
+static bool store_records(struct tdb_context *tdb)
+{
+	int n = 0;
+	/* Key and data alias the counter, so each pass stores n -> n. */
+	struct tdb_data key = { (unsigned char *)&n, sizeof(n) };
+	struct tdb_data data = { (unsigned char *)&n, sizeof(n) };
+
+	while (n < NUM_RECORDS) {
+		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+			return false;
+		n++;
+	}
+	return true;
+}
+
+/* State shared between main() and the trav() traverse callback. */
+struct trav_data {
+	/* calls: callbacks made so far; call_limit: trav() returns 1 once
+	 * calls reaches this value. */
+	unsigned int calls, call_limit;
+	/* Smallest and largest record value seen so far. */
+	int low, high;
+	/* Set when a record's key and data do not match as expected. */
+	bool mismatch;
+	/* When true, trav() deletes each record it visits. */
+	bool delete;
+	/* Result of the most recent tdb_delete() made by trav(). */
+	enum TDB_ERROR delete_error;
+};
+
+/*
+ * Traverse callback: validates one record, tracks the lowest and highest
+ * value seen, and optionally deletes the record.
+ *
+ * Returns -1 on a key/data mismatch or a failed delete, 1 once call_limit
+ * callbacks have been made, and 0 otherwise.
+ */
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+		struct trav_data *td)
+{
+	int val;
+	bool sizes_ok;
+
+	td->calls += 1;
+
+	/* Key and data must both be one int wide and byte-for-byte equal. */
+	sizes_ok = (key.dsize == sizeof(val) && dbuf.dsize == sizeof(val));
+	if (!sizes_ok || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
+		td->mismatch = true;
+		return -1;
+	}
+
+	memcpy(&val, dbuf.dptr, dbuf.dsize);
+	td->low = (val < td->low) ? val : td->low;
+	td->high = (val > td->high) ? val : td->high;
+
+	if (td->delete) {
+		td->delete_error = tdb_delete(tdb, key);
+		if (td->delete_error != TDB_SUCCESS)
+			return -1;
+	}
+
+	return (td->calls == td->call_limit) ? 1 : 0;
+}
+
+/* State shared between main() and the trav_grow() traverse callback. */
+struct trav_grow_data {
+	/* Number of callbacks made so far. */
+	unsigned int calls;
+	/* Records whose data was already larger than one int when visited. */
+	unsigned int num_large;
+	/* Set when a record's key and data do not match as expected. */
+	bool mismatch;
+	/* Result of the most recent tdb_append() made by trav_grow(). */
+	enum TDB_ERROR error;
+};
+
+/*
+ * Traverse callback that appends 128 zero bytes to every record it visits,
+ * counting records that had already grown before this visit.
+ *
+ * Returns -1 on a key/data mismatch or a failed append, 0 otherwise.
+ */
+static int trav_grow(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+		     struct trav_grow_data *tgd)
+{
+	unsigned char padding[128] = { 0 };
+	int val;
+
+	tgd->calls += 1;
+
+	/* The key is one int; the data must start with the same bytes. */
+	if (key.dsize != sizeof(val) || dbuf.dsize < sizeof(val)
+	    || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
+		tgd->mismatch = true;
+		return -1;
+	}
+
+	/* Data longer than one int: we must have seen this before! */
+	if (dbuf.dsize > sizeof(val))
+		tgd->num_large += 1;
+
+	/* Make a big difference to the database. */
+	dbuf.dptr = padding;
+	dbuf.dsize = sizeof(padding);
+	tgd->error = tdb_append(tdb, key, dbuf);
+
+	return (tgd->error == TDB_SUCCESS) ? 0 : -1;
+}
+
+/*
+ * Exercise tdb_traverse() under each flag combination, using a fixed hash
+ * seed: counting-only traverses, a full traverse, a traverse cut short by
+ * the callback, a traverse that deletes every record, and a traverse whose
+ * callback grows every record.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	int num;
+	struct trav_data td;
+	struct trav_grow_data tgd;
+	struct tdb_context *tdb;
+	uint64_t seed = 16014841315512641303ULL;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Hash attribute: fixedhash() with &seed as its private data. */
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.fn = fixedhash,
+						.data = &seed } };
+
+	/* Chain the TAP logging attribute after the hash attribute. */
+	hattr.base.next = &tap_log_attr;
+
+	/* 32 checks per flag set, plus the final log-message check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-traverse.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* A freshly created database traverses zero records. */
+		ok1(tdb_traverse(tdb, NULL, NULL) == 0);
+
+		ok1(store_records(tdb));
+		num = tdb_traverse(tdb, NULL, NULL);
+		ok1(num == NUM_RECORDS);
+
+		/* Full traverse. */
+		td.calls = 0;
+		td.call_limit = UINT_MAX;
+		td.low = INT_MAX;
+		td.high = INT_MIN;
+		td.mismatch = false;
+		td.delete = false;
+
+		num = tdb_traverse(tdb, trav, &td);
+		ok1(num == NUM_RECORDS);
+		ok1(!td.mismatch);
+		ok1(td.calls == NUM_RECORDS);
+		ok1(td.low == 0);
+		ok1(td.high == NUM_RECORDS-1);
+
+		/* Short traverse. */
+		td.calls = 0;
+		td.call_limit = NUM_RECORDS / 2;
+		td.low = INT_MAX;
+		td.high = INT_MIN;
+		td.mismatch = false;
+		td.delete = false;
+
+		num = tdb_traverse(tdb, trav, &td);
+		ok1(num == NUM_RECORDS / 2);
+		ok1(!td.mismatch);
+		ok1(td.calls == NUM_RECORDS / 2);
+		ok1(td.low <= NUM_RECORDS / 2);
+		ok1(td.high > NUM_RECORDS / 2);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tap_log_messages == 0);
+
+		/* Deleting traverse (delete everything). */
+		td.calls = 0;
+		td.call_limit = UINT_MAX;
+		td.low = INT_MAX;
+		td.high = INT_MIN;
+		td.mismatch = false;
+		td.delete = true;
+		td.delete_error = TDB_SUCCESS;
+		num = tdb_traverse(tdb, trav, &td);
+		ok1(num == NUM_RECORDS);
+		ok1(td.delete_error == TDB_SUCCESS);
+		ok1(!td.mismatch);
+		ok1(td.calls == NUM_RECORDS);
+		ok1(td.low == 0);
+		ok1(td.high == NUM_RECORDS - 1);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now it's empty! */
+		ok1(tdb_traverse(tdb, NULL, NULL) == 0);
+
+		/* Re-add. */
+		ok1(store_records(tdb));
+		ok1(tdb_traverse(tdb, NULL, NULL) == NUM_RECORDS);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Grow.  This will cause us to be reshuffled. */
+		tgd.calls = 0;
+		tgd.num_large = 0;
+		tgd.mismatch = false;
+		tgd.error = TDB_SUCCESS;
+		ok1(tdb_traverse(tdb, trav_grow, &tgd) > 1);
+		ok1(tgd.error == 0);
+		ok1(!tgd.mismatch);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		/* Not every callback may have hit an already-grown record. */
+		ok1(tgd.num_large < tgd.calls);
+		diag("growing db: %u calls, %u repeats",
+		     tgd.calls, tgd.num_large);
+
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/tools/Makefile b/lib/tdb2/tools/Makefile
new file mode 100644
index 0000000000..11188c3baf
--- /dev/null
+++ b/lib/tdb2/tools/Makefile
@@ -0,0 +1,16 @@
+# Builds the tdb2 command-line tools and benchmarks.  Each tool is a single
+# .c file linked against the prebuilt objects listed in OBJS.
+OBJS:=../../tdb2.o ../../hash.o ../../tally.o
+CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg
+LDFLAGS:=-L../../..
+
+# Neither target names a real file; declaring them phony keeps make from
+# skipping them if a file called "default" or "clean" ever appears here.
+.PHONY: default clean
+
+default: tdb2torture tdb2tool tdb2dump tdb2restore mktdb2 speed growtdb-bench
+
+tdb2dump: tdb2dump.c $(OBJS)
+tdb2restore: tdb2restore.c $(OBJS)
+tdb2torture: tdb2torture.c $(OBJS)
+tdb2tool: tdb2tool.c $(OBJS)
+mktdb2: mktdb2.c $(OBJS)
+speed: speed.c $(OBJS)
+growtdb-bench: growtdb-bench.c $(OBJS)
+
+clean:
+	rm -f tdb2torture tdb2dump tdb2restore tdb2tool mktdb2 speed growtdb-bench
diff --git a/lib/tdb2/tools/growtdb-bench.c b/lib/tdb2/tools/growtdb-bench.c
new file mode 100644
index 0000000000..f7f6845a8a
--- /dev/null
+++ b/lib/tdb2/tools/growtdb-bench.c
@@ -0,0 +1,112 @@
+#include "tdb2.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <err.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+/* Log callback: write each tdb message to stderr, prefixed with "tdb:" and
+ * the database name. */
+static void logfn(struct tdb_context *tdb,
+		  enum tdb_log_level level,
+		  const char *message,
+		  void *data)
+{
+	const char *dbname = tdb_name(tdb);
+
+	fprintf(stderr, "tdb:%s:%s\n", dbname, message);
+}
+
+/*
+ * Benchmark for growing records.
+ *
+ * Usage: growtdb-bench <users> <groups>
+ *
+ * Creates <users> user records (appending to one shared index record for
+ * each), then for every group appends 32 bytes to each user record and to
+ * the group record.  After the initial insert and after each group the
+ * database is checked and this process's /proc/<pid>/statm is printed.
+ *
+ * Returns 0 on success; any tdb or allocation failure exits with status 1.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, users, groups;
+	TDB_DATA idxkey, idxdata;
+	TDB_DATA k, d, gk;
+	char cmd[100];
+	struct tdb_context *tdb;
+	enum TDB_ERROR ecode;
+	union tdb_attribute log;
+
+	if (argc != 3) {
+		printf("Usage: growtdb-bench <users> <groups>\n");
+		exit(1);
+	}
+	users = atoi(argv[1]);
+	groups = atoi(argv[2]);
+
+	snprintf(cmd, sizeof(cmd), "cat /proc/%i/statm", (int)getpid());
+
+	log.base.attr = TDB_ATTRIBUTE_LOG;
+	log.base.next = NULL;
+	log.log.fn = logfn;
+
+	tdb = tdb_open("/tmp/growtdb.tdb", TDB_DEFAULT,
+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &log);
+	if (!tdb)
+		err(1, "Opening /tmp/growtdb.tdb");
+
+	idxkey.dptr = (unsigned char *)"User index";
+	idxkey.dsize = strlen("User index");
+	idxdata.dsize = 51;
+	idxdata.dptr = calloc(idxdata.dsize, 1);
+
+	/* Create users. */
+	k.dsize = 48;
+	k.dptr = calloc(k.dsize, 1);
+	d.dsize = 64;
+	d.dptr = calloc(d.dsize, 1);
+
+	/* Group key: same size as a user key, told apart by its last byte. */
+	gk.dsize = 48;
+	gk.dptr = calloc(gk.dsize, 1);
+
+	if (!idxdata.dptr || !k.dptr || !d.dptr || !gk.dptr)
+		errx(1, "Out of memory");
+	gk.dptr[gk.dsize-1] = 1;
+
+	if ((ecode = tdb_transaction_start(tdb)) != TDB_SUCCESS)
+		errx(1, "tdb transaction start failed: %s",
+		     tdb_errorstr(ecode));
+	for (i = 0; i < users; i++) {
+		memcpy(k.dptr, &i, sizeof(i));
+		ecode = tdb_store(tdb, k, d, TDB_INSERT);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
+
+		/* This simulates a growing index record. */
+		ecode = tdb_append(tdb, idxkey, idxdata);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "tdb append failed: %s", tdb_errorstr(ecode));
+	}
+	if ((ecode = tdb_transaction_commit(tdb)) != 0)
+		errx(1, "tdb commit1 failed: %s", tdb_errorstr(ecode));
+
+	if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
+		errx(1, "tdb_check failed after initial insert!");
+
+	if (system(cmd) != 0)
+		warnx("'%s' failed", cmd);
+
+	/* Now put them all in groups: add 32 bytes to each record for
+	 * a group. */
+	d.dsize = 32;
+	for (i = 0; i < groups; i++) {
+		if ((ecode = tdb_transaction_start(tdb)) != TDB_SUCCESS)
+			errx(1, "tdb transaction start failed: %s",
+			     tdb_errorstr(ecode));
+		/* Create the "group". */
+		memcpy(gk.dptr, &i, sizeof(i));
+		ecode = tdb_store(tdb, gk, d, TDB_INSERT);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
+
+		/* Now populate it. */
+		for (j = 0; j < users; j++) {
+			/* Append to the user. */
+			memcpy(k.dptr, &j, sizeof(j));
+			if ((ecode = tdb_append(tdb, k, d)) != 0)
+				errx(1, "tdb append failed: %s",
+				     tdb_errorstr(ecode));
+
+			/* Append to the group. */
+			if ((ecode = tdb_append(tdb, gk, d)) != 0)
+				errx(1, "tdb append failed: %s",
+				     tdb_errorstr(ecode));
+		}
+		if ((ecode = tdb_transaction_commit(tdb)) != 0)
+			errx(1, "tdb commit2 failed: %s", tdb_errorstr(ecode));
+		if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
+			errx(1, "tdb_check failed after iteration %u!", i);
+		if (system(cmd) != 0)
+			warnx("'%s' failed", cmd);
+	}
+
+	tdb_close(tdb);
+	free(gk.dptr);
+	free(d.dptr);
+	free(k.dptr);
+	free(idxdata.dptr);
+	return 0;
+}
diff --git a/lib/tdb2/tools/mktdb2.c b/lib/tdb2/tools/mktdb2.c
new file mode 100644
index 0000000000..c8c280349e
--- /dev/null
+++ b/lib/tdb2/tools/mktdb2.c
@@ -0,0 +1,29 @@
+#include "tdb2.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <err.h>
+
+/*
+ * Create a tdb2 database holding <numrecords> records.
+ *
+ * Usage: mktdb <tdbfile> <numrecords>
+ *
+ * Record i uses the bytes of the unsigned int i as both key and data.
+ * Exits with status 1 on a usage error, open failure or store failure.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int i, num_recs;
+	struct tdb_context *tdb;
+	enum TDB_ERROR ecode;
+
+	if (argc != 3 || (num_recs = atoi(argv[2])) == 0)
+		errx(1, "Usage: mktdb <tdbfile> <numrecords>");
+
+	tdb = tdb_open(argv[1], TDB_DEFAULT, O_CREAT|O_TRUNC|O_RDWR, 0600,NULL);
+	if (!tdb)
+		err(1, "Opening %s", argv[1]);
+
+	for (i = 0; i < num_recs; i++) {
+		TDB_DATA d;
+
+		d.dptr = (void *)&i;
+		d.dsize = sizeof(i);
+		/* tdb_store() reports failure through its return value, so
+		 * print that rather than whatever errno happens to hold. */
+		ecode = tdb_store(tdb, d, d, TDB_INSERT);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "Failed to store record %u: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	tdb_close(tdb);
+	printf("Done\n");
+	return 0;
+}
diff --git a/lib/tdb2/tools/speed.c b/lib/tdb2/tools/speed.c
new file mode 100644
index 0000000000..3222465a71
--- /dev/null
+++ b/lib/tdb2/tools/speed.c
@@ -0,0 +1,440 @@
+/* Simple speed test for TDB */
+#include <err.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include "tdb2.h"
+
+/*
+ * Nanoseconds per operation.
+ *
+ * start, stop: wall-clock times bracketing the measured work.
+ * num: number of operations performed in that interval.
+ *
+ * Returns the elapsed time divided by num, in nanoseconds, or 0 when num
+ * is 0 (nothing was measured).
+ */
+static size_t normalize(const struct timeval *start,
+			const struct timeval *stop,
+			unsigned int num)
+{
+	double usecs;
+
+	/* Avoid dividing by zero and converting the resulting NaN. */
+	if (num == 0)
+		return 0;
+
+	/* Floating point is more accurate here, and doing the whole sum in
+	 * double avoids overflowing a narrow integer seconds type. */
+	usecs = ((double)stop->tv_sec - (double)start->tv_sec) * 1000000.0
+		+ ((double)stop->tv_usec - (double)start->tv_usec);
+
+	return usecs / num * 1000;
+}
+
+/* Size in bytes of /tmp/speed.tdb, or (size_t)-1 if it cannot be stat()ed. */
+static size_t file_size(void)
+{
+	struct stat info;
+
+	return stat("/tmp/speed.tdb", &info) == 0 ? info.st_size : -1;
+}
+
+/* Traverse callback: add the int at the start of each record's data to the
+ * running total that p points to.  Always returns 0. */
+static int count_record(struct tdb_context *tdb,
+			TDB_DATA key, TDB_DATA data, void *p)
+{
+	int *sum = p;
+	const int *value = (const int *)data.dptr;
+
+	*sum = *sum + *value;
+	return 0;
+}
+
+/*
+ * Print every statistics counter of *tdb, then close the database and
+ * reopen /tmp/speed.tdb so later stages start with fresh counters.
+ *
+ * tdb:   in/out; replaced by the reopened handle.
+ * flags: tdb_open() flags for the reopen.
+ * attr:  attribute chain for the reopen.
+ *
+ * Exits with status 1 if the stats cannot be read or the reopen fails.
+ */
+static void dump_and_clear_stats(struct tdb_context **tdb,
+				 int flags,
+				 union tdb_attribute *attr)
+{
+	union tdb_attribute stats;
+	enum TDB_ERROR ecode;
+
+	stats.base.attr = TDB_ATTRIBUTE_STATS;
+	stats.base.next = NULL;
+	stats.stats.size = sizeof(stats.stats);
+	ecode = tdb_get_attribute(*tdb, &stats);
+	if (ecode != TDB_SUCCESS)
+		errx(1, "Getting stats: %s", tdb_errorstr(ecode));
+
+/* Print one counter as "<indent><field> = <value>"; the indent shows how
+ * the counters nest. */
+#define SHOW_STAT(indent, field)					\
+	printf(indent #field " = %llu\n",				\
+	       (unsigned long long)stats.stats.field)
+
+	SHOW_STAT("", allocs);
+	SHOW_STAT("  ", alloc_subhash);
+	SHOW_STAT("  ", alloc_chain);
+	SHOW_STAT("  ", alloc_bucket_exact);
+	SHOW_STAT("  ", alloc_bucket_max);
+	SHOW_STAT("  ", alloc_leftover);
+	SHOW_STAT("  ", alloc_coalesce_tried);
+	SHOW_STAT("    ", alloc_coalesce_iterate_clash);
+	SHOW_STAT("    ", alloc_coalesce_lockfail);
+	SHOW_STAT("    ", alloc_coalesce_race);
+	SHOW_STAT("    ", alloc_coalesce_succeeded);
+	SHOW_STAT("      ", alloc_coalesce_num_merged);
+	SHOW_STAT("", compares);
+	SHOW_STAT("  ", compare_wrong_bucket);
+	SHOW_STAT("  ", compare_wrong_offsetbits);
+	SHOW_STAT("  ", compare_wrong_keylen);
+	SHOW_STAT("  ", compare_wrong_rechash);
+	SHOW_STAT("  ", compare_wrong_keycmp);
+	SHOW_STAT("", transactions);
+	SHOW_STAT("  ", transaction_cancel);
+	SHOW_STAT("  ", transaction_nest);
+	SHOW_STAT("  ", transaction_expand_file);
+	SHOW_STAT("  ", transaction_read_direct);
+	SHOW_STAT("    ", transaction_read_direct_fail);
+	SHOW_STAT("  ", transaction_write_direct);
+	SHOW_STAT("    ", transaction_write_direct_fail);
+	SHOW_STAT("", expands);
+	SHOW_STAT("", frees);
+	SHOW_STAT("", locks);
+	SHOW_STAT("  ", lock_lowlevel);
+	SHOW_STAT("  ", lock_nonblock);
+	SHOW_STAT("    ", lock_nonblock_fail);
+
+#undef SHOW_STAT
+
+	/* Now clear. */
+	tdb_close(*tdb);
+	*tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR, 0, attr);
+	/* Callers keep using *tdb, so a failed reopen must not go unnoticed. */
+	if (!*tdb)
+		err(1, "Reopening /tmp/speed.tdb");
+}
+
+/* Log callback: write the message, followed by a newline, to stderr. */
+static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level,
+		    const char *message, void *data)
+{
+	fprintf(stderr, "%s\n", message);
+}
+
+/* Settings chosen on the command line and shared by every benchmark stage. */
+struct bench_opts {
+	int flags;			/* tdb_open() flags */
+	bool transaction;		/* run each stage inside a transaction */
+	bool summary;			/* print tdb_summary() after each stage */
+	bool do_stats;			/* dump and reset stats after each stage */
+	unsigned int num;		/* records handled per stage */
+	union tdb_attribute *attr;	/* attribute chain for tdb_open() */
+};
+
+/* Announce a stage, start its transaction if requested, and start the clock. */
+static void begin_stage(struct tdb_context *tdb, const struct bench_opts *opts,
+			const char *verb, struct timeval *start)
+{
+	enum TDB_ERROR ecode;
+
+	printf("%s %u records: ", verb, opts->num); fflush(stdout);
+	if (opts->transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	gettimeofday(start, NULL);
+}
+
+/* Stop the clock, commit, report the timing, verify the database and print
+ * the optional summary and (if allow_stats) statistics.  *tdb may be
+ * replaced, because dumping statistics reopens the database. */
+static void end_stage(struct tdb_context **tdb, const struct bench_opts *opts,
+		      const struct timeval *start, bool allow_stats)
+{
+	struct timeval stop;
+	enum TDB_ERROR ecode;
+
+	gettimeofday(&stop, NULL);
+	if (opts->transaction && (ecode = tdb_transaction_commit(*tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(start, &stop, opts->num), file_size());
+
+	if (tdb_check(*tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (opts->summary) {
+		char *sumstr = NULL;
+
+		ecode = tdb_summary(*tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		if (ecode != TDB_SUCCESS) {
+			/* Don't hand a NULL string to printf. */
+			warnx("tdb_summary failed: %s", tdb_errorstr(ecode));
+		} else {
+			printf("%s\n", sumstr);
+			free(sumstr);
+		}
+	}
+	if (allow_stats && opts->do_stats)
+		dump_and_clear_stats(tdb, opts->flags, opts->attr);
+}
+
+/*
+ * Simple speed test for TDB.
+ *
+ * Usage: speed [--internal] [--transaction] [--no-sync] [--summary]
+ *              [--stats] [<num> [<stopat>]]
+ *
+ * The options are only recognised in the order shown.  <num> (default 1000)
+ * is the number of records per stage; the program exits after <stopat>
+ * stages if given.  Stages: add, find, miss, traverse, delete, re-add,
+ * append, churn.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, expected, stage = 0, stopat = -1;
+	struct bench_opts opts = { .flags = TDB_DEFAULT, .num = 1000 };
+	TDB_DATA key, data;
+	struct tdb_context *tdb;
+	struct timeval start;
+	union tdb_attribute seed, log;
+	enum TDB_ERROR ecode;
+
+	/* Try to keep benchmarks even. */
+	seed.base.attr = TDB_ATTRIBUTE_SEED;
+	seed.base.next = NULL;
+	seed.seed.seed = 0;
+
+	log.base.attr = TDB_ATTRIBUTE_LOG;
+	log.base.next = &seed;
+	log.log.fn = tdb_log;
+	opts.attr = &log;
+
+	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
+		opts.flags = TDB_INTERNAL;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
+		opts.transaction = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--no-sync") == 0) {
+		opts.flags |= TDB_NOSYNC;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--summary") == 0) {
+		opts.summary = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
+		opts.do_stats = true;
+		argc--;
+		argv++;
+	}
+
+	tdb = tdb_open("/tmp/speed.tdb", opts.flags, O_RDWR|O_CREAT|O_TRUNC,
+		       0600, &log);
+	if (!tdb)
+		err(1, "Opening /tmp/speed.tdb");
+
+	/* Key and data both alias the loop counter i. */
+	key.dptr = (void *)&i;
+	key.dsize = sizeof(i);
+	data = key;
+
+	if (argv[1]) {
+		opts.num = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (argv[1]) {
+		stopat = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	/* Add records. */
+	begin_stage(tdb, &opts, "Adding", &start);
+	for (i = 0; i < opts.num; i++)
+		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	end_stage(&tdb, &opts, &start, true);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Find records. */
+	begin_stage(tdb, &opts, "Finding", &start);
+	for (i = 0; i < opts.num; i++) {
+		struct tdb_data dbuf;
+		unsigned int found;
+
+		ecode = tdb_fetch(tdb, key, &dbuf);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "Fetching key %u in tdb gave %s",
+			     i, tdb_errorstr(ecode));
+		memcpy(&found, dbuf.dptr, sizeof(found));
+		/* tdb_fetch() hands us an allocated copy; don't leak it. */
+		free(dbuf.dptr);
+		if (found != i)
+			errx(1, "Fetching key %u in tdb gave %u", i, found);
+	}
+	end_stage(&tdb, &opts, &start, true);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Look up records that are not there. */
+	begin_stage(tdb, &opts, "Missing", &start);
+	for (i = opts.num; i < opts.num*2; i++) {
+		struct tdb_data dbuf;
+		ecode = tdb_fetch(tdb, key, &dbuf);
+		if (ecode != TDB_ERR_NOEXIST)
+			errx(1, "Fetching key %u in tdb gave %s",
+			     i, tdb_errorstr(ecode));
+	}
+	end_stage(&tdb, &opts, &start, true);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Traverse records, summing their values into i. */
+	i = 0;
+	begin_stage(tdb, &opts, "Traversing", &start);
+	if (tdb_traverse(tdb, count_record, &i) != opts.num)
+		errx(1, "Traverse returned wrong number of records");
+	/* Sum of 0..num-1 is num*(num-1)/2; halve whichever factor is even
+	 * so the result is right for odd num as well. */
+	if (opts.num % 2 == 0)
+		expected = (opts.num / 2) * (opts.num - 1);
+	else
+		expected = opts.num * ((opts.num - 1) / 2);
+	if (i != expected)
+		errx(1, "Traverse tallied to %u", i);
+	end_stage(&tdb, &opts, &start, true);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Delete records (not in order). */
+	begin_stage(tdb, &opts, "Deleting", &start);
+	for (j = 0; j < opts.num; j++) {
+		i = (j + 100003) % opts.num;
+		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	end_stage(&tdb, &opts, &start, true);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Re-add records (not in order). */
+	begin_stage(tdb, &opts, "Re-adding", &start);
+	for (j = 0; j < opts.num; j++) {
+		i = (j + 100003) % opts.num;
+		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	end_stage(&tdb, &opts, &start, true);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Append to records. */
+	begin_stage(tdb, &opts, "Appending", &start);
+	for (i = 0; i < opts.num; i++)
+		if ((ecode = tdb_append(tdb, key, data)) != TDB_SUCCESS)
+			errx(1, "Appending key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	/* NOTE(review): this stage has never dumped stats; kept that way. */
+	end_stage(&tdb, &opts, &start, false);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Churn records: not in order! */
+	begin_stage(tdb, &opts, "Churning", &start);
+	for (j = 0; j < opts.num; j++) {
+		i = (j + 1000019) % opts.num;
+		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+		i += opts.num;
+		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	end_stage(&tdb, &opts, &start, true);
+
+	tdb_close(tdb);
+	return 0;
+}
diff --git a/lib/tdb2/tools/tdb2dump.c b/lib/tdb2/tools/tdb2dump.c
new file mode 100644
index 0000000000..abe1d9b871
--- /dev/null
+++ b/lib/tdb2/tools/tdb2dump.c
@@ -0,0 +1,115 @@
+/*
+   simple tdb2 dump util
+   Copyright (C) Andrew Tridgell              2001
+   Copyright (C) Rusty Russell                2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "tdb2.h"
+#include <ctype.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Write d to stdout in tdbdump's escaped form: printable bytes are written
+ * as-is, while '"', '\\' and every non-printable byte are written as a
+ * backslash followed by two uppercase hex digits.
+ */
+static void print_data(TDB_DATA d)
+{
+	const unsigned char *p = (const unsigned char *)d.dptr;
+	size_t n;
+
+	/* Index with size_t: storing dsize in an int would truncate it, and
+	 * a negative count would walk far past the end of the buffer. */
+	for (n = 0; n < d.dsize; n++) {
+		unsigned char ch = p[n];
+
+		if (isprint(ch) && ch != '"' && ch != '\\') {
+			fputc(ch, stdout);
+		} else {
+			printf("\\%02X", ch);
+		}
+	}
+}
+
+/* Traverse callback: print one record as a "{ key(n) = "..." data(n) =
+ * "..." }" block on stdout.  Always returns 0. */
+static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+	const int key_len = (int)key.dsize;
+	const int data_len = (int)dbuf.dsize;
+
+	printf("{\nkey(%d) = \"", key_len);
+	print_data(key);
+	printf("\"\ndata(%d) = \"", data_len);
+	print_data(dbuf);
+	printf("\"\n}\n");
+	return 0;
+}
+
+/*
+ * Dump the database in file fname to stdout.
+ *
+ * With keyname == NULL every record is printed through traverse_fn();
+ * otherwise only the value stored under keyname is printed.
+ *
+ * Returns 0 on success, 1 if the database cannot be opened or the key
+ * cannot be fetched.
+ */
+static int dump_tdb(const char *fname, const char *keyname)
+{
+	struct tdb_context *tdb;
+	TDB_DATA key, value;
+	enum TDB_ERROR ecode;
+	int ret = 0;
+
+	tdb = tdb_open(fname, 0, O_RDONLY, 0, NULL);
+	if (!tdb) {
+		printf("Failed to open %s\n", fname);
+		return 1;
+	}
+
+	if (!keyname) {
+		tdb_traverse(tdb, traverse_fn, NULL);
+	} else {
+		key = tdb_mkdata(keyname, strlen(keyname));
+		ecode = tdb_fetch(tdb, key, &value);
+		if (ecode != TDB_SUCCESS) {
+			fprintf(stderr, "Failed to fetch %s: %s\n",
+				keyname, tdb_errorstr(ecode));
+			ret = 1;
+		} else {
+			print_data(value);
+			free(value.dptr);
+		}
+	}
+
+	/* Release the database on every path, including a failed fetch. */
+	tdb_close(tdb);
+	return ret;
+}
+
+/* Print the command-line help text to stdout. */
+static void usage( void)
+{
+	fputs("Usage: tdb2dump [options] <filename>\n\n"
+	      "   -h          this help message\n"
+	      "   -k keyname  dumps value of keyname\n",
+	      stdout);
+}
+
+ /*
+  * tdb2dump [-h] [-k keyname] <filename>
+  *
+  * Dumps the whole database, or only the value stored under keyname when
+  * -k is given.  Returns dump_tdb()'s result; usage errors exit with 1.
+  */
+ int main(int argc, char *argv[])
+{
+	char *fname, *keyname=NULL;
+	int c;
+
+	if (argc < 2) {
+		printf("Usage: tdb2dump <fname>\n");
+		exit(1);
+	}
+
+	while ((c = getopt( argc, argv, "hk:")) != -1) {
+		switch (c) {
+		case 'h':
+			usage();
+			exit( 0);
+		case 'k':
+			keyname = optarg;
+			break;
+		default:
+			usage();
+			exit( 1);
+		}
+	}
+
+	/* Options may have used up every argument (e.g. "tdb2dump -k key"),
+	 * leaving no filename; argv[optind] would then be NULL. */
+	if (optind >= argc) {
+		usage();
+		exit( 1);
+	}
+	fname = argv[optind];
+
+	return dump_tdb(fname, keyname);
+}
diff --git a/lib/tdb2/tools/tdb2restore.c b/lib/tdb2/tools/tdb2restore.c
new file mode 100644
index 0000000000..658215a16c
--- /dev/null
+++ b/lib/tdb2/tools/tdb2restore.c
@@ -0,0 +1,227 @@
+/*
+   tdb2restore -- construct a tdb from tdbdump output.
+   Copyright (C) Volker Lendecke		2010
+   Copyright (C) Simon McVittie			2005
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb2.h"
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define debug_fprintf(file, fmt, ...) do {/*nothing*/} while (0)	/* compiled-out debug hook: expands to an empty statement */
+
+/* Skip to the next '(' and parse the "<n>) = " header up to the opening quote; returns n, or -1 on EOF or malformed input. */
+static int read_linehead(FILE *f)
+{
+	size_t i;
+	int c, num_bytes;
+	char prefix[128];
+
+	do {
+		c = getc(f);
+		if (c == EOF) {
+			return -1;
+		}
+	} while (c != '(');
+
+	for (i=0; i<sizeof(prefix); i++) {
+		c = getc(f);
+		if (c == EOF) {
+			return -1;
+		}
+		prefix[i] = c;
+		if (c == '"') {
+			break;
+		}
+	}
+	if (i == sizeof(prefix)) {
+		return -1;
+	}
+	prefix[i] = '\0';
+
+	/* A negative count would otherwise be converted to a huge size_t by the caller. */
+	if (sscanf(prefix, "%d) = ", &num_bytes) != 1 || num_bytes < 0) {
+		return -1;
+	}
+	return num_bytes;
+}
+
+/* Read one hex digit from stdin; returns its value (0-15), or -1 after printing a diagnostic. */
+static int read_hex(void) {
+	int c = getchar();
+	if (c == EOF) {
+		fprintf(stderr, "Unexpected EOF in data\n");
+		return -1;
+	}
+	if (c == '"') {
+		fprintf(stderr, "Unexpected \\\" sequence\n");
+		return -1;
+	}
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	fprintf(stderr, "Invalid hex: %c\n", c);
+	return -1;
+}
+
+/* Read size bytes of dump-escaped data from f into a newly malloc'd d->dptr; returns 0, or -1 on error (caller frees d->dptr). */
+static int read_data(FILE *f, struct tdb_data *d, size_t size) {
+	int c, low, high;
+	size_t i;
+
+	d->dptr = (unsigned char *)malloc(size ? size : 1);	/* malloc(0) may return NULL */
+	if (d->dptr == NULL) {
+		return -1;
+	}
+	d->dsize = size;
+	for (i=0; i<size; i++) {
+		c = getc(f);
+		if (c == EOF) {
+			fprintf(stderr, "Unexpected EOF in data\n");
+			return -1;	/* was 1, which read_rec() does not treat as failure */
+		} else if (c == '"') {	/* closing quote before size bytes were read */
+			fprintf(stderr, "Unexpected end of data\n");
+			return -1;	/* was 0, leaving the tail of dptr uninitialized */
+		} else if (c == '\\') {
+			high = read_hex();	/* NOTE: read_hex() reads stdin, not f */
+			if (high < 0) {
+				return -1;
+			}
+			high = high << 4;
+			assert(high == (high & 0xf0));
+			low = read_hex();
+			if (low < 0) {
+				return -1;
+			}
+			assert(low == (low & 0x0f));
+			d->dptr[i] = (low|high);
+		} else {
+			d->dptr[i] = c;
+		}
+	}
+	return 0;
+}
+
+/* Consume one line from f and require it to equal s; returns 0 on a match, -1 otherwise. */
+static int swallow(FILE *f, const char *s, int *eof)
+{
+	char line[128];
+	char *got = fgets(line, sizeof(line), f);
+
+	if (got != NULL) {
+		return (strcmp(line, s) == 0) ? 0 : -1;
+	}
+	/* fgets() returned NULL: tell the caller, if it asked, that nothing was read. */
+	if (eof != NULL) {
+		*eof = 1;
+	}
+	return -1;
+}
+
+static bool read_rec(FILE *f, struct tdb_context *tdb, int *eof)
+{
+	int length;
+	struct tdb_data key, data;
+	bool ret = false;
+	enum TDB_ERROR e;
+	/* Parse one brace-delimited key/data stanza from f and insert it into tdb; returns true on success. */
+	key.dptr = NULL;	/* NULL so the free() calls at fail: are safe on early exits */
+	data.dptr = NULL;
+	/* Only this first swallow() is given eof, so *eof is set only when input ends before a new stanza. */
+	if (swallow(f, "{\n", eof) == -1) {
+		goto fail;
+	}
+	length = read_linehead(f);	/* key length from the "key(n) = " header */
+	if (length == -1) {
+		goto fail;
+	}
+	if (read_data(f, &key, length) == -1) {
+		goto fail;
+	}
+	if (swallow(f, "\"\n", NULL) == -1) {
+		goto fail;
+	}
+	length = read_linehead(f);	/* data length from the "data(n) = " header */
+	if (length == -1) {
+		goto fail;
+	}
+	if (read_data(f, &data, length) == -1) {
+		goto fail;
+	}
+	if ((swallow(f, "\"\n", NULL) == -1)
+	    || (swallow(f, "}\n", NULL) == -1)) {
+		goto fail;
+	}
+	e = tdb_store(tdb, key, data, TDB_INSERT);
+	if (e != TDB_SUCCESS) {
+		fprintf(stderr, "TDB error: %s\n", tdb_errorstr(e));
+		goto fail;
+	}
+
+	ret = true;
+fail:
+	free(key.dptr);	/* reached on success too: both buffers are always released here */
+	free(data.dptr);
+	return ret;
+}
+
+/* Create fname (O_EXCL: it must not exist yet) and fill it from dump records on stdin; returns 0 on success, 1 on failure. */
+static int restore_tdb(const char *fname)
+{
+	struct tdb_context *tdb;
+	tdb = tdb_open(fname, 0, O_RDWR|O_CREAT|O_EXCL, 0666, NULL);
+	if (!tdb) {
+		perror("tdb_open");
+		fprintf(stderr, "Failed to open %s\n", fname);
+		return 1;
+	}
+	while (1) {
+		int eof = 0;
+		if (!read_rec(stdin, tdb, &eof)) {
+			if (eof) {
+				break;
+			}
+			tdb_close(tdb);	/* do not leak the handle on a bad record */
+			return 1;
+		}
+	}
+	if (tdb_close(tdb)) {
+		fprintf(stderr, "Error closing tdb\n");
+		return 1;
+	}
+	fprintf(stderr, "EOF\n");
+	return 0;
+}
+
+/* Entry point: rebuild the database named by argv[1] from dump text on stdin. */
+int main(int argc, char *argv[])
+{
+	const char *db_path;
+
+	if (argc < 2) {
+		printf("Usage: %s dbname < tdbdump_output\n", argv[0]);
+		exit(1);
+	}
+
+	db_path = argv[1];
+	return restore_tdb(db_path);
+}
diff --git a/lib/tdb2/tools/tdb2tool.c b/lib/tdb2/tools/tdb2tool.c
new file mode 100644
index 0000000000..cd301c80b7
--- /dev/null
+++ b/lib/tdb2/tools/tdb2tool.c
@@ -0,0 +1,798 @@
+/*
+   Unix SMB/CIFS implementation.
+   Samba database functions
+   Copyright (C) Andrew Tridgell              1999-2000
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000
+   Copyright (C) Andrew Esh                        2001
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb2.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+
+static int do_command(void);	/* forward declaration; defined further down */
+const char *cmdname;	/* command text matched against cmd_table by do_command() */
+char *arg1, *arg2;	/* first and second command arguments; NULL when absent */
+size_t arg1len, arg2len;	/* byte lengths of arg1/arg2 as computed by convert_string() */
+int bIterate = 0;	/* set by "first"; "next" only advances while it is non-zero */
+char *line;	/* NOTE(review): not referenced in the code visible here */
+TDB_DATA iterate_kbuf;	/* key cursor passed to first_record()/next_record() */
+char cmdline[1024];	/* NOTE(review): not referenced in the code visible here */
+static int disable_mmap;	/* flipped by "mmap"; makes create/open pass TDB_NOMMAP */
+
+enum commands {	/* one identifier per tool command; cmd_table maps typed words to these */
+	CMD_CREATE_TDB,
+	CMD_OPEN_TDB,
+	CMD_TRANSACTION_START,
+	CMD_TRANSACTION_COMMIT,
+	CMD_TRANSACTION_CANCEL,
+	CMD_ERASE,
+	CMD_DUMP,
+	CMD_INSERT,
+	CMD_MOVE,
+	CMD_STORE,
+	CMD_SHOW,
+	CMD_KEYS,
+	CMD_HEXKEYS,
+	CMD_DELETE,
+#if 0	/* compiled out together with their handlers in do_command() */
+	CMD_LIST_HASH_FREE,
+	CMD_LIST_FREE,
+#endif
+	CMD_INFO,
+	CMD_MMAP,
+	CMD_SPEED,
+	CMD_FIRST,
+	CMD_NEXT,
+	CMD_SYSTEM,
+	CMD_CHECK,
+	CMD_QUIT,
+	CMD_HELP	/* also what do_command() falls back to when nothing matches */
+};
+
+typedef struct {	/* one row of cmd_table */
+	const char *name;	/* command word as typed; NULL marks the end of the table */
+	enum commands cmd;	/* command selected when name matches */
+} COMMAND_TABLE;
+
+COMMAND_TABLE cmd_table[] = {	/* scanned in order by do_command(); the first name that prefixes the input wins */
+	{"create",	CMD_CREATE_TDB},
+	{"open",	CMD_OPEN_TDB},
+#if 0	/* transaction commands are not reachable from the prompt in this build */
+	{"transaction_start",	CMD_TRANSACTION_START},
+	{"transaction_commit",	CMD_TRANSACTION_COMMIT},
+	{"transaction_cancel",	CMD_TRANSACTION_CANCEL},
+#endif
+	{"erase",	CMD_ERASE},
+	{"dump",	CMD_DUMP},
+	{"insert",	CMD_INSERT},
+	{"move",	CMD_MOVE},
+	{"store",	CMD_STORE},
+	{"show",	CMD_SHOW},
+	{"keys",	CMD_KEYS},
+	{"hexkeys",	CMD_HEXKEYS},
+	{"delete",	CMD_DELETE},
+#if 0
+	{"list",	CMD_LIST_HASH_FREE},
+	{"free",	CMD_LIST_FREE},
+#endif
+	{"info",	CMD_INFO},
+	{"speed",	CMD_SPEED},
+	{"mmap",	CMD_MMAP},
+	{"first",	CMD_FIRST},
+	{"1",		CMD_FIRST},
+	{"next",	CMD_NEXT},
+	{"n",		CMD_NEXT},
+	{"check",	CMD_CHECK},
+	{"quit",	CMD_QUIT},
+	{"q",		CMD_QUIT},
+	{"!",		CMD_SYSTEM},
+	{NULL,		CMD_HELP}	/* sentinel: ends the scan */
+};
+
+struct timeval tp1,tp2;	/* tp1 is set by _start_timer(), tp2 by _end_timer() */
+/* Record the current time in tp1. */
+static void _start_timer(void)
+{
+	gettimeofday(&tp1,NULL);
+}
+/* Record the current time in tp2 and return tp2 - tp1 in seconds as a double. */
+static double _end_timer(void)
+{
+	gettimeofday(&tp2,NULL);
+	return((tp2.tv_sec - tp1.tv_sec) +
+	       (tp2.tv_usec - tp1.tv_usec)*1.0e-6);
+}
+
+static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level,
+		    const char *message, void *priv)
+{	/* Log hook installed through TDB_ATTRIBUTE_LOG: copy the message to stderr unchanged. */
+	fprintf(stderr, "%s", message);
+}
+
+/* a tdb tool for manipulating a tdb database */
+
+static struct tdb_context *tdb;	/* currently open database; NULL when none is open */
+/* Forward declarations: these traverse callbacks are defined after their first use. */
+static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+
+/* Print len bytes of buf to stdout, dropping one trailing NUL and showing non-printable bytes as '.'. */
+static void print_asc(const char *buf,int len)
+{
+	int i;
+
+	/* Probably an ASCII string, so hide the trailing NUL; len is checked first so an empty buffer never reads buf[-1]. */
+	if (len > 0 && buf[len - 1] == 0)
+		len--;
+
+	/* isprint() is only defined for unsigned char values and EOF, so plain char must be cast. */
+	for (i=0;i<len;i++)
+		printf("%c",isprint((unsigned char)buf[i])?buf[i]:'.');
+}
+
+static void print_data(const char *buf,int len)
+{	/* Hex dump of buf: 16 bytes per row with an [offset] prefix, followed by the same bytes via print_asc(). */
+	int i=0;
+	if (len<=0) return;
+	printf("[%03X] ",i);
+	for (i=0;i<len;) {
+		printf("%02X ",(int)((unsigned char)buf[i]));
+		i++;
+		if (i%8 == 0) printf(" ");	/* extra gap after every 8th byte */
+		if (i%16 == 0) {	/* row complete: print its text column in two halves */
+			print_asc(&buf[i-16],8); printf(" ");
+			print_asc(&buf[i-8],8); printf("\n");
+			if (i<len) printf("[%03X] ",i);
+		}
+	}
+	if (i%16) {	/* last row is partial */
+		int n;
+		/* Pad the hex column: three spaces per missing byte plus the gaps a full row would have had. */
+		n = 16 - (i%16);
+		printf(" ");
+		if (n>8) printf(" ");
+		while (n--) printf("   ");
+		/* Text column: up to 8 bytes in the first half, any remainder in the second. */
+		n = i%16;
+		if (n > 8) n = 8;
+		print_asc(&buf[i-(i%16)],n); printf(" ");
+		n = (i%16) - n;
+		if (n>0) print_asc(&buf[i-n],n);
+		printf("\n");
+	}
+}
+
+static void help(void)
+{	/* NOTE(review): the text lists openjh and transaction_* although cmd_table does not enable them, and omits mmap */
+	printf("\n"
+"tdbtool: \n"
+"  create    dbname     : create a database\n"
+"  open      dbname     : open an existing database\n"
+"  openjh    dbname     : open an existing database (jenkins hash)\n"
+"  transaction_start    : start a transaction\n"
+"  transaction_commit   : commit a transaction\n"
+"  transaction_cancel   : cancel a transaction\n"
+"  erase                : erase the database\n"
+"  dump                 : dump the database as strings\n"
+"  keys                 : dump the database keys as strings\n"
+"  hexkeys              : dump the database keys as hex values\n"
+"  info                 : print summary info about the database\n"
+"  insert    key  data  : insert a record\n"
+"  move      key  file  : move a record to a destination tdb\n"
+"  store     key  data  : store a record (replace)\n"
+"  show      key        : show a record by key\n"
+"  delete    key        : delete a record by key\n"
+#if 0
+"  list                 : print the database hash table and freelist\n"
+"  free                 : print the database freelist\n"
+#endif
+"  check                : check the integrity of an opened database\n"
+"  speed                : perform speed tests on the database\n"
+"  ! command            : execute system command\n"
+"  1 | first            : print the first record\n"
+"  n | next             : print the next record\n"
+"  q | quit             : terminate\n"
+"  \\n                   : repeat 'next' command\n"
+"\n");
+}
+
+static void terror(enum TDB_ERROR err, const char *why)
+{	/* Print why to stdout, prefixed by the tdb error text unless err is TDB_SUCCESS. */
+	if (err == TDB_SUCCESS)
+		printf("%s\n", why);
+	else
+		printf("%s:%s\n", tdb_errorstr(err), why);
+}
+
+static void create_tdb(const char *tdbname)
+{	/* "create": close any open database, then create (truncating) tdbname and make it the current one. */
+	union tdb_attribute log_attr;
+	if (!tdbname) { terror(TDB_SUCCESS, "need database name"); return; }	/* "create" typed with no argument */
+	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = tdb_log;
+	if (tdb) tdb_close(tdb);
+	tdb = tdb_open(tdbname, (disable_mmap?TDB_NOMMAP:0),
+		       O_RDWR | O_CREAT | O_TRUNC, 0600, &log_attr);
+	if (!tdb) {
+		printf("Could not create %s: %s\n", tdbname, strerror(errno));
+	}
+}
+
+static void open_tdb(const char *tdbname)
+{	/* "open": close any open database, then open the existing file tdbname read-write as the current one. */
+	union tdb_attribute log_attr;
+	if (!tdbname) { terror(TDB_SUCCESS, "need database name"); return; }	/* "open" typed with no argument */
+	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = tdb_log;
+	if (tdb) tdb_close(tdb);
+	tdb = tdb_open(tdbname, disable_mmap?TDB_NOMMAP:0, O_RDWR, 0600,
+		       &log_attr);
+	if (!tdb) {
+		printf("Could not open %s: %s\n", tdbname, strerror(errno));
+	}
+}
+
+/* "insert": store data under keyname with TDB_INSERT; a missing or empty key is rejected. */
+static void insert_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+	TDB_DATA k, v;
+	enum TDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dptr = (unsigned char *)keyname;
+	k.dsize = keylen;
+	v.dptr = (unsigned char *)data;
+	v.dsize = datalen;
+
+	rc = tdb_store(tdb, k, v, TDB_INSERT);
+	if (rc)
+		terror(rc, "insert failed");
+}
+
+/* "store": write data under keyname with TDB_REPLACE after echoing the record; key and data must both be non-empty. */
+static void store_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+	TDB_DATA k, v;
+	enum TDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	if (!data || !datalen) {
+		terror(TDB_SUCCESS, "need data");
+		return;
+	}
+
+	k.dptr = (unsigned char *)keyname;
+	k.dsize = keylen;
+	v.dptr = (unsigned char *)data;
+	v.dsize = datalen;
+
+	printf("Storing key:\n");
+	print_rec(tdb, k, v, NULL);
+
+	rc = tdb_store(tdb, k, v, TDB_REPLACE);
+	if (rc)
+		terror(rc, "store failed");
+}
+
+/* "show": fetch the record stored under keyname, print it, and release the fetched value. */
+static void show_tdb(char *keyname, size_t keylen)
+{
+	TDB_DATA k, v;
+	enum TDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dptr = (unsigned char *)keyname;
+	k.dsize = keylen;
+
+	rc = tdb_fetch(tdb, k, &v);
+	if (rc) {
+		terror(rc, "fetch failed");
+		return;
+	}
+
+	print_rec(tdb, k, v, NULL);
+	free(v.dptr);
+}
+
+/* "delete": remove the record stored under keyname, reporting any tdb error. */
+static void delete_tdb(char *keyname, size_t keylen)
+{
+	TDB_DATA k;
+	enum TDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dptr = (unsigned char *)keyname;
+	k.dsize = keylen;
+
+	rc = tdb_delete(tdb, k);
+	if (rc)
+		terror(rc, "delete failed");
+}
+
+/* "move": copy the record stored under keyname into the tdb file tdbname; the source record is not deleted. */
+static void move_rec(char *keyname, size_t keylen, char* tdbname)
+{
+	TDB_DATA key, dbuf;
+	struct tdb_context *dst_tdb;
+	enum TDB_ERROR ecode;
+	if ((keyname == NULL) || (keylen == 0)) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	if ( !tdbname ) {
+		terror(TDB_SUCCESS, "need destination tdb name");
+		return;
+	}
+
+	key.dptr = (unsigned char *)keyname;
+	key.dsize = keylen;
+
+	ecode = tdb_fetch(tdb, key, &dbuf);
+	if (ecode) {
+		terror(ecode, "fetch failed");
+		return;
+	}
+
+	print_rec(tdb, key, dbuf, NULL);
+	dst_tdb = tdb_open(tdbname, 0, O_RDWR, 0600, NULL);
+	if ( !dst_tdb ) {
+		terror(TDB_SUCCESS, "unable to open destination tdb");
+		free(dbuf.dptr);	/* the fetched value is ours to release */
+		return;
+	}
+
+	ecode = tdb_store( dst_tdb, key, dbuf, TDB_REPLACE);
+	if (ecode)
+		terror(ecode, "failed to move record");
+	else
+		printf("record moved\n");
+	tdb_close( dst_tdb );
+	free(dbuf.dptr);	/* was leaked on every path after a successful fetch */
+}
+
+static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{	/* Print the key as text via print_asc() and the value as a hex dump via print_data(); always returns 0. */
+	printf("\nkey %d bytes\n", (int)key.dsize);
+	print_asc((const char *)key.dptr, key.dsize);
+	printf("\ndata %d bytes\n", (int)dbuf.dsize);
+	print_data((const char *)dbuf.dptr, dbuf.dsize);
+	return 0;
+}
+
+static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{	/* Traverse callback for "keys": print the key size and the key as text; always returns 0. */
+	printf("key %d bytes: ", (int)key.dsize);
+	print_asc((const char *)key.dptr, key.dsize);
+	printf("\n");
+	return 0;
+}
+
+static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{	/* Traverse callback for "hexkeys": print the key size and a hex dump of the key; always returns 0. */
+	printf("key %d bytes\n", (int)key.dsize);
+	print_data((const char *)key.dptr, key.dsize);
+	printf("\n");
+	return 0;
+}
+
+static int total_bytes;	/* running sum of value sizes seen by traverse_fn() */
+/* Traverse callback used by speed_tdb(): only accumulates the value size. */
+static int traverse_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+	total_bytes += dbuf.dsize;
+	return 0;
+}
+
+/* "info": print the text produced by tdb_summary() for the open database, then free it. */
+static void info_tdb(void)
+{
+	char *text;
+	enum TDB_ERROR rc = tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &text);
+
+	if (rc) {
+		terror(rc, "Getting summary");
+		return;
+	}
+
+	printf("%s", text);
+	free(text);
+}
+
+/* "speed": time store, fetch, transaction and traverse loops for tlimit seconds each (5 when absent or 0) and print ops/sec. */
+static void speed_tdb(const char *tlimit)
+{
+	unsigned timelimit = tlimit?atoi(tlimit):0;
+	double t;
+	int ops;
+	if (timelimit == 0) timelimit = 5;
+
+	ops = 0;
+	printf("Testing store speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		long int r = random();
+		TDB_DATA key, dbuf;
+		key = tdb_mkdata("store test", strlen("store test"));
+		dbuf.dptr = (unsigned char *)&r;
+		dbuf.dsize = sizeof(r);
+		tdb_store(tdb, key, dbuf, TDB_REPLACE);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing fetch speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		TDB_DATA key, dbuf;
+		key = tdb_mkdata("store test", strlen("store test"));
+		/* A successful fetch hands back a malloc'd value; the old loop pre-filled dbuf pointlessly and leaked every result. */
+		if (tdb_fetch(tdb, key, &dbuf) == TDB_SUCCESS)
+			free(dbuf.dptr);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing transaction speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		long int r = random();
+		TDB_DATA key, dbuf;
+		key = tdb_mkdata("transaction test", strlen("transaction test"));
+		dbuf.dptr = (unsigned char *)&r;
+		dbuf.dsize = sizeof(r);
+		tdb_transaction_start(tdb);
+		tdb_store(tdb, key, dbuf, TDB_REPLACE);
+		tdb_transaction_commit(tdb);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing traverse speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		tdb_traverse(tdb, traverse_fn, NULL);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+}
+
+/* "mmap": flip disable_mmap and report the new state; create_tdb()/open_tdb() read the flag. */
+static void toggle_mmap(void)
+{
+	disable_mmap = !disable_mmap;
+	if (!disable_mmap)
+		printf("mmap is enabled\n");
+	else
+		printf("mmap is disabled\n");
+}
+
+static char *tdb_getline(const char *prompt)
+{	/* Prompt, read one stdin line into a static buffer and strip its newline; NULL on EOF or when no newline was read. */
+	static char thisline[1024];
+	char *eol = NULL;
+	fputs(prompt, stdout);
+	thisline[0] = 0;
+	if (fgets(thisline, sizeof(thisline)-1, stdin) != NULL)
+		eol = strchr(thisline, '\n');
+	if (eol != NULL) *eol = 0;
+	return (eol != NULL) ? thisline : NULL;
+}
+
+static int do_delete_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf,
+                     void *state)
+{	/* Traverse callback for "erase": delete the visited key and return tdb_delete()'s result. */
+    return tdb_delete(the_tdb, key);
+}
+
+static void first_record(struct tdb_context *the_tdb, TDB_DATA *pkey)
+{	/* "first": set *pkey to the first key via tdb_firstkey() and print that record. */
+	TDB_DATA dbuf;
+	enum TDB_ERROR ecode = tdb_firstkey(the_tdb, pkey);
+	if (!ecode)
+		ecode = tdb_fetch(the_tdb, *pkey, &dbuf);
+	if (ecode) terror(ecode, "fetch failed");
+	else {
+		print_rec(the_tdb, *pkey, dbuf, NULL);
+		free(dbuf.dptr);	/* fetched value is caller-owned (show_tdb frees it too); was leaked */
+	}
+}
+
+static void next_record(struct tdb_context *the_tdb, TDB_DATA *pkey)
+{	/* "next": advance *pkey via tdb_nextkey() and print the record it now names. */
+	TDB_DATA dbuf;
+	enum TDB_ERROR ecode = tdb_nextkey(the_tdb, pkey);
+	if (!ecode)
+		ecode = tdb_fetch(the_tdb, *pkey, &dbuf);
+	if (ecode) {
+		terror(ecode, "fetch failed");
+		return;
+	}
+	print_rec(the_tdb, *pkey, dbuf, NULL);
+	free(dbuf.dptr);	/* fetched value is caller-owned (show_tdb frees it too); was leaked */
+}
+
+static void check_db(struct tdb_context *the_tdb)
+{	/* "check": run tdb_check() on the_tdb and print the verdict. */
+	if (the_tdb == NULL) {
+		printf("Error: No database opened!\n");
+		return;
+	}
+	if (tdb_check(the_tdb, NULL, NULL) == 0)
+		printf("Database integrity is OK.\n");
+	else
+		printf("Integrity check for the opened database failed.\n");
+}
+
+static int do_command(void)
+{
+	COMMAND_TABLE *ctp = cmd_table;
+	enum commands mycmd = CMD_HELP;
+	int cmd_len;
+	/* Run the command named by the globals cmdname/arg1/arg2; returns 1 only for quit, otherwise 0. */
+	if (cmdname && strlen(cmdname) == 0) {
+		mycmd = CMD_NEXT;	/* empty input repeats "next" */
+	} else {
+		while (ctp->name) {
+			cmd_len = strlen(ctp->name);
+			if (strncmp(ctp->name,cmdname,cmd_len) == 0) {	/* prefix match: cmdname may still carry its arguments */
+				mycmd = ctp->cmd;
+				break;
+			}
+			ctp++;
+		}
+	}
+	/* Commands in the outer switch work without an open database. */
+	switch (mycmd) {
+	case CMD_CREATE_TDB:
+		bIterate = 0;
+		create_tdb(arg1);
+		return 0;
+	case CMD_OPEN_TDB:
+		bIterate = 0;
+		open_tdb(arg1);
+		return 0;
+	case CMD_SYSTEM:
+		/* Shell command */
+		if (system(arg1) == -1) {
+			terror(TDB_SUCCESS, "system() call failed\n");
+		}
+		return 0;
+	case CMD_QUIT:
+		return 1;
+	default:
+		/* all the rest require a open database */
+		if (!tdb) {
+			bIterate = 0;
+			terror(TDB_SUCCESS, "database not open");
+			help();
+			return 0;
+		}
+		switch (mycmd) {
+		case CMD_TRANSACTION_START:
+			bIterate = 0;
+			tdb_transaction_start(tdb);
+			return 0;
+		case CMD_TRANSACTION_COMMIT:
+			bIterate = 0;
+			tdb_transaction_commit(tdb);
+			return 0;
+		case CMD_TRANSACTION_CANCEL:
+			bIterate = 0;
+			tdb_transaction_cancel(tdb);
+			return 0;
+		case CMD_ERASE:
+			bIterate = 0;
+			tdb_traverse(tdb, do_delete_fn, NULL);
+			return 0;
+		case CMD_DUMP:
+			bIterate = 0;
+			tdb_traverse(tdb, print_rec, NULL);
+			return 0;
+		case CMD_INSERT:
+			bIterate = 0;
+			insert_tdb(arg1, arg1len,arg2,arg2len);
+			return 0;
+		case CMD_MOVE:
+			bIterate = 0;
+			move_rec(arg1,arg1len,arg2);
+			return 0;
+		case CMD_STORE:
+			bIterate = 0;
+			store_tdb(arg1,arg1len,arg2,arg2len);
+			return 0;
+		case CMD_SHOW:
+			bIterate = 0;
+			show_tdb(arg1, arg1len);
+			return 0;
+		case CMD_KEYS:
+			tdb_traverse(tdb, print_key, NULL);
+			return 0;
+		case CMD_HEXKEYS:
+			tdb_traverse(tdb, print_hexkey, NULL);
+			return 0;
+		case CMD_DELETE:
+			bIterate = 0;
+			delete_tdb(arg1,arg1len);
+			return 0;
+#if 0
+		case CMD_LIST_HASH_FREE:
+			tdb_dump_all(tdb);
+			return 0;
+		case CMD_LIST_FREE:
+			tdb_printfreelist(tdb);
+			return 0;
+#endif
+		case CMD_INFO:
+			info_tdb();
+			return 0;
+		case CMD_SPEED:
+			speed_tdb(arg1);
+			return 0;
+		case CMD_MMAP:
+			toggle_mmap();
+			return 0;
+		case CMD_FIRST:
+			bIterate = 1;	/* enables CMD_NEXT below */
+			first_record(tdb, &iterate_kbuf);
+			return 0;
+		case CMD_NEXT:
+			if (bIterate)
+				next_record(tdb, &iterate_kbuf);
+			return 0;
+		case CMD_CHECK:
+			check_db(tdb);
+			return 0;
+		case CMD_HELP:
+			help();
+			return 0;
+		case CMD_CREATE_TDB:
+		case CMD_OPEN_TDB:
+		case CMD_SYSTEM:
+		case CMD_QUIT:
+			/*
+			 * unhandled commands.  cases included here to avoid compiler
+			 * warnings.
+			 */
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/* Decode backslash escapes in place (\XX or \X hex, otherwise \c -> c); stores the decoded byte count in *sizep and returns instring. */
+static char *convert_string(char *instring, size_t *sizep)
+{
+	size_t length = 0;
+	char *outp, *inp;
+	char temp[3];
+	outp = inp = instring;
+	while (*inp) {
+		if (*inp == '\\' && inp[1] != '\0') {	/* a lone trailing backslash is copied literally instead of stepping past the terminator */
+			inp++;
+			if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) {
+				temp[0] = *inp++;
+				temp[1] = '\0';
+				if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) {
+					temp[1] = *inp++;
+					temp[2] = '\0';
+				}
+				*outp++ = (char)strtol((const char *)temp,NULL,16);
+			} else {
+				*outp++ = *inp++;
+			}
+		} else {
+			*outp++ = *inp++;
+		}
+		length++;
+	}
+	*outp = '\0';	/* outp never passes inp, so this stays inside the buffer; callers also use the result as a C string */
+	*sizep = length;
+	return instring;
+}
+
+int main(int argc, char *argv[])
+{
+	cmdname = "";
+	arg1 = NULL;
+	arg1len = 0;
+	arg2 = NULL;
+	arg2len = 0;
+	/* A first argument, when present, is opened as the database before anything else runs. */
+	if (argv[1]) {
+		cmdname = "open";
+		arg1 = argv[1];
+		do_command();
+		cmdname =  "";
+		arg1 = NULL;
+	}
+	/* argc 1-2: interactive prompt; argc 3-5: run argv[2] once with up to two arguments. */
+	switch (argc) {
+	case 1:
+	case 2:
+		/* Interactive mode */
+		while ((cmdname = tdb_getline("tdb> "))) {
+			arg2 = arg1 = NULL;
+			if ((arg1 = strchr((const char *)cmdname,' ')) != NULL) {
+				arg1++;	/* the space itself is left in place, so cmdname still spans the arguments */
+				arg2 = arg1;
+				while (*arg2) {
+					if (*arg2 == ' ') {
+						*arg2++ = '\0';	/* unescaped space ends arg1; arg2 starts after it */
+						break;
+					}
+					if ((*arg2++ == '\\') && (*arg2 == ' ')) {
+						arg2++;	/* backslash-space stays inside arg1 */
+					}
+				}
+			}
+			if (arg1) arg1 = convert_string(arg1,&arg1len);
+			if (arg2) arg2 = convert_string(arg2,&arg2len);
+			if (do_command()) break;
+		}
+		break;
+	case 5:
+		arg2 = convert_string(argv[4],&arg2len);	/* no break: continues into case 4 */
+	case 4:
+		arg1 = convert_string(argv[3],&arg1len);	/* no break: continues into case 3 */
+	case 3:
+		cmdname = argv[2];	/* no break: continues into default */
+	default:
+		do_command();
+		break;
+	}
+
+	if (tdb) tdb_close(tdb);
+
+	return 0;
+}
diff --git a/lib/tdb2/tools/tdb2torture.c b/lib/tdb2/tools/tdb2torture.c
new file mode 100644
index 0000000000..f6a7a5064a
--- /dev/null
+++ b/lib/tdb2/tools/tdb2torture.c
@@ -0,0 +1,494 @@
+/* this tests tdb by doing lots of ops from several simultaneous
+   writers - that stresses the locking code.
+*/
+
+#include "tdb2.h"
+#include <stdlib.h>
+#include <err.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/wait.h>
+
+//#define REOPEN_PROB 30
+#define DELETE_PROB 8	/* each *_PROB value N is tested as "random() % N == 0" */
+#define STORE_PROB 4
+#define APPEND_PROB 6
+#define TRANSACTION_PROB 10
+#define TRANSACTION_PREPARE_PROB 2
+#define LOCKSTORE_PROB 5
+#define TRAVERSE_PROB 20
+#define TRAVERSE_MOD_PROB 100
+#define TRAVERSE_ABORT_PROB 500
+#define CULL_PROB 100
+#define KEYLEN 3	/* addrec_db() builds keys of 1..KEYLEN random letters */
+#define DATALEN 100	/* addrec_db() builds values of 1..DATALEN random letters */
+
+static struct tdb_context *db;	/* database handle used by this process */
+static int in_transaction;	/* non-zero while a transaction started by addrec_db() is open */
+static int in_traverse;	/* non-zero while addrec_db() is inside its tdb_traverse() call */
+static int error_count;	/* incremented by fatal(); run_child() stops looping once it is non-zero */
+#if TRANSACTION_PROB
+static int always_transaction = 0;	/* set by the -t option */
+#endif
+static int loopnum;	/* current iteration; written to count_pipe by the SIGUSR1 handler */
+static int count_pipe;	/* write end of the pipe created in main() */
+static union tdb_attribute log_attr;	/* TDB_ATTRIBUTE_LOG entry; main() chains it to seed_attr */
+static union tdb_attribute seed_attr;	/* TDB_ATTRIBUTE_SEED entry filled in by main() */
+
+static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level,
+		    const char *message, void *data)
+{	/* Log hook registered through log_attr: write the message to stdout and flush it immediately. */
+	fputs(message, stdout);
+	fflush(stdout);
+#if 0	/* disabled debugging aid: would start gdb on this process in an xterm */
+	{
+		char str[200];
+		signal(SIGUSR1, SIG_IGN);
+		sprintf(str,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid());
+		system(str);
+	}
+#endif
+}
+
+#include "../private.h"
+
+static void segv_handler(int sig, siginfo_t *info, void *p)
+{
+	char string[100];
+	/* Signal handler installed by run_child(): report the faulting address and the db mapping, then exit with 11. */
+	sprintf(string, "%u: death at %p (map_ptr %p, map_size %zu)\n",
+		getpid(), info->si_addr, db->file->map_ptr,
+		(size_t)db->file->map_size);
+	if (write(2, string, strlen(string)) > 0)
+		sleep(60);	/* NOTE(review): presumably leaves time to attach a debugger — confirm */
+	_exit(11);
+}
+
+static void fatal(struct tdb_context *tdb, const char *why)
+{	/* Print pid, why and the tdb's error string (or "(no tdb)") to stderr and bump error_count; does not exit. */
+	fprintf(stderr, "%u:%s:%s\n", getpid(), why,
+		tdb ? tdb_errorstr(tdb_error(tdb)) : "(no tdb)");
+	error_count++;
+}
+
+/* Return a malloc'd, NUL-terminated string of len random lowercase letters; the caller frees it. */
+static char *randbuf(int len)
+{
+	int i;
+	char *buf = (char *)malloc(len+1);
+	if (buf == NULL) err(1, "randbuf: out of memory");	/* was an unchecked NULL dereference */
+	for (i=0;i<len;i++) {
+		buf[i] = 'a' + (rand() % 26);
+	}
+	buf[i] = 0;
+	return buf;
+}
+
+static void addrec_db(void);
+static int modify_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+			   void *state)
+{	/* Traverse callback used by addrec_db(): randomly delete the visited key, run another operation, or return 1. */
+#if CULL_PROB
+	if (random() % CULL_PROB == 0) {
+		tdb_delete(tdb, key);
+	}
+#endif
+
+#if TRAVERSE_MOD_PROB
+	if (random() % TRAVERSE_MOD_PROB == 0) {
+		addrec_db();	/* in_traverse is non-zero here, so the nested call skips its in_traverse == 0 branches */
+	}
+#endif
+
+#if TRAVERSE_ABORT_PROB
+	if (random() % TRAVERSE_ABORT_PROB == 0)
+		return 1;	/* NOTE(review): presumably a non-zero return stops tdb_traverse() — confirm */
+#endif
+
+	return 0;
+}
+
+static void addrec_db(void)
+{
+	int klen, dlen;
+	char *k, *d;
+	TDB_DATA key, data;
+	/* Perform one randomly chosen operation on db using a fresh random key/value pair. */
+	klen = 1 + (rand() % KEYLEN);
+	dlen = 1 + (rand() % DATALEN);
+
+	k = randbuf(klen);
+	d = randbuf(dlen);
+
+	key.dptr = (unsigned char *)k;
+	key.dsize = klen+1;	/* +1: the terminating NUL is part of the key */
+
+	data.dptr = (unsigned char *)d;
+	data.dsize = dlen+1;	/* +1: the terminating NUL is part of the value */
+
+#if REOPEN_PROB
+	if (in_traverse == 0 && in_transaction == 0 && random() % REOPEN_PROB == 0) {
+		tdb_reopen_all(0);
+		goto next;
+	}
+#endif
+
+#if TRANSACTION_PROB
+	if (in_traverse == 0 && in_transaction == 0 && (always_transaction || random() % TRANSACTION_PROB == 0)) {
+		if (tdb_transaction_start(db) != 0) {
+			fatal(db, "tdb_transaction_start failed");
+		}
+		in_transaction++;
+		goto next;
+	}
+	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
+		if (random() % TRANSACTION_PREPARE_PROB == 0) {
+			if (tdb_transaction_prepare_commit(db) != 0) {
+				fatal(db, "tdb_transaction_prepare_commit failed");
+			}
+		}
+		if (tdb_transaction_commit(db) != 0) {
+			fatal(db, "tdb_transaction_commit failed");
+		}
+		in_transaction--;
+		goto next;
+	}
+
+	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
+		tdb_transaction_cancel(db);
+		in_transaction--;
+		goto next;
+	}
+#endif
+
+#if DELETE_PROB
+	if (random() % DELETE_PROB == 0) {
+		tdb_delete(db, key);
+		goto next;
+	}
+#endif
+
+#if STORE_PROB
+	if (random() % STORE_PROB == 0) {
+		if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+			fatal(db, "tdb_store failed");
+		}
+		goto next;
+	}
+#endif
+
+#if APPEND_PROB
+	if (random() % APPEND_PROB == 0) {
+		if (tdb_append(db, key, data) != 0) {
+			fatal(db, "tdb_append failed");
+		}
+		goto next;
+	}
+#endif
+
+#if LOCKSTORE_PROB
+	if (random() % LOCKSTORE_PROB == 0) {
+		tdb_chainlock(db, key);
+		if (tdb_fetch(db, key, &data) != TDB_SUCCESS) {	/* on success data now holds the fetched value, not d */
+			data.dsize = 0;
+			data.dptr = NULL;
+		}
+		if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+			fatal(db, "tdb_store failed");
+		}
+		if (data.dptr) free(data.dptr);
+		tdb_chainunlock(db, key);
+		goto next;
+	}
+#endif
+
+#if TRAVERSE_PROB
+	/* FIXME: recursive traverses break transactions? */
+	if (in_traverse == 0 && random() % TRAVERSE_PROB == 0) {
+		in_traverse++;
+		tdb_traverse(db, modify_traverse, NULL);
+		in_traverse--;
+		goto next;
+	}
+#endif
+	/* No branch above was taken: do a plain fetch and discard the result. */
+	if (tdb_fetch(db, key, &data) == TDB_SUCCESS)
+		free(data.dptr);
+
+next:
+	free(k);	/* every path ends here and releases both random buffers */
+	free(d);
+}
+
+static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+                       void *state)
+{	/* Traverse callback used at the end of run_child(): delete the visited key and return 0. */
+	tdb_delete(tdb, key);
+	return 0;
+}
+
+static void usage(void)
+{	/* Print the option summary and exit with status 0; [-t] is listed only when TRANSACTION_PROB is non-zero. */
+	printf("Usage: tdbtorture"
+#if TRANSACTION_PROB
+	       " [-t]"
+#endif
+	       " [-k] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-S]\n");
+	exit(0);
+}
+
+static void send_count_and_suicide(int sig)
+{	/* SIGUSR1 handler installed by run_child(): hand loopnum to the parent, then die by SIGUSR2. */
+	/* This ensures our successor can continue where we left off. */
+	if (write(count_pipe, &loopnum, sizeof(loopnum)) != sizeof(loopnum))
+		exit(2);
+	/* This gives a unique signature. */
+	kill(getpid(), SIGUSR2);
+}
+
+/* Worker body: open torture.tdb, run addrec_db() for loops start..num_loops-1, then empty the db; returns the error count capped at 100. */
+static int run_child(int i, int seed, unsigned num_loops, unsigned start,
+		     int tdb_flags)
+{
+	struct sigaction act = { .sa_sigaction = segv_handler,
+				 .sa_flags = SA_SIGINFO };
+	sigaction(SIGSEGV, &act, NULL);
+	db = tdb_open("torture.tdb", tdb_flags, O_RDWR | O_CREAT, 0600,
+		      &log_attr);
+	if (!db) {
+		fatal(NULL, "db open failed");
+		return error_count;	/* nothing to run or close; previously fell through to tdb_close(NULL) */
+	}
+#if 0
+	if (i == 0) {
+		printf("pid %i\n", getpid());
+		sleep(9);
+	} else
+		sleep(10);
+#endif
+
+	srand(seed + i);
+	srandom(seed + i);
+
+	/* Set global, then we're ready to handle being killed. */
+	loopnum = start;
+	signal(SIGUSR1, send_count_and_suicide);
+
+	for (;loopnum<num_loops && error_count == 0;loopnum++) {
+		addrec_db();
+	}
+
+	if (error_count == 0) {
+		tdb_traverse(db, NULL, NULL);
+#if TRANSACTION_PROB
+		if (always_transaction) {
+			while (in_transaction) {
+				tdb_transaction_cancel(db);
+				in_transaction--;
+			}
+			if (tdb_transaction_start(db) != 0)
+				fatal(db, "tdb_transaction_start failed");
+		}
+#endif
+		tdb_traverse(db, traverse_fn, NULL);
+		tdb_traverse(db, traverse_fn, NULL);
+
+#if TRANSACTION_PROB
+		if (always_transaction) {
+			if (tdb_transaction_commit(db) != 0)
+				fatal(db, "tdb_transaction_commit failed");
+		}
+#endif
+	}
+
+	tdb_close(db);
+
+	return (error_count < 100 ? error_count : 100);
+}
+
+/* Entry point: fork num_procs workers on torture.tdb, optionally kill and restart them (-k), then tdb_check() the file; returns the error count. */
+int main(int argc, char * const *argv)
+{
+	int i, seed = -1;
+	int num_loops = 5000;
+	int num_procs = 3;
+	int c, pfds[2];
+	extern char *optarg;
+	pid_t *pids;
+	int kill_random = 0;
+	int *done;
+	int tdb_flags = TDB_DEFAULT;
+
+	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
+	log_attr.base.next = &seed_attr;
+	log_attr.log.fn = tdb_log;
+	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
+
+	while ((c = getopt(argc, argv, "n:l:s:thkS")) != -1) {
+		switch (c) {
+		case 'n':
+			num_procs = strtol(optarg, NULL, 0);
+			break;
+		case 'l':
+			num_loops = strtol(optarg, NULL, 0);
+			break;
+		case 's':
+			seed = strtol(optarg, NULL, 0);
+			break;
+		case 'S':
+			tdb_flags = TDB_NOSYNC;
+			break;
+		case 't':
+#if TRANSACTION_PROB
+			always_transaction = 1;
+#else
+			fprintf(stderr, "Transactions not supported\n");
+			usage();
+#endif
+			break;
+		case 'k':
+			kill_random = 1;
+			break;
+		default:
+			usage();
+		}
+	}
+	unlink("torture.tdb");
+	if (seed == -1) {
+		seed = (getpid() + time(NULL)) & 0x7FFFFFFF;
+	}
+	seed_attr.seed.seed = (((uint64_t)seed) << 32) | seed;
+
+	if (num_procs == 1 && !kill_random) {
+		/* Don't fork for this case, makes debugging easier. */
+		error_count = run_child(0, seed, num_loops, 0, tdb_flags);
+		goto done;
+	}
+	pids = (pid_t *)calloc(sizeof(pid_t), num_procs);
+	done = (int *)calloc(sizeof(int), num_procs);
+	if (!pids || !done)
+		err(1, "Allocating child tables");
+	if (pipe(pfds) != 0) {
+		perror("Creating pipe");
+		exit(1);
+	}
+	count_pipe = pfds[1];
+	for (i=0;i<num_procs;i++) {
+		if ((pids[i]=fork()) == 0) {
+			close(pfds[0]);
+			if (i == 0) {
+				printf("testing with %d processes, %d loops, seed=%d%s\n",
+				       num_procs, num_loops, seed,
+#if TRANSACTION_PROB
+				       always_transaction ? " (all within transactions)" : ""
+#else
+				       ""
+#endif
+					);
+			}
+			exit(run_child(i, seed, num_loops, 0, tdb_flags));
+		}
+	}
+
+	while (num_procs) {
+		int status, j;
+		pid_t pid;
+
+		if (error_count != 0) {
+			/* try and stop the test on any failure */
+			for (j=0;j<num_procs;j++) {
+				if (pids[j] != 0) {
+					kill(pids[j], SIGTERM);
+				}
+			}
+		}
+
+		pid = waitpid(-1, &status, kill_random ? WNOHANG : 0);
+		if (pid == 0) {
+			struct timespec ts;
+
+			/* Sleep for 1/10 second. */
+			ts.tv_sec = 0;
+			ts.tv_nsec = 100000000;
+			nanosleep(&ts, NULL);
+
+			/* Kill someone. */
+			kill(pids[random() % num_procs], SIGUSR1);
+			continue;
+		}
+
+		if (pid == -1) {
+			perror("failed to wait for child\n");
+			exit(1);
+		}
+
+		for (j=0;j<num_procs;j++) {
+			if (pids[j] == pid) break;
+		}
+		if (j == num_procs) {
+			printf("unknown child %d exited!?\n", (int)pid);
+			exit(1);
+		}
+		if (WIFSIGNALED(status)) {
+			if (WTERMSIG(status) == SIGUSR2
+			    || WTERMSIG(status) == SIGUSR1) {
+				/* SIGUSR2 means they wrote to pipe. */
+				if (WTERMSIG(status) == SIGUSR2) {
+					if (read(pfds[0], &done[j],
+						 sizeof(done[j]))
+					    != sizeof(done[j]))
+						err(1,
+						    "Short read from child?");
+				}
+				pids[j] = fork();
+				if (pids[j] == 0)
+					exit(run_child(j, seed, num_loops,
+						       done[j], tdb_flags));
+				printf("Restarting child %i for %u-%u\n",
+				       j, done[j], num_loops);
+				continue;
+			}
+			printf("child %d exited with signal %d\n",
+			       (int)pid, WTERMSIG(status));
+			error_count++;
+		} else {
+			if (WEXITSTATUS(status) != 0) {
+				printf("child %d exited with status %d\n",
+				       (int)pid, WEXITSTATUS(status));
+				error_count++;
+			}
+		}
+		memmove(&pids[j], &pids[j+1],
+			(num_procs - j - 1)*sizeof(pids[0]));
+		memmove(&done[j], &done[j+1],	/* keep done[] aligned with pids[]; it used to go stale here */
+			(num_procs - j - 1)*sizeof(done[0]));
+		num_procs--;
+	}
+	free(pids);
+	free(done);
+
+done:
+	if (error_count == 0) {
+		db = tdb_open("torture.tdb", TDB_DEFAULT, O_RDWR | O_CREAT,
+			      0600, &log_attr);
+		if (!db) {
+			fatal(db, "db open failed");
+			exit(1);
+		}
+		if (tdb_check(db, NULL, NULL) != 0) {
+			fatal(db, "db check failed");
+			exit(1);
+		}
+		tdb_close(db);
+		printf("OK\n");
+	}
+
+	return error_count;
+}
diff --git a/lib/tdb2/transaction.c b/lib/tdb2/transaction.c
new file mode 100644
index 0000000000..b13223bc2e
--- /dev/null
+++ b/lib/tdb2/transaction.c
@@ -0,0 +1,1308 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              2005
+   Copyright (C) Rusty Russell                2010
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+#define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
+
+/*
+  transaction design:
+
+  - only allow a single transaction at a time per database. This makes
+    using the transaction API simpler, as otherwise the caller would
+    have to cope with temporary failures in transactions that conflict
+    with other current transactions
+
+  - keep the transaction recovery information in the same file as the
+    database, using a special 'transaction recovery' record pointed at
+    by the header. This removes the need for extra journal files as
+    used by some other databases
+
+  - dynamically allocate the transaction recovery record, re-using it
+    for subsequent transactions. If a larger record is needed then
+    tdb_free() the old record to place it on the normal tdb freelist
+    before allocating the new record
+
+  - during transactions, keep a linked list of all writes that have
+    been performed by intercepting all tdb_write() calls. The hooked
+    transaction versions of tdb_read() and tdb_write() check this
+    linked list and try to use the elements of the list in preference
+    to the real database.
+
+  - don't allow any locks to be held when a transaction starts,
+    otherwise we can end up with deadlock (plus lack of lock nesting
+    in POSIX locks would mean the lock is lost)
+
+  - if the caller gains a lock during the transaction but doesn't
+    release it then fail the commit
+
+  - allow for nested calls to tdb_transaction_start(), re-using the
+    existing transaction record. If the inner transaction is canceled
+    then a subsequent commit will fail
+
+  - keep a mirrored copy of the tdb hash chain heads to allow for the
+    fast hash heads scan on traverse, updating the mirrored copy in
+    the transaction version of tdb_write
+
+  - allow callers to mix transaction and non-transaction use of tdb,
+    although once a transaction is started then an exclusive lock is
+    gained until the transaction is committed or canceled
+
+  - the commit strategy involves first saving away all modified data
+    into a linearised buffer in the transaction recovery area, then
+    marking the transaction recovery area with a magic value to
+    indicate a valid recovery record. In total 4 fsync/msync calls are
+    needed per commit to prevent race conditions. It might be possible
+    to reduce this to 3 or even 2 with some more work.
+
+  - check for a valid recovery record on open of the tdb, while the
+    open lock is held. Automatically recover from the transaction
+    recovery area if needed, then continue with the open as
+    usual. This allows for smooth crash recovery with no administrator
+    intervention.
+
+  - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
+    still available, but no transaction recovery area is used and no
+    fsync/msync calls are made.
+*/
+
+/*
+  hold the context of any current transaction
+*/
+struct tdb_transaction {
+	/* the original io methods - used to do IOs to the real db */
+	const struct tdb_methods *io_methods;
+
+	/* the transaction blocks, indexed by file offset / PAGESIZE. A
+	   block is allocated in this array when it is first written to */
+	uint8_t **blocks;
+	size_t num_blocks; /* number of slots in blocks[]; never-written slots are NULL */
+	size_t last_block_size; /* number of valid bytes in the last block */
+
+	/* non-zero when an internal transaction error has
+	   occurred. A later prepare_commit then cancels the
+	   transaction and fails */
+	int transaction_error;
+
+	/* when inside a transaction we need to keep track of any
+	   nested tdb_transaction_start() calls, as these are allowed,
+	   but don't create a new transaction */
+	unsigned int nesting;
+
+	/* set when a prepare has already occurred */
+	bool prepared;
+	tdb_off_t magic_offset; /* file offset of the recovery record's magic; 0 until setup_recovery sets it */
+
+	/* old file size before transaction */
+	tdb_len_t old_map_size;
+};
+
+/* This doesn't really need to be pagesize, but we use it for similar reasons. */
+#define PAGESIZE 65536
+
+/*
+  read while in a transaction. Blocks already copied into the transaction are
+  served from memory; anything else is read through the original io methods
+*/
+static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off,
+				       void *buf, tdb_len_t len)
+{
+	size_t blk;
+	enum TDB_ERROR ecode;
+
+	/* break it down into block sized ops */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
+		ecode = transaction_read(tdb, off, buf, len2);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+		len -= len2;
+		off += len2;
+		buf = (void *)(len2 + (char *)buf);
+	}
+
+	if (len == 0) {
+		return TDB_SUCCESS;
+	}
+
+	blk = off / PAGESIZE;
+
+	/* see if we have it in the block list */
+	if (tdb->transaction->num_blocks <= blk ||
+	    tdb->transaction->blocks[blk] == NULL) {
+		/* nope, do a real read */
+		ecode = tdb->transaction->io_methods->tread(tdb, off, buf, len);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+		return TDB_SUCCESS;
+	}
+
+	/* it is in the block list. Now check for the last block */
+	if (blk == tdb->transaction->num_blocks-1) {
+		if (len > tdb->transaction->last_block_size) { /* NOTE(review): ignores off % PAGESIZE - confirm intended */
+			ecode = TDB_ERR_IO;
+			goto fail;
+		}
+	}
+
+	/* now copy it out of this block */
+	memcpy(buf, tdb->transaction->blocks[blk] + (off % PAGESIZE), len);
+	return TDB_SUCCESS;
+
+fail:
+	tdb->transaction->transaction_error = 1; /* a failed read poisons the whole transaction */
+	return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+			  "transaction_read: failed at off=%zu len=%zu",
+			  (size_t)off, (size_t)len);
+}
+
+
+/* write while in a transaction: store len bytes at off in the in-memory
+   PAGESIZE blocks (a NULL buf writes zeroes). Failures are returned as a
+   TDB_ERROR code and mark the transaction as errored. */
+static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off,
+					const void *buf, tdb_len_t len)
+{
+	size_t blk;
+	enum TDB_ERROR ecode;
+
+	/* Only a commit is allowed on a prepared transaction */
+	if (tdb->transaction->prepared) {
+		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
+				   "transaction_write: transaction already"
+				   " prepared, write not allowed");
+		goto fail;
+	}
+
+	/* break it up into block sized chunks */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
+		ecode = transaction_write(tdb, off, buf, len2);
+		if (ecode != TDB_SUCCESS) {
+			return ecode; /* propagate the real error (the recursive call already set transaction_error) */
+		}
+		len -= len2;
+		off += len2;
+		if (buf != NULL) {
+			buf = (const void *)(len2 + (const char *)buf);
+		}
+	}
+
+	if (len == 0) {
+		return TDB_SUCCESS;
+	}
+
+	blk = off / PAGESIZE;
+	off = off % PAGESIZE; /* from here on, off is the offset within the block */
+
+	if (tdb->transaction->num_blocks <= blk) {
+		uint8_t **new_blocks;
+		/* expand the blocks array */
+		if (tdb->transaction->blocks == NULL) {
+			new_blocks = (uint8_t **)malloc(
+				(blk+1)*sizeof(uint8_t *));
+		} else {
+			new_blocks = (uint8_t **)realloc(
+				tdb->transaction->blocks,
+				(blk+1)*sizeof(uint8_t *));
+		}
+		if (new_blocks == NULL) {
+			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					   "transaction_write:"
+					   " failed to allocate");
+			goto fail;
+		}
+		/* the newly added slots start out as "not copied" (NULL) */
+		memset(&new_blocks[tdb->transaction->num_blocks], 0,
+		       (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
+		tdb->transaction->blocks = new_blocks;
+		tdb->transaction->num_blocks = blk+1;
+		tdb->transaction->last_block_size = 0;
+	}
+
+	/* allocate and fill a block? */
+	if (tdb->transaction->blocks[blk] == NULL) {
+		tdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1);
+		if (tdb->transaction->blocks[blk] == NULL) {
+			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					   "transaction_write:"
+					   " failed to allocate");
+			goto fail;
+		}
+		/* pre-load the part of the block that existed before the transaction */
+		if (tdb->transaction->old_map_size > blk * PAGESIZE) {
+			tdb_len_t len2 = PAGESIZE;
+			if (len2 + (blk * PAGESIZE) > tdb->transaction->old_map_size) {
+				len2 = tdb->transaction->old_map_size - (blk * PAGESIZE);
+			}
+			ecode = tdb->transaction->io_methods->tread(tdb,
+					blk * PAGESIZE,
+					tdb->transaction->blocks[blk],
+					len2);
+			if (ecode != TDB_SUCCESS) {
+				ecode = tdb_logerr(tdb, ecode,
+						   TDB_LOG_ERROR,
+						   "transaction_write:"
+						   " failed to"
+						   " read old block: %s",
+						   strerror(errno));
+				SAFE_FREE(tdb->transaction->blocks[blk]);
+				goto fail;
+			}
+			if (blk == tdb->transaction->num_blocks-1) {
+				tdb->transaction->last_block_size = len2;
+			}
+		}
+	}
+
+	/* overwrite part of an existing block */
+	if (buf == NULL) {
+		memset(tdb->transaction->blocks[blk] + off, 0, len);
+	} else {
+		memcpy(tdb->transaction->blocks[blk] + off, buf, len);
+	}
+	if (blk == tdb->transaction->num_blocks-1) {
+		if (len + off > tdb->transaction->last_block_size) {
+			tdb->transaction->last_block_size = len + off;
+		}
+	}
+
+	return TDB_SUCCESS;
+
+fail:
+	tdb->transaction->transaction_error = 1;
+	return ecode;
+}
+
+
+/*
+  write while in a transaction - this variant never expands the transaction blocks, it only
+  updates blocks that are already copied (clipped to last_block_size). This means it cannot change the recovery size
+*/
+static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
+				       const void *buf, tdb_len_t len)
+{
+	size_t blk;
+
+	/* break it up into block sized chunks */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
+		transaction_write_existing(tdb, off, buf, len2);
+		len -= len2;
+		off += len2;
+		if (buf != NULL) {
+			buf = (const void *)(len2 + (const char *)buf);
+		}
+	}
+
+	if (len == 0) {
+		return;
+	}
+
+	blk = off / PAGESIZE;
+	off = off % PAGESIZE; /* off is now the offset within the block */
+
+	/* nothing to update if this block was never copied into the transaction */
+	if (tdb->transaction->num_blocks <= blk ||
+	    tdb->transaction->blocks[blk] == NULL) {
+		return;
+	}
+
+	/* in the last block, never write past the valid bytes */
+	if (blk == tdb->transaction->num_blocks-1 &&
+	    off + len > tdb->transaction->last_block_size) {
+		if (off >= tdb->transaction->last_block_size) {
+			return;
+		}
+		len = tdb->transaction->last_block_size - off;
+	}
+
+	/* overwrite part of an existing block */
+	memcpy(tdb->transaction->blocks[blk] + off, buf, len); /* NOTE(review): buf is not NULL-checked here, unlike the loop above */
+}
+
+
+/*
+  out of bounds check during a transaction: succeeds when len is within the current map_size; logs unless probe is set
+*/
+static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, tdb_off_t len,
+				      bool probe)
+{
+	if (len <= tdb->file->map_size) {
+		return TDB_SUCCESS;
+	}
+	if (!probe) {
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+			   "tdb_oob len %lld beyond transaction size %lld",
+			   (long long)len,
+			   (long long)tdb->file->map_size);
+	}
+	return TDB_ERR_IO; /* returned for probes too, just without the log message */
+}
+
+/*
+  transaction version of tdb_expand(): grows map_size in memory only, by buffering zeroes in the transaction.
+*/
+static enum TDB_ERROR transaction_expand_file(struct tdb_context *tdb,
+					      tdb_off_t addition)
+{
+	enum TDB_ERROR ecode;
+
+	/* add a write to the transaction elements, so subsequent
+	   reads see the zero data (a NULL buf makes transaction_write zero-fill) */
+	ecode = transaction_write(tdb, tdb->file->map_size, NULL, addition);
+	if (ecode == TDB_SUCCESS) {
+		tdb->file->map_size += addition;
+	}
+	return ecode;
+}
+
+static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
+				size_t len, bool write_mode)
+{	/* Returns a pointer for direct access to [off, off+len), or NULL when that range cannot be served directly. */
+	size_t blk = off / PAGESIZE, end_blk;
+
+	/* This is wrong for zero-length blocks, but will fail gracefully */
+	end_blk = (off + len - 1) / PAGESIZE;
+
+	/* Can only do direct if in single block and we've already copied. */
+	if (write_mode) {
+		tdb->stats.transaction_write_direct++;
+		if (blk != end_blk
+		    || blk >= tdb->transaction->num_blocks
+		    || tdb->transaction->blocks[blk] == NULL) {
+			tdb->stats.transaction_write_direct_fail++;
+			return NULL;
+		}
+		return tdb->transaction->blocks[blk] + off % PAGESIZE;
+	}
+
+	tdb->stats.transaction_read_direct++;
+	/* A single block which we have copied? Then point into our copy. */
+	if (blk == end_blk
+	    && blk < tdb->transaction->num_blocks
+	    && tdb->transaction->blocks[blk])
+		return tdb->transaction->blocks[blk] + off % PAGESIZE;
+
+	/* Otherwise every block in the range must be uncopied; fail if any one was copied. */
+	while (blk <= end_blk) {
+		if (blk >= tdb->transaction->num_blocks)
+			break;
+		if (tdb->transaction->blocks[blk]) {
+			tdb->stats.transaction_read_direct_fail++;
+			return NULL;
+		}
+		blk++;
+	}
+	return tdb->transaction->io_methods->direct(tdb, off, len, false); /* untouched range: defer to the original methods */
+}
+
+static const struct tdb_methods transaction_methods = { /* installed as tdb->methods by tdb_transaction_start() */
+	transaction_read, /* NOTE(review): positional initializers - presumably tread, twrite, oob, expand_file, direct; confirm against struct tdb_methods */
+	transaction_write,
+	transaction_oob,
+	transaction_expand_file,
+	transaction_direct,
+};
+
+/*
+  sync to disk: fsync the file and, where MS_SYNC is defined and the file is mapped,
+  msync the range starting at offset rounded down to a page. No-op with TDB_NOSYNC. */
+static enum TDB_ERROR transaction_sync(struct tdb_context *tdb,
+				       tdb_off_t offset, tdb_len_t length)
+{
+	if (tdb->flags & TDB_NOSYNC) {
+		return TDB_SUCCESS;
+	}
+
+	if (fsync(tdb->file->fd) != 0) {
+		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				  "tdb_transaction: fsync failed: %s",
+				  strerror(errno));
+	}
+#ifdef MS_SYNC
+	if (tdb->file->map_ptr) {
+		tdb_off_t moffset = offset & ~(getpagesize()-1); /* round down to a system page boundary */
+		if (msync(moffset + (char *)tdb->file->map_ptr,
+			  length + (offset - moffset), MS_SYNC) != 0) { /* length grows by the amount we rounded down */
+			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					  "tdb_transaction: msync failed: %s",
+					  strerror(errno));
+		}
+	}
+#endif
+	return TDB_SUCCESS;
+}
+
+
+static void _tdb_transaction_cancel(struct tdb_context *tdb)
+{	/* Tears down the current transaction: frees blocks, invalidates any recovery magic, drops locks, restores io methods. */
+	int i;
+	enum TDB_ERROR ecode;
+
+	if (tdb->transaction == NULL) {
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+			   "tdb_transaction_cancel: no transaction");
+		return;
+	}
+
+	if (tdb->transaction->nesting != 0) { /* inner cancel: only poison the outer transaction and unwind one level */
+		tdb->transaction->transaction_error = 1;
+		tdb->transaction->nesting--;
+		return;
+	}
+
+	tdb->file->map_size = tdb->transaction->old_map_size; /* forget any in-transaction expansion */
+
+	/* free all the transaction blocks */
+	for (i=0;i<tdb->transaction->num_blocks;i++) {
+		if (tdb->transaction->blocks[i] != NULL) {
+			free(tdb->transaction->blocks[i]);
+		}
+	}
+	SAFE_FREE(tdb->transaction->blocks);
+
+	if (tdb->transaction->magic_offset) { /* non-zero only once transaction_setup_recovery() has run */
+		const struct tdb_methods *methods = tdb->transaction->io_methods;
+		uint64_t invalid = TDB_RECOVERY_INVALID_MAGIC;
+
+		/* remove the recovery marker */
+		ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
+					&invalid, sizeof(invalid));
+		if (ecode == TDB_SUCCESS)
+			ecode = transaction_sync(tdb,
+						 tdb->transaction->magic_offset,
+						 sizeof(invalid));
+		if (ecode != TDB_SUCCESS) { /* logged only; teardown continues regardless */
+			tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				   "tdb_transaction_cancel: failed to remove"
+				   " recovery magic");
+		}
+	}
+
+	if (tdb->file->allrecord_lock.count)
+		tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
+
+	/* restore the normal io methods */
+	tdb->methods = tdb->transaction->io_methods;
+
+	tdb_transaction_unlock(tdb, F_WRLCK);
+
+	if (tdb_has_open_lock(tdb)) /* the open lock is taken in prepare_commit */
+		tdb_unlock_open(tdb, F_WRLCK);
+
+	SAFE_FREE(tdb->transaction);
+}
+
+/*
+  start a tdb transaction. No token is returned, as only a single transaction is
+  allowed to be pending per tdb_context. The result is also stored in tdb->last_error;
+  a nested start (TDB_ALLOW_NESTING) only bumps the nesting count */
+enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb)
+{
+	enum TDB_ERROR ecode;
+
+	tdb->stats.transactions++;
+	/* some sanity checks */
+	if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+						    TDB_LOG_USE_ERROR,
+						    "tdb_transaction_start:"
+						    " cannot start a"
+						    " transaction on a "
+						    "read-only or internal db");
+	}
+
+	/* cope with nested tdb_transaction_start() calls */
+	if (tdb->transaction != NULL) {
+		if (!(tdb->flags & TDB_ALLOW_NESTING)) {
+			return tdb->last_error
+				= tdb_logerr(tdb, TDB_ERR_IO,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_transaction_start:"
+					     " already inside transaction");
+		}
+		tdb->transaction->nesting++;
+		tdb->stats.transaction_nest++;
+		return tdb->last_error = TDB_SUCCESS; /* record success like every other exit path */
+	}
+
+	if (tdb_has_hash_locks(tdb)) {
+		/* the caller must not have any locks when starting a
+		   transaction as otherwise we'll be screwed by lack
+		   of nested locks in POSIX */
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK,
+						    TDB_LOG_USE_ERROR,
+						    "tdb_transaction_start:"
+						    " cannot start a"
+						    " transaction with locks"
+						    " held");
+	}
+
+	tdb->transaction = (struct tdb_transaction *)
+		calloc(sizeof(struct tdb_transaction), 1);
+	if (tdb->transaction == NULL) {
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM,
+						    TDB_LOG_ERROR,
+						    "tdb_transaction_start:"
+						    " cannot allocate");
+	}
+
+	/* get the transaction write lock. This is a blocking lock. As
+	   discussed with Volker, there are a number of ways we could
+	   make this async, which we will probably do in the future */
+	ecode = tdb_transaction_lock(tdb, F_WRLCK);
+	if (ecode != TDB_SUCCESS) {
+		SAFE_FREE(tdb->transaction->blocks);
+		SAFE_FREE(tdb->transaction);
+		return tdb->last_error = ecode;
+	}
+
+	/* get a read lock over entire file. This is upgraded to a write
+	   lock during the commit */
+	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true);
+	if (ecode != TDB_SUCCESS) {
+		goto fail_allrecord_lock;
+	}
+
+	/* make sure we know about any file expansions already done by
+	   anyone else (probe mode: the result is deliberately ignored) */
+	tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+	tdb->transaction->old_map_size = tdb->file->map_size;
+
+	/* finally hook the io methods, replacing them with
+	   transaction specific methods */
+	tdb->transaction->io_methods = tdb->methods;
+	tdb->methods = &transaction_methods;
+	return tdb->last_error = TDB_SUCCESS;
+
+fail_allrecord_lock:
+	tdb_transaction_unlock(tdb, F_WRLCK);
+	SAFE_FREE(tdb->transaction->blocks);
+	SAFE_FREE(tdb->transaction);
+	return tdb->last_error = ecode;
+}
+
+
+/*
+  cancel the current transaction: counts the cancel in tdb->stats, then does the real teardown
+*/
+void tdb_transaction_cancel(struct tdb_context *tdb)
+{
+	tdb->stats.transaction_cancel++;
+	_tdb_transaction_cancel(tdb);
+}
+
+/*
+  work out how much space the linearised recovery data will consume (worst case): one offset/len header plus the block bytes per copied block below old_map_size
+*/
+static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
+{
+	tdb_len_t recovery_size = 0;
+	size_t i; /* same type as num_blocks */
+
+	recovery_size = 0;
+	for (i=0;i<tdb->transaction->num_blocks;i++) {
+		if ((tdb_off_t)i * PAGESIZE >= tdb->transaction->old_map_size) { /* widen first: int * int overflowed at 2GB */
+			break;
+		}
+		if (tdb->transaction->blocks[i] == NULL) {
+			continue;
+		}
+		recovery_size += 2*sizeof(tdb_off_t); /* offset + length header for this chunk */
+		if (i == tdb->transaction->num_blocks-1) {
+			recovery_size += tdb->transaction->last_block_size;
+		} else {
+			recovery_size += PAGESIZE;
+		}
+	}
+
+	return recovery_size;
+}
+
+static enum TDB_ERROR tdb_recovery_area(struct tdb_context *tdb,
+					const struct tdb_methods *methods,
+					tdb_off_t *recovery_offset,
+					struct tdb_recovery_record *rec)
+{	/* Looks up the recovery area from the header; fills *recovery_offset (0 if none/invalid) and *rec. */
+	enum TDB_ERROR ecode;
+
+	*recovery_offset = tdb_read_off(tdb,
+					offsetof(struct tdb_header, recovery));
+	if (TDB_OFF_IS_ERR(*recovery_offset)) {
+		return *recovery_offset; /* NOTE(review): the offset value itself appears to carry the error code - confirm */
+	}
+
+	if (*recovery_offset == 0) { /* no recovery area has been allocated yet */
+		rec->max_len = 0;
+		return TDB_SUCCESS;
+	}
+
+	ecode = methods->tread(tdb, *recovery_offset, rec, sizeof(*rec));
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	tdb_convert(tdb, rec, sizeof(*rec));
+	/* ignore invalid recovery regions: can happen in crash */
+	if (rec->magic != TDB_RECOVERY_MAGIC &&
+	    rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
+		*recovery_offset = 0;
+		rec->max_len = 0;
+	}
+	return TDB_SUCCESS;
+}
+
+static unsigned int same(const unsigned char *new,
+			 const unsigned char *old,
+			 unsigned int length)
+{	/* Returns the number of leading bytes (0..length) that are equal in new[] and old[]. */
+	unsigned int i;
+
+	for (i = 0; i < length; i++) {
+		if (new[i] != old[i])
+			break;
+	}
+	return i;
+}
+
+static unsigned int different(const unsigned char *new,
+			      const unsigned char *old,
+			      unsigned int length,
+			      unsigned int min_same,
+			      unsigned int *samelen)
+{	/* Returns the length of the leading span to treat as changed; *samelen gets the length of the equal run after it. */
+	unsigned int i;
+
+	*samelen = 0;
+	for (i = 0; i < length; i++) {
+		if (new[i] == old[i]) {
+			(*samelen)++;
+		} else {
+			if (*samelen >= min_same) { /* an equal run of at least min_same just ended: stop before it */
+				return i - *samelen;
+			}
+			*samelen = 0; /* run too short: fold it into the changed span */
+		}
+	}
+
+	if (*samelen < min_same) /* a short equal run at the very end also counts as changed */
+		*samelen = 0;
+	return length - *samelen;
+}
+
+/* Allocates and fills the recovery blob, without tdb_recovery_record at head set up; *len gets the payload size. Returns TDB_ERR_PTR() on failure. */
+static struct tdb_recovery_record *alloc_recovery(struct tdb_context *tdb,
+						  tdb_len_t *len)
+{
+	struct tdb_recovery_record *rec;
+	size_t i;
+	enum TDB_ERROR ecode;
+	unsigned char *p;
+	const struct tdb_methods *old_methods = tdb->methods;
+
+	rec = malloc(sizeof(*rec) + tdb_recovery_size(tdb)); /* header + worst-case payload */
+	if (!rec) {
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+			   "transaction_setup_recovery:"
+			   " cannot allocate");
+		return TDB_ERR_PTR(TDB_ERR_OOM);
+	}
+
+	/* We temporarily revert to the old I/O methods, so we can use
+	 * tdb_access_read (presumably so it returns the pre-transaction data) */
+	tdb->methods = tdb->transaction->io_methods;
+
+	/* build the recovery data into a single blob to allow us to do a single
+	   large write, which should be more efficient */
+	p = (unsigned char *)(rec + 1); /* payload starts right after the header */
+	for (i=0;i<tdb->transaction->num_blocks;i++) {
+		tdb_off_t offset;
+		tdb_len_t length;
+		unsigned int off;
+		const unsigned char *buffer;
+
+		if (tdb->transaction->blocks[i] == NULL) {
+			continue;
+		}
+
+		offset = i * PAGESIZE;
+		length = PAGESIZE;
+		if (i == tdb->transaction->num_blocks-1) {
+			length = tdb->transaction->last_block_size;
+		}
+
+		if (offset >= tdb->transaction->old_map_size) { /* entirely new data: nothing old to save */
+			continue;
+		}
+
+		if (offset + length > tdb->file->map_size) {
+			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					   "tdb_transaction_setup_recovery:"
+					   " transaction data over new region"
+					   " boundary");
+			goto fail;
+		}
+		if (offset + length > tdb->transaction->old_map_size) {
+			/* Short read at EOF. */
+			length = tdb->transaction->old_map_size - offset;
+		}
+		buffer = tdb_access_read(tdb, offset, length, false);
+		if (TDB_PTR_IS_ERR(buffer)) {
+			ecode = TDB_PTR_ERR(buffer);
+			goto fail;
+		}
+
+		/* Skip over anything the same at the start. */
+		off = same(tdb->transaction->blocks[i], buffer, length);
+		offset += off;
+
+		while (off < length) {
+			tdb_len_t len;
+			unsigned int samelen;
+
+			len = different(tdb->transaction->blocks[i] + off,
+					buffer + off, length - off,
+					sizeof(offset) + sizeof(len) + 1, /* only split on equal runs longer than one offset/len header */
+					&samelen);
+
+			/* emit one chunk: converted offset/len header, then the OLD bytes */
+			memcpy(p, &offset, sizeof(offset));
+			memcpy(p + sizeof(offset), &len, sizeof(len));
+			tdb_convert(tdb, p, sizeof(offset) + sizeof(len));
+			p += sizeof(offset) + sizeof(len);
+			memcpy(p, buffer + off, len);
+			p += len;
+			off += len + samelen;
+			offset += len + samelen;
+		}
+		tdb_access_release(tdb, buffer);
+	}
+
+	*len = p - (unsigned char *)(rec + 1); /* payload bytes actually used, header excluded */
+	tdb->methods = old_methods;
+	return rec;
+
+fail:
+	free(rec);
+	tdb->methods = old_methods;
+	return TDB_ERR_PTR(ecode);
+}
+
+static tdb_off_t create_recovery_area(struct tdb_context *tdb,
+				      tdb_len_t rec_length,
+				      struct tdb_recovery_record *rec)
+{	/* Appends a new recovery area at the end of the file, sets rec->max_len, records it in the header; returns its offset. */
+	tdb_off_t off, recovery_off;
+	tdb_len_t addition;
+	enum TDB_ERROR ecode;
+	const struct tdb_methods *methods = tdb->transaction->io_methods;
+
+	/* round up to a multiple of page size. Overallocate, since each
+	 * such allocation forces us to expand the file. */
+	rec->max_len
+		= (((sizeof(*rec) + rec_length + rec_length / 2)
+		    + PAGESIZE-1) & ~(PAGESIZE-1))
+		- sizeof(*rec);
+	off = tdb->file->map_size; /* the area goes after everything, including in-transaction growth */
+
+	/* Restore ->map_size before calling underlying expand_file.
+	   Also so that we don't try to expand the file again in the
+	   transaction commit, which would destroy the recovery
+	   area */
+	addition = (tdb->file->map_size - tdb->transaction->old_map_size) +
+		sizeof(*rec) + rec->max_len;
+	tdb->file->map_size = tdb->transaction->old_map_size;
+	tdb->stats.transaction_expand_file++;
+	ecode = methods->expand_file(tdb, addition);
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, /* error returned as an offset; the caller tests TDB_OFF_IS_ERR() */
+				  "tdb_recovery_allocate:"
+				  " failed to create recovery area");
+	}
+
+	/* we have to reset the old map size so that we don't try to
+	   expand the file again in the transaction commit, which
+	   would destroy the recovery area */
+	tdb->transaction->old_map_size = tdb->file->map_size;
+
+	/* write the recovery header offset and sync - we can sync without a race here
+	   as the magic ptr in the recovery record has not been set */
+	recovery_off = off;
+	tdb_convert(tdb, &recovery_off, sizeof(recovery_off));
+	ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery),
+				&recovery_off, sizeof(tdb_off_t));
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_recovery_allocate:"
+				  " failed to write recovery head");
+	}
+	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery), /* keep any copied header block in step */
+				   &recovery_off,
+				   sizeof(tdb_off_t));
+	return off;
+}
+
+/* setup the recovery data that will be used on a crash during commit: build the
+   blob of old data, make sure a large enough recovery area exists, write and sync
+   header + payload, then write and sync the valid magic (sets magic_offset) */
+static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb)
+{
+	tdb_len_t recovery_size = 0;
+	tdb_off_t recovery_off = 0;
+	tdb_off_t old_map_size = tdb->transaction->old_map_size;
+	struct tdb_recovery_record *recovery;
+	const struct tdb_methods *methods = tdb->transaction->io_methods;
+	uint64_t magic;
+	enum TDB_ERROR ecode;
+
+	recovery = alloc_recovery(tdb, &recovery_size); /* recovery_size is the payload only, header excluded */
+	if (TDB_PTR_IS_ERR(recovery))
+		return TDB_PTR_ERR(recovery);
+
+	ecode = tdb_recovery_area(tdb, methods, &recovery_off, recovery);
+	if (ecode) {
+		free(recovery);
+		return ecode;
+	}
+
+	if (recovery_off == 0 || recovery->max_len < recovery_size) { /* no area yet (never write a record at offset 0), or too small */
+		/* Not large enough. Free up old recovery area. */
+		if (recovery_off) {
+			tdb->stats.frees++;
+			ecode = add_free_record(tdb, recovery_off,
+						sizeof(*recovery)
+						+ recovery->max_len,
+						TDB_LOCK_WAIT, true);
+			free(recovery);
+			if (ecode != TDB_SUCCESS) {
+				return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+						  "tdb_recovery_allocate:"
+						  " failed to free previous"
+						  " recovery area");
+			}
+
+			/* Refresh recovery after add_free_record above. */
+			recovery = alloc_recovery(tdb, &recovery_size);
+			if (TDB_PTR_IS_ERR(recovery))
+				return TDB_PTR_ERR(recovery);
+		}
+
+		recovery_off = create_recovery_area(tdb, recovery_size,
+						    recovery);
+		if (TDB_OFF_IS_ERR(recovery_off)) {
+			free(recovery);
+			return recovery_off;
+		}
+	}
+
+	/* Now we know size, convert rec header. */
+	recovery->magic = TDB_RECOVERY_INVALID_MAGIC;
+	recovery->len = recovery_size;
+	recovery->eof = old_map_size;
+	tdb_convert(tdb, recovery, sizeof(*recovery));
+
+	/* write the recovery data to the recovery area: the header AND the payload behind it */
+	ecode = methods->twrite(tdb, recovery_off, recovery, sizeof(*recovery) + recovery_size);
+	if (ecode != TDB_SUCCESS) {
+		free(recovery);
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_setup_recovery:"
+				  " failed to write recovery data");
+	}
+	transaction_write_existing(tdb, recovery_off, recovery, sizeof(*recovery) + recovery_size);
+
+	free(recovery);
+
+	/* as we don't have ordered writes, we have to sync the recovery
+	   data before we update the magic to indicate that the recovery
+	   data is present */
+	ecode = transaction_sync(tdb, recovery_off, sizeof(struct tdb_recovery_record) + recovery_size);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	magic = TDB_RECOVERY_MAGIC;
+	tdb_convert(tdb, &magic, sizeof(magic));
+
+	tdb->transaction->magic_offset
+		= recovery_off + offsetof(struct tdb_recovery_record, magic);
+
+	ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
+				&magic, sizeof(magic));
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_setup_recovery:"
+				  " failed to write recovery magic");
+	}
+	transaction_write_existing(tdb, tdb->transaction->magic_offset,
+				   &magic, sizeof(magic));
+
+	/* ensure the recovery magic marker is on disk */
+	return transaction_sync(tdb, tdb->transaction->magic_offset,
+				sizeof(magic));
+}
+
+static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{	/* First commit phase: take the write and open locks, write the recovery area (unless TDB_NOSYNC), expand the file. */
+	const struct tdb_methods *methods;
+	enum TDB_ERROR ecode;
+
+	if (tdb->transaction == NULL) {
+		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+				  "tdb_transaction_prepare_commit:"
+				  " no transaction");
+	}
+
+	if (tdb->transaction->prepared) {
+		_tdb_transaction_cancel(tdb);
+		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+				  "tdb_transaction_prepare_commit:"
+				  " transaction already prepared");
+	}
+
+	if (tdb->transaction->transaction_error) {
+		_tdb_transaction_cancel(tdb);
+		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
+				  "tdb_transaction_prepare_commit:"
+				  " transaction error pending");
+	}
+
+
+	if (tdb->transaction->nesting != 0) { /* nested: only the outermost level does real work */
+		return TDB_SUCCESS;
+	}
+
+	/* check for a null transaction */
+	if (tdb->transaction->blocks == NULL) {
+		return TDB_SUCCESS;
+	}
+
+	methods = tdb->transaction->io_methods;
+
+	/* upgrade the main transaction lock region to a write lock */
+	ecode = tdb_allrecord_upgrade(tdb);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* get the open lock - this prevents new users attaching to the database
+	   during the commit */
+	ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Since we have whole db locked, we don't need the expansion lock. */
+	if (!(tdb->flags & TDB_NOSYNC)) {
+		/* Writes the recovery area to the file and sets
+		 * tdb->transaction->magic_offset. */
+		ecode = transaction_setup_recovery(tdb);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+	}
+
+	tdb->transaction->prepared = true; /* from here transaction_write() refuses further writes */
+
+	/* expand the file to the new size if needed */
+	if (tdb->file->map_size != tdb->transaction->old_map_size) {
+		tdb_len_t add;
+
+		add = tdb->file->map_size - tdb->transaction->old_map_size;
+		/* Restore original map size for tdb_expand_file */
+		tdb->file->map_size = tdb->transaction->old_map_size;
+		ecode = methods->expand_file(tdb, add);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+	}
+
+	/* Keep the open lock until the actual commit */
+	return TDB_SUCCESS;
+}
+
+/*
+   prepare to commit the current transaction; the result is also recorded in tdb->last_error
+*/
+enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{
+	return tdb->last_error = _tdb_transaction_prepare_commit(tdb);
+}
+
+/* commit the current transaction: prepare it if that has not been done yet,
+   write every copied block to the file through the original io methods, sync,
+   then tear the transaction down. The result is also stored in tdb->last_error */
+enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb)
+{
+	const struct tdb_methods *methods;
+	size_t i; /* same type as num_blocks */
+	enum TDB_ERROR ecode;
+
+	if (tdb->transaction == NULL) {
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+						    TDB_LOG_USE_ERROR,
+						    "tdb_transaction_commit:"
+						    " no transaction");
+	}
+
+	tdb_trace(tdb, "tdb_transaction_commit");
+
+	if (tdb->transaction->nesting != 0) { /* nested commit: just unwind one level */
+		tdb->transaction->nesting--;
+		return tdb->last_error = TDB_SUCCESS;
+	}
+
+	/* check for a null transaction */
+	if (tdb->transaction->blocks == NULL) {
+		_tdb_transaction_cancel(tdb);
+		return tdb->last_error = TDB_SUCCESS;
+	}
+
+	if (!tdb->transaction->prepared) {
+		ecode = _tdb_transaction_prepare_commit(tdb);
+		if (ecode != TDB_SUCCESS) {
+			_tdb_transaction_cancel(tdb);
+			return tdb->last_error = ecode;
+		}
+	}
+
+	methods = tdb->transaction->io_methods;
+
+	/* perform all the writes */
+	for (i=0;i<tdb->transaction->num_blocks;i++) {
+		tdb_off_t offset;
+		tdb_len_t length;
+
+		if (tdb->transaction->blocks[i] == NULL) {
+			continue;
+		}
+
+		offset = (tdb_off_t)i * PAGESIZE; /* widen first: int * int overflowed at 2GB */
+		length = PAGESIZE;
+		if (i == tdb->transaction->num_blocks-1) {
+			length = tdb->transaction->last_block_size;
+		}
+
+		ecode = methods->twrite(tdb, offset,
+					tdb->transaction->blocks[i], length);
+		if (ecode != TDB_SUCCESS) {
+			/* we've overwritten part of the data and
+			   possibly expanded the file, so we need to
+			   run the crash recovery code */
+			tdb->methods = methods;
+			tdb_transaction_recover(tdb);
+
+			_tdb_transaction_cancel(tdb);
+
+			return tdb->last_error = ecode;
+		}
+		SAFE_FREE(tdb->transaction->blocks[i]);
+	}
+
+	SAFE_FREE(tdb->transaction->blocks);
+	tdb->transaction->num_blocks = 0;
+
+	/* ensure the new data is on disk */
+	ecode = transaction_sync(tdb, 0, tdb->file->map_size);
+	if (ecode != TDB_SUCCESS) {
+		return tdb->last_error = ecode; /* NOTE(review): returns with the transaction still active (no cancel) - confirm intended */
+	}
+
+	/*
+	  TODO: maybe write to some dummy hdr field, or write to magic
+	  offset without mmap, before the last sync, instead of the
+	  utime() call
+	*/
+
+	/* on some systems (like Linux 2.6.x) changes via mmap/msync
+	   don't change the mtime of the file, this means the file may
+	   not be backed up (as tdb rounding to block sizes means that
+	   file size changes are quite rare too). The following forces
+	   mtime changes when a transaction completes */
+#if HAVE_UTIME
+	utime(tdb->name, NULL);
+#endif
+
+	/* use a transaction cancel to free memory and remove the
+	   transaction locks: it "restores" map_size, too. */
+	tdb->transaction->old_map_size = tdb->file->map_size;
+	_tdb_transaction_cancel(tdb);
+
+	return tdb->last_error = TDB_SUCCESS;
+}
+
+
+/*
+  recover from an aborted transaction. Must be called with exclusive
+  database write access already established (including the open
+  lock to prevent new processes attaching)
+
+  Reads the recovery record referenced by the header, writes each saved
+  (offset, length, data) entry back into the file, syncs, and finally
+  invalidates the recovery magic.
+
+  Returns TDB_SUCCESS if there was nothing to recover or the recovery
+  completed; otherwise the error that was logged via tdb_logerr().
+*/
+enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb)
+{
+	tdb_off_t recovery_head, recovery_eof;
+	unsigned char *data, *p;
+	struct tdb_recovery_record rec;
+	enum TDB_ERROR ecode;
+
+	/* find the recovery area */
+	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
+	if (TDB_OFF_IS_ERR(recovery_head)) {
+		return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to read recovery head");
+	}
+
+	if (recovery_head == 0) {
+		/* we have never allocated a recovery record */
+		return TDB_SUCCESS;
+	}
+
+	/* read the recovery record */
+	ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to read recovery record");
+	}
+
+	if (rec.magic != TDB_RECOVERY_MAGIC) {
+		/* there is no valid recovery data */
+		return TDB_SUCCESS;
+	}
+
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " attempt to recover read only database");
+	}
+
+	recovery_eof = rec.eof;
+
+	data = (unsigned char *)malloc(rec.len);
+	if (data == NULL) {
+		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to allocate recovery data");
+	}
+
+	/* read the full recovery data */
+	ecode = tdb->methods->tread(tdb, recovery_head + sizeof(rec), data,
+				    rec.len);
+	if (ecode != TDB_SUCCESS) {
+		/* don't leak the recovery buffer on the error path */
+		free(data);
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to read recovery data");
+	}
+
+	/* recover the file data: the buffer is a sequence of entries,
+	   each an offset and a length followed by that many bytes */
+	/* NOTE(review): an entry's length is not checked against the
+	   bytes remaining in the buffer before it is written back;
+	   confirm the record is validated elsewhere */
+	p = data;
+	while (p+sizeof(tdb_off_t)+sizeof(tdb_len_t) < data + rec.len) {
+		tdb_off_t ofs;
+		tdb_len_t len;
+		tdb_convert(tdb, p, sizeof(ofs) + sizeof(len));
+		/* memcpy rather than a pointer cast: p may be unaligned */
+		memcpy(&ofs, p, sizeof(ofs));
+		memcpy(&len, p + sizeof(ofs), sizeof(len));
+		p += sizeof(ofs) + sizeof(len);
+
+		ecode = tdb->methods->twrite(tdb, ofs, p, len);
+		if (ecode != TDB_SUCCESS) {
+			free(data);
+			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+					  "tdb_transaction_recover:"
+					  " failed to recover %zu bytes"
+					  " at offset %zu",
+					  (size_t)len, (size_t)ofs);
+		}
+		p += len;
+	}
+
+	free(data);
+
+	ecode = transaction_sync(tdb, 0, tdb->file->map_size);
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to sync recovery");
+	}
+
+	/* if the recovery area is after the recovered eof then remove it */
+	if (recovery_eof <= recovery_head) {
+		ecode = tdb_write_off(tdb, offsetof(struct tdb_header,
+						    recovery),
+				      0);
+		if (ecode != TDB_SUCCESS) {
+			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+					  "tdb_transaction_recover:"
+					  " failed to remove recovery head");
+		}
+	}
+
+	/* remove the recovery magic */
+	ecode = tdb_write_off(tdb,
+			      recovery_head
+			      + offsetof(struct tdb_recovery_record, magic),
+			      TDB_RECOVERY_INVALID_MAGIC);
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to remove recovery magic");
+	}
+
+	ecode = transaction_sync(tdb, 0, recovery_eof);
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to sync2 recovery");
+	}
+
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+		   "tdb_transaction_recover: recovered %zu byte database",
+		   (size_t)recovery_eof);
+
+	/* all done */
+	return TDB_SUCCESS;
+}
+
+/*
+  Report whether the database holds a recovery record carrying the
+  valid recovery magic.  Read failures are passed back as the error
+  value; otherwise the result is true or false.
+*/
+tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb)
+{
+	struct tdb_recovery_record record;
+	enum TDB_ERROR status;
+	tdb_off_t head;
+
+	/* locate the recovery area through the header */
+	head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
+	if (TDB_OFF_IS_ERR(head)) {
+		return head;
+	}
+
+	if (head != 0) {
+		/* a recovery area exists: inspect its record */
+		status = tdb_read_convert(tdb, head, &record, sizeof(record));
+		if (status != TDB_SUCCESS) {
+			return status;
+		}
+		return (record.magic == TDB_RECOVERY_MAGIC);
+	}
+
+	/* no recovery record has ever been allocated */
+	return false;
+}
diff --git a/lib/tdb2/traverse.c b/lib/tdb2/traverse.c
new file mode 100644
index 0000000000..179e095142
--- /dev/null
+++ b/lib/tdb2/traverse.c
@@ -0,0 +1,99 @@
+ /*
+   Trivial Database 2: traverse function.
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+
+/*
+ * Walk every record, calling fn (if non-NULL) with the key and data.
+ * A non-zero return from fn stops the walk early.  Returns the number
+ * of records visited, or the error code if iteration failed with
+ * anything other than TDB_ERR_NOEXIST; tdb->last_error is updated
+ * either way.
+ */
+int64_t tdb_traverse_(struct tdb_context *tdb,
+		      int (*fn)(struct tdb_context *,
+				TDB_DATA, TDB_DATA, void *),
+		      void *p)
+{
+	struct traverse_info tinfo;
+	struct tdb_data key, data;
+	enum TDB_ERROR status;
+	int64_t visited = 0;
+
+	key.dptr = NULL;
+	status = first_in_hash(tdb, &tinfo, &key, &data.dsize);
+	while (status == TDB_SUCCESS) {
+		int stop;
+
+		/* the data sits directly after the key in one buffer */
+		data.dptr = key.dptr + key.dsize;
+		visited++;
+
+		stop = (fn != NULL) && fn(tdb, key, data, p);
+		free(key.dptr);
+		if (stop) {
+			tdb->last_error = TDB_SUCCESS;
+			return visited;
+		}
+
+		status = next_in_hash(tdb, &tinfo, &key, &data.dsize);
+	}
+
+	/* running off the end of the hash is the normal way to finish */
+	if (status == TDB_ERR_NOEXIST) {
+		tdb->last_error = TDB_SUCCESS;
+		return visited;
+	}
+	return tdb->last_error = status;
+}
+
+/* Fetch the first key of the database into *key; the result is also
+ * recorded in tdb->last_error. */
+enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key)
+{
+	struct traverse_info cursor;
+
+	tdb->last_error = first_in_hash(tdb, &cursor, key, NULL);
+	return tdb->last_error;
+}
+
+/* Replace *key with the key that follows it.  The old key->dptr is
+ * freed whether or not the lookup succeeds.
+ * We lock twice, not very efficient.  We could keep last key & tinfo cached. */
+enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key)
+{
+	struct tdb_used_record used;
+	struct hash_info hinfo;
+	struct traverse_info cursor;
+
+	/* position the cursor on the current key, under a read lock */
+	cursor.prev = find_and_lock(tdb, *key, F_RDLCK, &hinfo, &used, &cursor);
+	free(key->dptr);
+	if (TDB_OFF_IS_ERR(cursor.prev)) {
+		tdb->last_error = cursor.prev;
+		return tdb->last_error;
+	}
+	tdb_unlock_hashes(tdb, hinfo.hlock_start, hinfo.hlock_range, F_RDLCK);
+
+	tdb->last_error = next_in_hash(tdb, &cursor, key, NULL);
+	return tdb->last_error;
+}
+
+/* Traverse callback: delete one record, store the result in *ecode and
+ * return non-zero (stopping the traverse) if the delete failed. */
+static int wipe_one(struct tdb_context *tdb,
+		    TDB_DATA key, TDB_DATA data, enum TDB_ERROR *ecode)
+{
+	enum TDB_ERROR result = tdb_delete(tdb, key);
+
+	*ecode = result;
+	return (result == TDB_SUCCESS) ? 0 : 1;
+}
+
+/* Delete every record while holding the all-record write lock.  The
+ * outcome is returned and stored in tdb->last_error. */
+enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb)
+{
+	enum TDB_ERROR status;
+	int64_t visited;
+
+	status = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+	if (status == TDB_SUCCESS) {
+		/* FIXME: Be smarter. */
+		visited = tdb_traverse(tdb, wipe_one, &status);
+		if (visited < 0) {
+			/* the traverse itself failed: report its error */
+			status = visited;
+		}
+		tdb_allrecord_unlock(tdb, F_WRLCK);
+	}
+
+	return tdb->last_error = status;
+}
diff --git a/lib/tdb2/wscript b/lib/tdb2/wscript
new file mode 100644
index 0000000000..386768f0fc
--- /dev/null
+++ b/lib/tdb2/wscript
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+
+APPNAME = 'tdb'
+VERSION = '2.0-alpha'
+
+blddir = 'bin'
+
+import sys, os
+
+# find the buildtools directory
+srcdir = '.'
+while not os.path.exists(srcdir+'/buildtools') and len(srcdir.split('/')) < 5:
+    srcdir = '../' + srcdir
+sys.path.insert(0, srcdir + '/buildtools/wafsamba')
+
+import wafsamba, samba_dist, Options, Logs
+
+samba_dist.DIST_DIRS('lib/tdb2:. lib/replace:lib/replace buildtools:buildtools')
+
+# waf hook: register the command-line options for the tdb2 build.
+def set_options(opt):
+    opt.BUILTIN_DEFAULT('replace')
+    opt.PRIVATE_EXTENSION_DEFAULT('tdb2', noextension='tdb2')
+    opt.RECURSE('lib/replace')
+    # Opt-in switch, off by default; the value lands in
+    # Options.options.BUILD_TDB2.
+    opt.add_option('--enable-tdb2-breaks-compat',
+                   help=("Build tdb2 instead of tdb1 (BREAKS TDB1!) [False]"),
+                   action="store_true", dest='BUILD_TDB2', default=False)
+    # --disable-python is only offered when waf is launched from this
+    # directory.
+    if opt.IN_LAUNCH_DIR():
+        opt.add_option('--disable-python',
+                       help=("disable the pytdb module"),
+                       action="store_true", dest='disable_python', default=False)
+
+# waf hook: configure the tdb2 build.  Does nothing unless
+# conf.env.BUILD_TDB2 is already set.
+def configure(conf):
+    if conf.env.BUILD_TDB2:
+        conf.DEFINE('BUILD_TDB2', 1)
+        conf.RECURSE('lib/replace')
+        conf.RECURSE('lib/ccan')
+
+        # Remember whether we are the top-level project and whether the
+        # python module was disabled; build() reads both back from env.
+        conf.env.standalone_tdb2 = conf.IN_LAUNCH_DIR()
+        conf.env.disable_python = getattr(Options.options, 'disable_python', False)
+
+#        if not conf.env.standalone_tdb2:
+#            if conf.CHECK_BUNDLED_SYSTEM('tdb', minversion=VERSION,
+#                                         implied_deps='replace'):
+#                conf.define('USING_SYSTEM_TDB2', 1)
+
+        conf.SAMBA_CONFIG_H()
+
+# waf hook: declare the tdb2 library, its tools and the python module.
+# Does nothing unless bld.env.BUILD_TDB2 is set.
+def build(bld):
+    if bld.env.BUILD_TDB2:
+        bld.RECURSE('lib/replace')
+
+        # A standalone build installs the pkg-config file and public
+        # header and builds a public library; otherwise the library is
+        # private.
+        if bld.env.standalone_tdb2:
+            bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig'
+            bld.PKG_CONFIG_FILES('tdb2.pc', vnum=VERSION)
+            bld.INSTALL_FILES('${INCLUDEDIR}', 'tdb2.h', flat=True)
+            private_library = False
+        else:
+            private_library = True
+
+        # The library and command-line tools are only declared when the
+        # USING_SYSTEM_TDB2 config value is not set.
+        if not bld.CONFIG_SET('USING_SYSTEM_TDB2'):
+            # FIXME: hide_symbols=True, abi_directory='ABI', abi_match='tdb_*',  vnum=VERSION,
+            bld.SAMBA_LIBRARY('tdb',
+                              '''check.c free.c hash.c io.c lock.c open.c
+                                 summary.c tdb.c transaction.c traverse.c''',
+                              deps='replace ccan',
+                              private_library=private_library)
+
+            # tdb2torture is built but not installed.
+            bld.SAMBA_BINARY('tdb2torture',
+                             'tools/tdb2torture.c',
+                             'tdb',
+                             install=False)
+
+            bld.SAMBA_BINARY('tdb2tool',
+                             'tools/tdb2tool.c',
+                             'tdb')
+
+            bld.SAMBA_BINARY('tdb2dump',
+                             'tools/tdb2dump.c',
+                             'tdb')
+
+            bld.SAMBA_BINARY('tdb2restore',
+                             'tools/tdb2restore.c',
+                             'tdb')
+
+        # The python module is declared outside the USING_SYSTEM_TDB2
+        # check; it is enabled unless disable_python was set at configure
+        # time.
+        bld.SAMBA_PYTHON('pytdb',
+                         'pytdb.c',
+                         deps='tdb',
+                         enabled=not bld.env.disable_python,
+                         realname='tdb.so',
+                         cflags='-DPACKAGE_VERSION=\"%s\"' % VERSION)
+
+# Delegates to samba_dist.dist(); the directories to package were
+# registered by the module-level samba_dist.DIST_DIRS() call.
+def dist():
+    '''makes a tarball for distribution'''
+    samba_dist.dist()
+
+# samba_utils is imported inside the function rather than at module
+# level; the work itself is done by samba_utils.reconfigure().
+def reconfigure(ctx):
+    '''reconfigure if config scripts have changed'''
+    import samba_utils
+    samba_utils.reconfigure(ctx)
diff --git a/lib/tdb_compat/tdb_compat.c b/lib/tdb_compat/tdb_compat.c
new file mode 100644
index 0000000000..a9173fc33d
--- /dev/null
+++ b/lib/tdb_compat/tdb_compat.c
@@ -0,0 +1,102 @@
+#include <tdb_compat.h>
+
+/* Note: for the moment, we only need this file for TDB2, so we can
+ * assume waf. */
+#if BUILD_TDB2
+TDB_DATA tdb_null = { NULL, 0 };
+
+/* Lock callback proxy: forwards to the saved flock attribute's lock
+ * function, but always passes waitflag as false so we never block. */
+static int lock_nonblock(int fd, int rw, off_t off, off_t len, bool waitflag,
+			 void *_orig)
+{
+	struct tdb_attribute_flock *real = _orig;
+
+	return (*real->lock)(fd, rw, off, len, false, real->data);
+}
+
+/*
+ * Start a transaction without blocking on locks.
+ *
+ * Temporarily installs a flock attribute whose lock callback is
+ * lock_nonblock(), calls tdb_transaction_start(), then unsets the flock
+ * attribute again.  Returns the error from fetching or setting the
+ * attribute if either fails, otherwise the result of
+ * tdb_transaction_start().
+ */
+enum TDB_ERROR tdb_transaction_start_nonblock(struct tdb_context *tdb)
+{
+	union tdb_attribute locking, orig;
+	enum TDB_ERROR ecode;
+
+	/* Fetch the flock attribute currently in effect. */
+	orig.base.attr = TDB_ATTRIBUTE_FLOCK;
+	ecode = tdb_get_attribute(tdb, &orig);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	/* Replace locking function with our own. */
+	/* The proxy finds the real lock function through its data pointer,
+	 * which therefore points at the saved attribute.
+	 * NOTE(review): anything else copied from orig now also sees &orig
+	 * as its data instead of orig's own data pointer -- confirm this is
+	 * harmless. */
+	locking = orig;
+	locking.flock.data = &orig;
+	locking.flock.lock = lock_nonblock;
+
+	ecode = tdb_set_attribute(tdb, &locking);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	ecode = tdb_transaction_start(tdb);
+	/* NOTE(review): the attribute is unset rather than re-set to orig;
+	 * presumably this falls back to the default locking functions --
+	 * confirm no caller relies on a custom TDB_ATTRIBUTE_FLOCK. */
+	tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+	return ecode;
+}
+
+/*
+ * This handles TDB_CLEAR_IF_FIRST.
+ *
+ * Open hook: if a write lock on byte 63 can be taken without waiting,
+ * the file is truncated to zero length.  In every case a read lock on
+ * that byte is then acquired, waiting if needed.  Returns TDB_ERR_IO if
+ * the truncate or the read lock fails, TDB_SUCCESS otherwise.
+ */
+static enum TDB_ERROR clear_if_first(int fd, void *unused)
+{
+	/* We hold a lock offset 63 always, so we can tell if anyone else is. */
+	struct flock marker;
+
+	marker.l_whence = SEEK_SET;
+	marker.l_start = 63;
+	marker.l_len = 1;
+	marker.l_type = F_WRLCK;
+
+	/* We must be first ones to open it w/ TDB_CLEAR_IF_FIRST if the
+	 * write lock succeeds! */
+	if (fcntl(fd, F_SETLK, &marker) == 0 && ftruncate(fd, 0) != 0) {
+		return TDB_ERR_IO;
+	}
+
+	marker.l_type = F_RDLCK;
+	return (fcntl(fd, F_SETLKW, &marker) != 0) ? TDB_ERR_IO : TDB_SUCCESS;
+}
+
+/*
+ * Open a database through the TDB2 API using TDB1-style arguments.
+ * The hash size argument is ignored.  A log function and the
+ * TDB_CLEAR_IF_FIRST flag are translated into an attribute chain, and
+ * TDB_ALLOW_NESTING is always added to the flags.
+ */
+struct tdb_context *
+tdb_open_compat_(const char *name, int hash_size_unused,
+		 int tdb_flags, int open_flags, mode_t mode,
+		 void (*log_fn)(struct tdb_context *,
+				enum tdb_log_level,
+				const char *message,
+				void *data),
+		 void *log_data)
+{
+	union tdb_attribute hook_attr, log_attr;
+	union tdb_attribute *chain = NULL;
+
+	if (log_fn != NULL) {
+		log_attr.log.base.attr = TDB_ATTRIBUTE_LOG;
+		log_attr.log.base.next = NULL;
+		log_attr.log.fn = log_fn;
+		log_attr.log.data = log_data;
+		chain = &log_attr;
+	}
+
+	if ((tdb_flags & TDB_CLEAR_IF_FIRST) != 0) {
+		/* Handled by an open hook, so strip the flag before
+		 * passing the rest to tdb_open(). */
+		tdb_flags &= ~TDB_CLEAR_IF_FIRST;
+		hook_attr.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
+		hook_attr.openhook.base.next = chain;
+		hook_attr.openhook.fn = clear_if_first;
+		chain = &hook_attr;
+	}
+
+	/* Testsuite uses this to speed things up. */
+	if (getenv("TDB_NO_FSYNC") != NULL) {
+		tdb_flags |= TDB_NOSYNC;
+	}
+
+	return tdb_open(name, tdb_flags|TDB_ALLOW_NESTING, open_flags, mode,
+			chain);
+}
+#endif
diff --git a/lib/tdb_compat/tdb_compat.h b/lib/tdb_compat/tdb_compat.h
new file mode 100644
index 0000000000..ea401cba49
--- /dev/null
+++ b/lib/tdb_compat/tdb_compat.h
@@ -0,0 +1,136 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   Compatibility layer for TDB1 vs TDB2.
+
+   Copyright (C) Rusty Russell 2011
+
+     ** NOTE! The following LGPL license applies to the tdb_compat
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef TDB_COMPAT_H
+#define TDB_COMPAT_H
+
+#include "replace.h"
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#if BUILD_TDB2
+#include <tdb2.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+extern TDB_DATA tdb_null;
+
+/* Old-style tdb_fetch: returns the data by value, or tdb_null when
+ * tdb_fetch() does not succeed. */
+static inline TDB_DATA tdb_fetch_compat(struct tdb_context *tdb, TDB_DATA k)
+{
+	TDB_DATA value;
+
+	return (tdb_fetch(tdb, k, &value) == TDB_SUCCESS) ? value : tdb_null;
+}
+
+/* Old-style tdb_firstkey: returns the first key by value, or tdb_null
+ * when tdb_firstkey() does not succeed. */
+static inline TDB_DATA tdb_firstkey_compat(struct tdb_context *tdb)
+{
+	TDB_DATA first;
+
+	return (tdb_firstkey(tdb, &first) == TDB_SUCCESS) ? first : tdb_null;
+}
+
+/* Old-style tdb_nextkey: returns the key after k by value, or tdb_null
+ * when tdb_nextkey() does not succeed.
+ * Note: this frees the old key.dptr. */
+static inline TDB_DATA tdb_nextkey_compat(struct tdb_context *tdb, TDB_DATA k)
+{
+	return (tdb_nextkey(tdb, &k) == TDB_SUCCESS) ? k : tdb_null;
+}
+
+/* tdb_traverse_read and tdb_traverse are equal: both only take read locks. */
+#define tdb_traverse_read tdb_traverse
+
+/* Old-style tdb_errorstr */
+#define tdb_errorstr_compat(tdb) tdb_errorstr(tdb_error(tdb))
+
+/* This typedef doesn't exist in TDB2. */
+typedef struct tdb_context TDB_CONTEXT;
+
+/* We don't need these any more. */
+#define tdb_reopen_all(flag) 0
+#define tdb_reopen(tdb) 0
+
+/* These no longer exist in tdb2. */
+#define TDB_CLEAR_IF_FIRST 1048576
+#define TDB_INCOMPATIBLE_HASH 0
+#define TDB_VOLATILE 0
+
+/* tdb2 does nonblocking functions via attributes. */
+enum TDB_ERROR tdb_transaction_start_nonblock(struct tdb_context *tdb);
+
+/* Convenient (typesafe) wrapper for tdb open with logging */
+#define tdb_open_compat(name, hsize, tdb_fl, open_fl, mode, log_fn, log_data) \
+	tdb_open_compat_((name), (hsize), (tdb_fl), (open_fl), (mode),	\
+			 typesafe_cb_preargs(void, void *,		\
+					     (log_fn), (log_data),	\
+					     struct tdb_context *,	\
+					     enum tdb_log_level,	\
+					     const char *),		\
+			 (log_data))
+
+struct tdb_context *
+tdb_open_compat_(const char *name, int hash_size_unused,
+		 int tdb_flags, int open_flags, mode_t mode,
+		 void (*log_fn)(struct tdb_context *,
+				enum tdb_log_level,
+				const char *message,
+				void *data),
+		 void *log_data);
+#else
+#include <tdb.h>
+
+/* TDB1 flavour of tdb_open_compat: goes through tdb_open_ex() when a
+ * log function is supplied and through plain tdb_open() otherwise.
+ * FIXME: Inlining this is a bit lazy, but eases S3 build. */
+static inline struct tdb_context *
+tdb_open_compat(const char *name, int hash_size,
+		int tdb_flags, int open_flags, mode_t mode,
+		tdb_log_func log_fn, void *log_private)
+{
+	struct tdb_logging_context log_ctx;
+
+	if (!log_fn) {
+		return tdb_open(name, hash_size, tdb_flags, open_flags, mode);
+	}
+
+	log_ctx.log_private = log_private;
+	log_ctx.log_fn = log_fn;
+	return tdb_open_ex(name, hash_size, tdb_flags, open_flags,
+			   mode, &log_ctx, NULL);
+}
+
+#define tdb_firstkey_compat tdb_firstkey
+/* TDB1 flavour: fetch the key after k, then release k's buffer.
+ * Note: this frees the old key.dptr. */
+static inline TDB_DATA tdb_nextkey_compat(struct tdb_context *tdb, TDB_DATA k)
+{
+	TDB_DATA following;
+
+	following = tdb_nextkey(tdb, k);
+	free(k.dptr);
+	return following;
+}
+#define tdb_errorstr_compat(tdb) tdb_errorstr(tdb)
+#define tdb_fetch_compat tdb_fetch
+#endif
+
+#endif /* TDB_COMPAT_H */
diff --git a/lib/tdb_compat/wscript b/lib/tdb_compat/wscript
new file mode 100644
index 0000000000..574e67e8ef
--- /dev/null
+++ b/lib/tdb_compat/wscript
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+import Options
+
+# waf hook: pull in the option definitions of both tdb flavours, since
+# the choice between them is only made in configure().
+def set_options(opt):
+    opt.RECURSE('lib/tdb2')
+    opt.RECURSE('lib/tdb')
+
+# waf hook: copy the BUILD_TDB2 option (default False) into conf.env and
+# configure exactly one of lib/tdb2 or lib/tdb accordingly.
+def configure(conf):
+    conf.env.BUILD_TDB2 = getattr(Options.options, 'BUILD_TDB2', False)
+
+    if conf.env.BUILD_TDB2:
+        conf.RECURSE('lib/tdb2')
+    else:
+        conf.RECURSE('lib/tdb')
+    # lib/ccan is configured for either flavour.
+    conf.RECURSE('lib/ccan')
+
+# waf hook: build lib/ccan, the tdb flavour selected at configure time,
+# and the private tdb_compat library on top of them.
+def build(bld):
+    bld.RECURSE('lib/ccan')
+    if bld.env.BUILD_TDB2:
+        bld.RECURSE('lib/tdb2')
+    else:
+        bld.RECURSE('lib/tdb')
+    # The dependency is named 'tdb' in both cases.
+    bld.SAMBA_LIBRARY('tdb_compat',
+                      source='tdb_compat.c',
+		      deps='replace tdb ccan',
+                      private_library=True)
diff --git a/lib/tevent/ABI/tevent-0.9.12.sigs b/lib/tevent/ABI/tevent-0.9.12.sigs
new file mode 100644
index 0000000000..df9b08dfd5
--- /dev/null
+++ b/lib/tevent/ABI/tevent-0.9.12.sigs
@@ -0,0 +1,74 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/lib/tevent/tevent.h b/lib/tevent/tevent.h
index 665c491ebb..8204a28fbe 100644
--- a/lib/tevent/tevent.h
+++ b/lib/tevent/tevent.h
@@ -136,7 +136,7 @@ struct tevent_context *tevent_context_init_byname(TALLOC_CTX *mem_ctx, const cha
 const char **tevent_backend_list(TALLOC_CTX *mem_ctx);
 
 /**
- * @brief Set the default tevent backent.
+ * @brief Set the default tevent backend.
  *
  * @param[in]  backend  The name of the backend to set.
  */
@@ -995,6 +995,20 @@ bool _tevent_req_nomem(const void *p,
 	_tevent_req_nomem(p, req, __location__)
 #endif
 
+#ifdef DOXYGEN
+/**
+ * @brief Indicate out of memory to a request
+ *
+ * @param[in]  req      The request being processed.
+ */
+void tevent_req_oom(struct tevent_req *req);
+#else
+void _tevent_req_oom(struct tevent_req *req,
+		     const char *location);
+#define tevent_req_oom(req) \
+	_tevent_req_oom(req, __location__)
+#endif
+
 /**
  * @brief Finish a request before the caller had the change to set the callback.
  *
@@ -1218,7 +1232,7 @@ struct timeval tevent_timeval_current(void);
  *
  * @param[in]  secs     The seconds to set.
  *
- * @param[in]  usecs    The milliseconds to set.
+ * @param[in]  usecs    The microseconds to set.
  *
  * @return              A timeval structure with the given values.
  */
@@ -1253,7 +1267,7 @@ bool tevent_timeval_is_zero(const struct timeval *tv);
  *
  * @param[in]  secs      The seconds to add to the timeval.
  *
- * @param[in]  usecs     The milliseconds to add to the timeval.
+ * @param[in]  usecs     The microseconds to add to the timeval.
  *
  * @return               The timeval structure with the new time.
  */
@@ -1265,7 +1279,7 @@ struct timeval tevent_timeval_add(const struct timeval *tv, uint32_t secs,
  *
  * @param[in]  secs     The seconds of the offset from now.
  *
- * @param[in]  usecs    The milliseconds of the offset from now.
+ * @param[in]  usecs    The microseconds of the offset from now.
  *
  * @return              A timval with the given offset in the future.
  */
diff --git a/lib/tevent/tevent_poll.c b/lib/tevent/tevent_poll.c
index 712255b373..0b782e99bb 100644
--- a/lib/tevent/tevent_poll.c
+++ b/lib/tevent/tevent_poll.c
@@ -233,7 +233,19 @@ static int poll_event_loop_poll(struct tevent_context *ev,
 
 			pfd = &poll_ev->fds[pfd_idx];
 
-			if (pfd->revents & (POLLIN|POLLHUP|POLLERR)) {
+			if (pfd->revents & (POLLHUP|POLLERR)) {
+				/* If we only wait for TEVENT_FD_WRITE, we
+				   should not tell the event handler about it,
+				   and remove the writable flag, as we only
+				   report errors when waiting for read events
+				   to match the select behavior. */
+				if (!(fde->flags & TEVENT_FD_READ)) {
+					TEVENT_FD_NOT_WRITEABLE(fde);
+					continue;
+				}
+				flags |= TEVENT_FD_READ;
+			}
+			if (pfd->revents & POLLIN) {
 				flags |= TEVENT_FD_READ;
 			}
 			if (pfd->revents & POLLOUT) {
diff --git a/lib/tevent/tevent_req.c b/lib/tevent/tevent_req.c
index b0c9c57dde..92697b7df9 100644
--- a/lib/tevent/tevent_req.c
+++ b/lib/tevent/tevent_req.c
@@ -123,6 +123,11 @@ bool _tevent_req_error(struct tevent_req *req,
 	return true;
 }
 
+/* Finish the request in the TEVENT_REQ_NO_MEMORY state, passing along
+ * the caller's source location string. */
+void _tevent_req_oom(struct tevent_req *req, const char *location)
+{
+	tevent_req_finish(req, TEVENT_REQ_NO_MEMORY, location);
+}
+
 bool _tevent_req_nomem(const void *p,
 		       struct tevent_req *req,
 		       const char *location)
@@ -130,7 +135,7 @@ bool _tevent_req_nomem(const void *p,
 	if (p != NULL) {
 		return false;
 	}
-	tevent_req_finish(req, TEVENT_REQ_NO_MEMORY, location);
+	_tevent_req_oom(req, location);
 	return true;
 }
 
diff --git a/lib/tevent/wscript b/lib/tevent/wscript
index 75c44c2962..5dcd18814b 100644
--- a/lib/tevent/wscript
+++ b/lib/tevent/wscript
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 APPNAME = 'tevent'
-VERSION = '0.9.11'
+VERSION = '0.9.12'
 
 blddir = 'bin'
 
diff --git a/lib/torture/torture.c b/lib/torture/torture.c
index a0b35bfe7c..cee6bdb934 100644
--- a/lib/torture/torture.c
+++ b/lib/torture/torture.c
@@ -88,7 +88,7 @@ _PUBLIC_ NTSTATUS torture_temp_dir(struct torture_context *tctx,
 	NT_STATUS_HAVE_NO_MEMORY(*tempdir);
 
 	if (mkdtemp(*tempdir) == NULL) {
-		return map_nt_error_from_unix(errno);
+		return map_nt_error_from_unix_common(errno);
 	}
 
 	return NT_STATUS_OK;
@@ -154,7 +154,7 @@ _PUBLIC_ NTSTATUS torture_deltree_outputdir(struct torture_context *tctx)
 
 	if (local_deltree(tctx->outputdir) == -1) {
 		if (errno != 0) {
-			return map_nt_error_from_unix(errno);
+			return map_nt_error_from_unix_common(errno);
 		}
 		return NT_STATUS_UNSUCCESSFUL;
 	}
diff --git a/lib/tsocket/tsocket_helpers.c b/lib/tsocket/tsocket_helpers.c
index 3a41a3efc3..db6b6148e9 100644
--- a/lib/tsocket/tsocket_helpers.c
+++ b/lib/tsocket/tsocket_helpers.c
@@ -73,7 +73,7 @@ struct tevent_req *tdgram_sendto_queue_send(TALLOC_CTX *mem_ctx,
 			      tdgram_sendto_queue_trigger,
 			      NULL);
 	if (!ok) {
-		tevent_req_nomem(NULL, req);
+		tevent_req_oom(req);
 		goto post;
 	}
 
@@ -346,7 +346,7 @@ struct tevent_req *tstream_readv_pdu_queue_send(TALLOC_CTX *mem_ctx,
 			      tstream_readv_pdu_queue_trigger,
 			      NULL);
 	if (!ok) {
-		tevent_req_nomem(NULL, req);
+		tevent_req_oom(req);
 		goto post;
 	}
 
@@ -453,7 +453,7 @@ struct tevent_req *tstream_writev_queue_send(TALLOC_CTX *mem_ctx,
 			      tstream_writev_queue_trigger,
 			      NULL);
 	if (!ok) {
-		tevent_req_nomem(NULL, req);
+		tevent_req_oom(req);
 		goto post;
 	}
 
diff --git a/lib/util/asn1.c b/lib/util/asn1.c
index b716da63c0..c23bf65b8d 100644
--- a/lib/util/asn1.c
+++ b/lib/util/asn1.c
@@ -885,10 +885,19 @@ bool asn1_read_ContextSimple(struct asn1_data *data, uint8_t num, DATA_BLOB *blo
 bool asn1_read_implicit_Integer(struct asn1_data *data, int *i)
 {
 	uint8_t b;
+	bool first_byte = true;
 	*i = 0;
 
 	while (!data->has_error && asn1_tag_remaining(data)>0) {
 		if (!asn1_read_uint8(data, &b)) return false;
+		if (first_byte) {
+			if (b & 0x80) {
+				/* Number is negative.
+				   Set i to -1 for sign extend. */
+				*i = -1;
+			}
+			first_byte = false;
+		}
 		*i = (*i << 8) + b;
 	}
 	return !data->has_error;	
diff --git a/lib/util/byteorder.h b/lib/util/byteorder.h
index 59ad8371e4..6bcf71e83b 100644
--- a/lib/util/byteorder.h
+++ b/lib/util/byteorder.h
@@ -201,18 +201,29 @@ static __inline__ void st_le32(uint32_t *addr, const uint32_t val)
 
 #endif /* not CAREFUL_ALIGNMENT */
 
+/* 64 bit macros */
+#define BVAL(p, ofs) (IVAL(p,ofs) | (((uint64_t)IVAL(p,(ofs)+4)) << 32))
+#define BVALS(p, ofs) ((int64_t)BVAL(p,ofs))
+#define SBVAL(p, ofs, v) (SIVAL(p,ofs,(v)&0xFFFFFFFF), SIVAL(p,(ofs)+4,((uint64_t)(v))>>32))
+#define SBVALS(p, ofs, v) (SBVAL(p,ofs,(uint64_t)v))
+
 /* now the reverse routines - these are used in nmb packets (mostly) */
 #define SREV(x) ((((x)&0xFF)<<8) | (((x)>>8)&0xFF))
 #define IREV(x) ((SREV(x)<<16) | (SREV((x)>>16)))
+#define BREV(x) ((IREV(x)<<32) | (IREV((x)>>32)))
 
 #define RSVAL(buf,pos) SREV(SVAL(buf,pos))
 #define RSVALS(buf,pos) SREV(SVALS(buf,pos))
 #define RIVAL(buf,pos) IREV(IVAL(buf,pos))
 #define RIVALS(buf,pos) IREV(IVALS(buf,pos))
+#define RBVAL(buf,pos) BREV(BVAL(buf,pos))
+#define RBVALS(buf,pos) BREV(BVALS(buf,pos))
 #define RSSVAL(buf,pos,val) SSVAL(buf,pos,SREV(val))
 #define RSSVALS(buf,pos,val) SSVALS(buf,pos,SREV(val))
 #define RSIVAL(buf,pos,val) SIVAL(buf,pos,IREV(val))
 #define RSIVALS(buf,pos,val) SIVALS(buf,pos,IREV(val))
+#define RSBVAL(buf,pos,val) SBVAL(buf,pos,BREV(val))
+#define RSBVALS(buf,pos,val) SBVALS(buf,pos,BREV(val))
 
 /* Alignment macros. */
 #define ALIGN4(p,base) ((p) + ((4 - (PTR_DIFF((p), (base)) & 3)) & 3))
@@ -222,10 +233,4 @@ static __inline__ void st_le32(uint32_t *addr, const uint32_t val)
 /* macros for accessing SMB protocol elements */
 #define VWV(vwv) ((vwv)*2)
 
-/* 64 bit macros */
-#define BVAL(p, ofs) (IVAL(p,ofs) | (((uint64_t)IVAL(p,(ofs)+4)) << 32))
-#define BVALS(p, ofs) ((int64_t)BVAL(p,ofs))
-#define SBVAL(p, ofs, v) (SIVAL(p,ofs,(v)&0xFFFFFFFF), SIVAL(p,(ofs)+4,((uint64_t)(v))>>32))
-#define SBVALS(p, ofs, v) (SBVAL(p,ofs,(uint64_t)v))
-
 #endif /* _BYTEORDER_H */
diff --git a/lib/util/charset/CP437.c b/lib/util/charset/CP437.c
new file mode 100644
index 0000000000..1e478d678f
--- /dev/null
+++ b/lib/util/charset/CP437.c
@@ -0,0 +1,135 @@
+/* 
+ * Conversion table for CP437 charset also known as IBM437
+ *
+ * Copyright (C) Alexander Bokovoy		2003
+ *
+ * Conversion tables are generated using GNU libc 2.2.5's 
+ * localedata/charmaps/IBM437 table and source/script/gen-8bit-gap.sh script
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *  
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *  
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "includes.h"
+
+static const uint16_t to_ucs2[256] = {
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
+ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
+ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
+ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
+ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
+ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
+ 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
+ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
+ 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
+ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
+ 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
+ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
+ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
+ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
+ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
+ 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
+ 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
+ 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
+ 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
+ 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0,
+};
+
+static const struct charset_gap_table from_idx[] = {
+  { 0x0000, 0x007f,     0 },
+  { 0x00a0, 0x00c9,   -32 },
+  { 0x00d1, 0x00ff,   -39 },
+  { 0x0192, 0x0192,  -185 },
+  { 0x0393, 0x0398,  -697 },
+  { 0x03a3, 0x03a9,  -707 },
+  { 0x03b1, 0x03b5,  -714 },
+  { 0x03c0, 0x03c6,  -724 },
+  { 0x207f, 0x207f, -8076 },
+  { 0x20a7, 0x20a7, -8115 },
+  { 0x2219, 0x221e, -8484 },
+  { 0x2229, 0x2229, -8494 },
+  { 0x2248, 0x2248, -8524 },
+  { 0x2261, 0x2265, -8548 },
+  { 0x2310, 0x2310, -8718 },
+  { 0x2320, 0x2321, -8733 },
+  { 0x2500, 0x2502, -9211 },
+  { 0x250c, 0x251c, -9220 },
+  { 0x2524, 0x2524, -9227 },
+  { 0x252c, 0x252c, -9234 },
+  { 0x2534, 0x2534, -9241 },
+  { 0x253c, 0x253c, -9248 },
+  { 0x2550, 0x256c, -9267 },
+  { 0x2580, 0x2593, -9286 },
+  { 0x25a0, 0x25a0, -9298 },
+  { 0xffff, 0xffff,     0 }
+};
+
+static const unsigned char from_ucs2[] = {
+
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+  0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+  0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+  0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+  0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+  0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+  0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+  0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+  0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00,
+  0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00,
+  0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa,
+  0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8,
+  0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80,
+  0x00, 0x90, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00,
+  0xe1, 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91,
+  0x87, 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c,
+  0x8b, 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94,
+  0xf6, 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00,
+  0x98, 0x9f, 0xe2, 0x00, 0x00, 0x00, 0x00, 0xe9,
+  0xe4, 0x00, 0x00, 0xe8, 0x00, 0x00, 0xea, 0xe0,
+  0x00, 0x00, 0xeb, 0xee, 0xe3, 0x00, 0x00, 0xe5,
+  0xe7, 0x00, 0xed, 0xfc, 0x9e, 0xf9, 0xfb, 0x00,
+  0x00, 0x00, 0xec, 0xef, 0xf7, 0xf0, 0x00, 0x00,
+  0xf3, 0xf2, 0xa9, 0xf4, 0xf5, 0xc4, 0x00, 0xb3,
+  0xda, 0x00, 0x00, 0x00, 0xbf, 0x00, 0x00, 0x00,
+  0xc0, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00,
+  0xc3, 0xb4, 0xc2, 0xc1, 0xc5, 0xcd, 0xba, 0xd5,
+  0xd6, 0xc9, 0xb8, 0xb7, 0xbb, 0xd4, 0xd3, 0xc8,
+  0xbe, 0xbd, 0xbc, 0xc6, 0xc7, 0xcc, 0xb5, 0xb6,
+  0xb9, 0xd1, 0xd2, 0xcb, 0xcf, 0xd0, 0xca, 0xd8,
+  0xd7, 0xce, 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00,
+  0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00,
+  0x00, 0x00, 0xde, 0xb0, 0xb1, 0xb2, 0xfe,
+};
+
+SMB_GENERATE_CHARSET_MODULE_8_BIT_GAP(CP437)
diff --git a/lib/util/charset/CP850.c b/lib/util/charset/CP850.c
new file mode 100644
index 0000000000..87a76f4cdf
--- /dev/null
+++ b/lib/util/charset/CP850.c
@@ -0,0 +1,121 @@
+/* 
+ * Conversion table for CP850 charset also known as IBM850.
+ *
+ * Copyright (C) Alexander Bokovoy		2003
+ *
+ * Conversion tables are generated using GNU libc 2.2.5's 
+ * localedata/charmaps/IBM850 table and source/script/gen-8bit-gap.sh script
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *  
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *  
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "includes.h"
+
+static const uint16_t to_ucs2[256] = {
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
+ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
+ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
+ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
+ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
+ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
+ 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
+ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
+ 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
+ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
+ 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
+ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
+ 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
+ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
+ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x00A4,
+ 0x00F0, 0x00D0, 0x00CA, 0x00CB, 0x00C8, 0x0131, 0x00CD, 0x00CE,
+ 0x00CF, 0x2518, 0x250C, 0x2588, 0x2584, 0x00A6, 0x00CC, 0x2580,
+ 0x00D3, 0x00DF, 0x00D4, 0x00D2, 0x00F5, 0x00D5, 0x00B5, 0x00FE,
+ 0x00DE, 0x00DA, 0x00DB, 0x00D9, 0x00FD, 0x00DD, 0x00AF, 0x00B4,
+ 0x00AD, 0x00B1, 0x2017, 0x00BE, 0x00B6, 0x00A7, 0x00F7, 0x00B8,
+ 0x00B0, 0x00A8, 0x00B7, 0x00B9, 0x00B3, 0x00B2, 0x25A0, 0x00A0,
+};
+
+static const struct charset_gap_table from_idx[] = {
+    /* start, end, idx */
+  { 0x0000, 0x007f, 0 },
+  { 0x00a0, 0x00ff, -32 },
+  { 0x0131, 0x0131, -81 },
+  { 0x0192, 0x0192, -177 },
+  { 0x2017, 0x2017, -7989 },
+  { 0x2500, 0x2502, -9245 },
+  { 0x250c, 0x251c, -9254 },
+  { 0x2524, 0x2524, -9261 },
+  { 0x252c, 0x252c, -9268 },
+  { 0x2534, 0x2534, -9275 },
+  { 0x253c, 0x253c, -9282 },
+  { 0x2550, 0x256c, -9301 },
+  { 0x2580, 0x2588, -9320 },
+  { 0x2591, 0x2593, -9328 },
+  { 0x25a0, 0x25a0, -9340 },
+  { 0xffff, 0xffff, 0 }
+};
+static const unsigned char from_ucs2[] = {
+
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+  0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+  0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+  0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+  0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+  0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+  0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+  0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+  0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5,
+  0xf9, 0xb8, 0xa6, 0xae, 0xaa, 0xf0, 0xa9, 0xee,
+  0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa,
+  0xf7, 0xfb, 0xa7, 0xaf, 0xac, 0xab, 0xf3, 0xa8,
+  0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80,
+  0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8,
+  0xd1, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0x9e,
+  0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0xed, 0xe8, 0xe1,
+  0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87,
+  0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b,
+  0xd0, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6,
+  0x9b, 0x97, 0xa3, 0x96, 0x81, 0xec, 0xe7, 0x98,
+  0xd5, 0x9f, 0xf2, 0xc4, 0x00, 0xb3, 0xda, 0x00,
+  0x00, 0x00, 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00,
+  0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0xc3, 0xb4,
+  0xc2, 0xc1, 0xc5, 0xcd, 0xba, 0x00, 0x00, 0xc9,
+  0x00, 0x00, 0xbb, 0x00, 0x00, 0xc8, 0x00, 0x00,
+  0xbc, 0x00, 0x00, 0xcc, 0x00, 0x00, 0xb9, 0x00,
+  0x00, 0xcb, 0x00, 0x00, 0xca, 0x00, 0x00, 0xce,
+  0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00,
+  0xdb, 0xb0, 0xb1, 0xb2, 0xfe,
+};
+
+SMB_GENERATE_CHARSET_MODULE_8_BIT_GAP(CP850)
+
diff --git a/lib/util/charset/charcnv.c b/lib/util/charset/charcnv.c
index 998bb08fd7..076795a0b2 100644
--- a/lib/util/charset/charcnv.c
+++ b/lib/util/charset/charcnv.c
@@ -113,138 +113,3 @@ convert:
 	return destlen;
 
 }
-
-/**
- * Convert string from one encoding to another, making error checking etc
- *
- * @param src pointer to source string (multibyte or singlebyte)
- * @param srclen length of the source string in bytes
- * @param dest pointer to destination string (multibyte or singlebyte)
- * @param destlen maximal length allowed for string
- * @returns the number of bytes occupied in the destination
- * on error, returns -1, and sets errno
- **/
-_PUBLIC_ bool convert_string_error_handle(struct smb_iconv_handle *ic,
-					  charset_t from, charset_t to,
-					  void const *src, size_t srclen,
-					  void *dest, size_t destlen,
-					  size_t *converted_size)
-{
-	size_t i_len, o_len;
-	ssize_t retval;
-	const char* inbuf = (const char*)src;
-	char* outbuf = (char*)dest;
-	smb_iconv_t descriptor;
-
-	if (srclen == (size_t)-1)
-		srclen = strlen(inbuf)+1;
-
-	descriptor = get_conv_handle(ic, from, to);
-	if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
-		if (converted_size) {
-			*converted_size = 0;
-		}
-		errno = EINVAL;
-		return -1;
-	}
-
-	i_len=srclen;
-	o_len=destlen;
-
-	retval = smb_iconv(descriptor,  &inbuf, &i_len, &outbuf, &o_len);
-
-	if (converted_size != NULL)
-		*converted_size = destlen-o_len;
-	return (retval != (ssize_t)-1);
-}
-
-
-/**
- * Convert string from one encoding to another, making error checking etc
- *
- * @param src pointer to source string (multibyte or singlebyte)
- * @param srclen length of the source string in bytes
- * @param dest pointer to destination string (multibyte or singlebyte)
- * @param destlen maximal length allowed for string
- * @returns the number of bytes occupied in the destination
- **/
-_PUBLIC_ bool convert_string_handle(struct smb_iconv_handle *ic,
-					 charset_t from, charset_t to,
-					 void const *src, size_t srclen,
-					 void *dest, size_t destlen, size_t *converted_size)
-{
-	bool retval;
-
-	retval = convert_string_error_handle(ic, from, to, src, srclen, dest, destlen, converted_size);
-	if(retval==false) {
-	    	const char *reason;
-		switch(errno) {
-		case EINVAL:
-			reason="Incomplete multibyte sequence";
-			return false;
-		case E2BIG:
-			reason="No more room";
-			if (from == CH_UNIX) {
-				DEBUG(0,("E2BIG: convert_string_handle(%s,%s): srclen=%d destlen=%d - '%s'\n",
-					 charset_name(ic, from), charset_name(ic, to),
-					 (int)srclen, (int)destlen,
-					 (const char *)src));
-			} else {
-				DEBUG(0,("E2BIG: convert_string_handle(%s,%s): srclen=%d destlen=%d\n",
-					 charset_name(ic, from), charset_name(ic, to),
-					 (int)srclen, (int)destlen));
-			}
-			return false;
-		case EILSEQ:
-			reason="Illegal multibyte sequence";
-			return false;
-		default:
-			return false;
-		}
-	}
-	return true;
-}
-	
-/**
- * Convert between character sets, allocating a new buffer using talloc for the result.
- *
- * @param srclen length of source buffer.
- * @param dest always set at least to NULL
- * @note -1 is not accepted for srclen.
- *
- * @returns Size in bytes of the converted string; or -1 in case of error.
- **/
-
-_PUBLIC_ bool convert_string_talloc_handle(TALLOC_CTX *ctx,
-						struct smb_iconv_handle *ic,
-						charset_t from, charset_t to, 
-						void const *src, size_t srclen, 
-						void *dst, size_t *converted_size)
-{
-	void **dest = (void **)dst;
-	smb_iconv_t descriptor;
-	ssize_t ret;
-
-	*dest = NULL;
-
-	if (src == NULL || srclen == (size_t)-1 || srclen == 0)
-		return false;
-
-	descriptor = get_conv_handle(ic, from, to);
-
-	if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
-		/* conversion not supported, return -1*/
-		DEBUG(3, ("convert_string_talloc_handle: conversion from %s to %s not supported!\n",
-			  charset_name(ic, from), 
-			  charset_name(ic, to)));
-		return false;
-	}
-
-	ret = iconv_talloc(ctx, descriptor, src, srclen, dest);
-	if (ret == -1)
-		return false;
-	if (converted_size != NULL)
-		*converted_size = ret;
-	return true;
-}
-
diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h
index 1078035592..b36c461003 100644
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -28,7 +28,7 @@
 #include <talloc.h>
 
 /* this defines the charset types used in samba */
-typedef enum {CH_UTF16LE=0, CH_UTF16=0, CH_UNIX, CH_DISPLAY, CH_DOS, CH_UTF8, CH_UTF16BE, CH_UTF16MUNGED} charset_t;
+typedef enum {CH_UTF16LE=0, CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE, CH_UTF16MUNGED} charset_t;
 
 #define NUM_CHARSETS 7
 
@@ -105,11 +105,6 @@ typedef struct smb_iconv_s {
 struct loadparm_context;
 struct smb_iconv_handle;
 
-/* replace some string functions with multi-byte
-   versions */
-#define strlower(s) strlower_m(s)
-#define strupper(s) strupper_m(s)
-
 char *strchr_m(const char *s, char c);
 /**
  * Calculate the number of units (8 or 16-bit, depending on the
@@ -137,8 +132,6 @@ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
 			const char *s1, const char *s2);
 int strcasecmp_m(const char *s1, const char *s2);
 size_t count_chars_m(const char *s, char c);
-void strupper_m(char *s);
-void strlower_m(char *s);
 char *strupper_talloc(TALLOC_CTX *ctx, const char *src);
 char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src);
 char *strupper_talloc_n_handle(struct smb_iconv_handle *iconv_handle,
@@ -155,6 +148,7 @@ bool strhasupper_handle(struct smb_iconv_handle *ic,
 			const char *string);
 char *strrchr_m(const char *s, char c);
 char *strchr_m(const char *s, char c);
+char *strstr_m(const char *src, const char *findstr);
 
 bool push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
 bool push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src, size_t *converted_size);
@@ -188,8 +182,7 @@ extern struct smb_iconv_handle *global_iconv_handle;
 struct smb_iconv_handle *get_iconv_handle(void);
 struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx, 
 						  const char *dos_charset, 
-						  const char *unix_charset, 
-						  const char *display_charset);
+						  const char *unix_charset);
 smb_iconv_t get_conv_handle(struct smb_iconv_handle *ic,
 			    charset_t from, charset_t to);
 const char *charset_name(struct smb_iconv_handle *ic, charset_t ch);
@@ -218,7 +211,6 @@ int codepoint_cmpi(codepoint_t c1, codepoint_t c2);
 struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
 							   const char *dos_charset,
 							   const char *unix_charset,
-							   const char *display_charset,
 							   bool native_iconv,
 							   struct smb_iconv_handle *old_ic);
 
@@ -285,7 +277,7 @@ static size_t CHARSETNAME ## _push(void *cd, const char **inbuf, size_t *inbytes
 		int i; 										\
 		int done = 0; 									\
 												\
-		uint16 ch = SVAL(*inbuf,0); 							\
+		uint16_t ch = SVAL(*inbuf,0); 							\
 												\
 		for (i=0; from_idx[i].start != 0xffff; i++) {					\
 			if ((from_idx[i].start <= ch) && (from_idx[i].end >= ch)) {		\
diff --git a/lib/util/charset/charset_macosxfs.c b/lib/util/charset/charset_macosxfs.c
new file mode 100644
index 0000000000..4d2ba5b6ff
--- /dev/null
+++ b/lib/util/charset/charset_macosxfs.c
@@ -0,0 +1,605 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba charset module for Mac OS X/Darwin
+   Copyright (C) Benjamin Riefenstahl 2003
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * modules/charset_macosxfs.c
+ *
+ * A Samba charset module to use on Mac OS X/Darwin as the filesystem
+ * and display encoding.
+ *
+ * Actually two implementations are provided here.  The default
+ * implementation is based on the official CFString API.  The other is
+ * based on internal CFString APIs as defined in the OpenDarwin
+ * source.
+ */
+
+#include "includes.h"
+#undef realloc
+
+/*
+ * Include OS frameworks.  These are only needed in this module.
+ */
+#include <CoreFoundation/CFString.h>
+
+/*
+ * See if autoconf has found us the internal headers in some form.
+ */
+#if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H
+#	include <CoreFoundation/CFStringEncodingConverter.h>
+#	include <CoreFoundation/CFUnicodePrecomposition.h>
+#	define USE_INTERNAL_API 1
+#elif HAVE_CFSTRINGENCODINGCONVERTER_H
+#	include <CFStringEncodingConverter.h>
+#	include <CFUnicodePrecomposition.h>
+#	define USE_INTERNAL_API 1
+#endif
+
+/*
+ * Compile time configuration: Do we want debug output?
+ */
+/* #define DEBUG_STRINGS 1 */
+
+/*
+ * A simple, but efficient memory provider for our buffers.
+ */
+static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
+{
+	if (newsize <= *size) {
+		return buffer;	/* current capacity already suffices */
+	}
+	*size = newsize + 128;	/* grow with 128 bytes of slack */
+	return realloc(buffer, *size);	/* result is passed back unchecked */
+}
+
+/*
+ * While there is a version of OpenDarwin for intel, the usual case is
+ * big-endian PPC.  So we need byte swapping to handle the
+ * little-endian byte order of the network protocol.  We also need an
+ * additional dynamic buffer to do this work for incoming data blocks,
+ * because we have to consider the original data as constant.
+ *
+ * We abstract the differences away by providing a simple facade with
+ * these functions/macros:
+ *
+ *	le_to_native(dst,src,len)
+ *	native_to_le(cp,len)
+ *	set_ucbuffer_with_le(buffer,bufsize,data,size)
+ *	set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
+ */
+#ifdef WORDS_BIGENDIAN
+
+static inline void swap_bytes (char * dst, const char * src, size_t len)
+{
+	size_t pos;	/* byte offset of the current 16-bit unit */
+
+	/* Copy src to dst, exchanging the two bytes of every unit. */
+	for (pos = 0; pos < len; pos += 2) {
+		dst[pos] = src[pos + 1];
+		dst[pos + 1] = src[pos];
+	}
+}
+static inline void swap_bytes_inplace (char * cp, size_t len)
+{
+	size_t pos;	/* byte offset of the current 16-bit unit */
+
+	/* Exchange the two bytes of every unit without a second buffer. */
+	for (pos = 0; pos < len; pos += 2) {
+		const char first = cp[pos];
+		cp[pos] = cp[pos + 1];
+		cp[pos + 1] = first;
+	}
+}
+
+#define le_to_native(dst,src,len)	swap_bytes(dst,src,len)
+#define native_to_le(cp,len)		swap_bytes_inplace(cp,len)
+#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
+	set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
+
+#else	/* ! WORDS_BIGENDIAN */
+
+#define le_to_native(dst,src,len)	memcpy(dst,src,len)
+#define native_to_le(cp,len)		/* nothing */
+#define	set_ucbuffer_with_le(buffer,bufsize,data,size) \
+	(((void)(bufsize)),(UniChar*)(data))
+
+#endif
+
+static inline UniChar *set_ucbuffer_with_le_copy (
+	UniChar *buffer, size_t *bufsize,
+	const void *data, size_t size, size_t reserve)
+{
+	UniChar *native = resize_buffer(buffer, bufsize, size + reserve);
+	le_to_native((char *)native, data, size);	/* swap or plain copy */
+	return native;	/* may differ from the buffer passed in */
+}
+
+
+/*
+ * A simple hexdump function for debugging error conditions.
+ */
+#define	debug_out(s)	DEBUG(0,("%s",(s)))	/* log s verbatim, never as a format */
+
+#ifdef DEBUG_STRINGS
+
+static void hexdump( const char * label, const char * s, size_t len )
+{	/* Log label, then the len bytes at s as hex + text rows of up to 16. */
+	size_t restlen = len;	/* bytes not yet dumped */
+	debug_out("<<<<<<<\n");
+	debug_out(label);
+	debug_out("\n");
+	while (restlen > 0) {	/* one output row per iteration */
+		char line[100];
+		size_t i, j;
+		char * d = line;	/* write cursor into line */
+#undef sprintf	/* NOTE(review): presumably lifts a project sprintf macro - confirm */
+		d += sprintf(d, "%04X ", (unsigned)(len-restlen));	/* row offset */
+		*d++ = ' ';
+		for( i = 0; i<restlen && i<8; ++i ) {	/* hex, bytes 0..7 */
+			d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
+		}
+		for( j = i; j<8; ++j ) {	/* pad a short first half */
+			d += sprintf(d, "   ");
+		}
+		*d++ = ' ';
+		for( i = 8; i<restlen && i<16; ++i ) {	/* hex, bytes 8..15 */
+			d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
+		}
+		for( j = i; j<16; ++j ) {	/* pad a short second half */
+			d += sprintf(d, "   ");
+		}
+		*d++ = ' ';
+		for( i = 0; i<restlen && i<16; ++i ) {	/* text column */
+			if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i]))
+				*d++ = '.';	/* placeholder for unprintable bytes */
+			else
+				*d++ = s[i];
+		}
+		*d++ = '\n';
+		*d = 0;
+		restlen -= i;	/* i is the number of bytes shown in this row */
+		s += i;
+		debug_out(line);
+	}
+	debug_out(">>>>>>>\n");
+}
+
+#else	/* !DEBUG_STRINGS */
+
+#define hexdump(label,s,len) /* nothing */
+
+#endif
+
+
+#if !USE_INTERNAL_API
+
+/*
+ * An implementation based on documented Mac OS X APIs.
+ *
+ * This does a certain amount of memory management, creating and
+ * manipulating CFString objects.  We try to minimize the impact by
+ * keeping those objects around and re-using them.  We also use
+ * external backing store for the CFStrings where this is possible and
+ * beneficial.
+ *
+ * The Unicode normalizations forms available at this level are
+ * generic, not specifically for the file system.  So they may not be
+ * perfect fits.
+ */
+static size_t macosxfs_encoding_pull(
+	void *cd,				/* Encoder handle */
+	char **inbuf, size_t *inbytesleft,	/* Script string */
+	char **outbuf, size_t *outbytesleft)	/* UTF-16-LE string */
+{
+	static const int script_code = kCFStringEncodingUTF8;
+	static CFMutableStringRef cfstring = NULL;
+	size_t outsize;
+	CFRange range;
+
+	/* Convert all of *inbuf (UTF-8) to composed UTF-16-LE in *outbuf.
+	 * Returns 0, or (size_t)-1 with errno set to EILSEQ or E2BIG. */
+	(void) cd; /* UNUSED */
+
+	if (0 == *inbytesleft) {
+		return 0;
+	}
+
+	if (NULL == cfstring) {
+		/*
+		 * A version with an external backing store as in the
+		 * push function should have been more efficient, but
+		 * testing shows, that it is actually slower (!).
+		 * Maybe kCFAllocatorDefault gets shortcut evaluation
+		 * internally, while kCFAllocatorNull doesn't.
+		 */
+		cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
+	}
+
+	/* Three methods of appending to a CFString, choose the most
+	 * efficient. */
+	if (0 == (*inbuf)[*inbytesleft-1]) {
+		CFStringAppendCString(cfstring, *inbuf, script_code);
+	} else if (*inbytesleft <= 255) {
+		Str255 buffer;
+		buffer[0] = *inbytesleft;
+		memcpy(buffer+1, *inbuf, buffer[0]);
+		CFStringAppendPascalString(cfstring, buffer, script_code);
+	} else {
+		/*
+		 * A fixed buffer and a loop would be nicer, but then we
+		 * can't guarantee well-formed UTF-8 input.  Append a
+		 * NUL-terminated copy: *inbuf is unterminated here.
+		 */
+		static char *buffer = NULL;
+		static size_t buflen = 0;
+		buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
+		memcpy(buffer, *inbuf, *inbytesleft);
+		buffer[*inbytesleft] = 0;
+		CFStringAppendCString(cfstring, buffer, script_code);
+	}
+
+	/*
+	 * Compose characters (Unicode normalization form C); the
+	 * lengths below are counted in UTF-16 code units.
+	 */
+	CFStringNormalize(cfstring, kCFStringNormalizationFormC);
+
+	outsize = CFStringGetLength(cfstring);
+	range = CFRangeMake(0,outsize);
+
+	if (outsize == 0) {
+		/*
+		 * HACK: smbd/mangle_hash2.c:is_legal_name() expects
+		 * errors here.  That function will always pass 2
+		 * characters.  smbd/open.c:check_for_pipe() cuts a
+		 * pathname to 10 characters blindly.  Suppress the
+		 * debug output in those cases.
+		 */
+		if(2 != *inbytesleft && 10 != *inbytesleft) {
+			debug_out("String conversion: "
+				  "An unknown error occurred\n");
+			hexdump("UTF8->UTF16LE (old) input",
+				*inbuf, *inbytesleft);
+		}
+		errno = EILSEQ; /* Not sure, but this is what we have
+				 * actually seen. */
+		return -1;
+	}
+	if (outsize*2 > *outbytesleft) {
+		CFStringDelete(cfstring, range);
+		debug_out("String conversion: "
+			  "Output buffer too small\n");
+		hexdump("UTF8->UTF16LE (old) input",
+			*inbuf, *inbytesleft);
+		errno = E2BIG;
+		return -1;
+	}
+
+	CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
+	CFStringDelete(cfstring, range);
+
+	native_to_le(*outbuf, outsize*2);
+
+	/*
+	 * Add a converted null byte, if the CFString conversions
+	 * prevented that until now.
+	 */
+	if (0 == (*inbuf)[*inbytesleft-1] &&
+	    (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
+
+		if ((outsize*2+2) > *outbytesleft) {
+			debug_out("String conversion: "
+				  "Output buffer too small\n");
+			hexdump("UTF8->UTF16LE (old) input",
+				*inbuf, *inbytesleft);
+			errno = E2BIG;
+			return -1;
+		}
+
+		(*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
+		++outsize;	/* one UTF-16 unit: outsize counts units, not bytes */
+	}
+
+	*inbuf += *inbytesleft;
+	*inbytesleft = 0;
+	*outbuf += outsize*2;
+	*outbytesleft -= outsize*2;
+
+	return 0;
+}
+
+static size_t macosxfs_encoding_push(
+	void *cd,				/* Encoder handle */
+	char **inbuf, size_t *inbytesleft,	/* UTF-16-LE string */
+	char **outbuf, size_t *outbytesleft)	/* Script string */
+{
+	static const int script_code = kCFStringEncodingUTF8;
+	static CFMutableStringRef cfstring = NULL;
+	static UniChar *buffer = NULL;
+	static size_t buflen = 0;
+	CFIndex outsize, cfsize, charsconverted;
+
+	(void) cd; /* UNUSED */
+
+	if (0 == *inbytesleft) {
+		return 0;
+	}
+
+	/*
+	 * We need a buffer that can hold 4 times the original data,
+	 * because that is the theoretical maximum that decomposition
+	 * can create currently (in Unicode 4.0).
+	 */
+	buffer = set_ucbuffer_with_le_copy(
+		buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
+
+	if (NULL == cfstring) {
+		cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
+			kCFAllocatorDefault,
+			buffer, *inbytesleft/2, buflen/2,
+			kCFAllocatorNull);
+	} else {
+		CFStringSetExternalCharactersNoCopy(
+			cfstring,
+			buffer, *inbytesleft/2, buflen/2);
+	}
+
+	/*
+	 * Decompose characters into Unicode normalization form D
+	 * (kCFStringNormalizationFormD).
+	 *
+	 * NB: This isn't exactly what HFS+ wants (see note on
+	 * kCFStringEncodingUseHFSPlusCanonical in
+	 * CFStringEncodingConverter.h), but AFAIK it's the best that
+	 * the official API can do.
+	 */
+	CFStringNormalize(cfstring, kCFStringNormalizationFormD);
+
+	cfsize = CFStringGetLength(cfstring);
+	charsconverted = CFStringGetBytes(
+		cfstring, CFRangeMake(0,cfsize),
+		script_code, 0, false,
+		*outbuf, *outbytesleft, &outsize);
+
+	if (0 == charsconverted) {
+		debug_out("String conversion: "
+			  "Buffer too small or not convertable\n");
+		hexdump("UTF16LE->UTF8 (old) input",
+			*inbuf, *inbytesleft);
+		errno = EILSEQ; /* Probably more likely. */
+		return -1;
+	}
+
+	/*
+	 * Add a converted null byte, if the CFString conversions
+	 * prevented that until now.
+	 */
+	if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
+	    (0 != (*outbuf)[outsize-1])) {
+
+		if (((size_t)outsize+1) > *outbytesleft) {
+			debug_out("String conversion: "
+				  "Output buffer too small\n");
+			hexdump("UTF16LE->UTF8 (old) input",
+				*inbuf, *inbytesleft);
+			errno = E2BIG;
+			return -1;
+		}
+
+		(*outbuf)[outsize] = 0;
+		++outsize;
+	}
+
+	*inbuf += *inbytesleft;
+	*inbytesleft = 0;
+	*outbuf += outsize;
+	*outbytesleft -= outsize;
+
+	return 0;
+}
+
+#else /* USE_INTERNAL_API */
+
+/*
+ * An implementation based on internal code as known from the
+ * OpenDarwin CVS.
+ *
+ * This code doesn't need much memory management because it uses
+ * functions that operate on the raw memory directly.
+ *
+ * The push routine here is faster and more compatible with HFS+ than
+ * the other implementation above.  The pull routine is only faster
+ * for some strings, slightly slower for others.  The pull routine
+ * loses because it has to iterate over the data twice, once to
+ * decode UTF-8 and then to do the character composition required by
+ * Windows.
+ */
+static size_t macosxfs_encoding_pull(
+	void *cd,				/* Encoder handle */
+	char **inbuf, size_t *inbytesleft,	/* Script string */
+	char **outbuf, size_t *outbytesleft)	/* UTF-16-LE string */
+{
+	static const int script_code = kCFStringEncodingUTF8;
+	UInt32 srcCharsUsed = 0;
+	UInt32 dstCharsUsed = 0;
+	UInt32 result;
+	uint32_t dstDecomposedUsed = 0;
+	uint32_t dstPrecomposedUsed = 0;
+
+	(void) cd; /* UNUSED */
+
+	if (0 == *inbytesleft) {
+		return 0;
+	}
+
+        result = CFStringEncodingBytesToUnicode(
+		script_code, kCFStringEncodingComposeCombinings,
+		*inbuf, *inbytesleft, &srcCharsUsed,
+		(UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
+
+	switch(result) {
+	case kCFStringEncodingConversionSuccess:
+		if (*inbytesleft == srcCharsUsed)
+			break;
+		else
+			; /*fall through*/
+	case kCFStringEncodingInsufficientOutputBufferLength:
+		debug_out("String conversion: "
+			  "Output buffer too small\n");
+		hexdump("UTF8->UTF16LE (new) input",
+			*inbuf, *inbytesleft);
+		errno = E2BIG;
+		return -1;
+	case kCFStringEncodingInvalidInputStream:
+		/*
+		 * HACK: smbd/mangle_hash2.c:is_legal_name() expects
+		 * errors here.  That function will always pass 2
+		 * characters.  smbd/open.c:check_for_pipe() cuts a
+		 * pathname to 10 characters blindly.  Suppress the
+		 * debug output in those cases.
+		 */
+		if(2 != *inbytesleft && 10 != *inbytesleft) {
+			debug_out("String conversion: "
+				  "Invalid input sequence\n");
+			hexdump("UTF8->UTF16LE (new) input",
+				*inbuf, *inbytesleft);
+		}
+		errno = EILSEQ;
+		return -1;
+	case kCFStringEncodingConverterUnavailable:
+		debug_out("String conversion: "
+			  "Unknown encoding\n");
+		hexdump("UTF8->UTF16LE (new) input",
+			*inbuf, *inbytesleft);
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * It doesn't look like CFStringEncodingBytesToUnicode() can
+	 * produce precomposed characters (flags=ComposeCombinings
+	 * doesn't do it), so we need another pass over the data here.
+	 * We can do this in-place, as the string can only get
+	 * shorter.
+	 *
+	 * (Actually in theory there should be an internal
+	 * decomposition and reordering before the actual composition
+	 * step.  But we should be able to rely on always getting
+	 * fully decomposed strings as input, so this can't create
+	 * problems in reality.)
+	 */
+	CFUniCharPrecompose(
+		(const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
+		(UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
+
+	native_to_le(*outbuf, dstPrecomposedUsed*2);
+
+	*inbuf += srcCharsUsed;
+	*inbytesleft -= srcCharsUsed;
+	*outbuf += dstPrecomposedUsed*2;
+	*outbytesleft -= dstPrecomposedUsed*2;
+
+	return 0;
+}
+
+static size_t macosxfs_encoding_push(
+	void *cd,				/* Encoder handle */
+	char **inbuf, size_t *inbytesleft,	/* UTF-16-LE string */
+	char **outbuf, size_t *outbytesleft)	/* Script string */
+{
+	static const int script_code = kCFStringEncodingUTF8;
+	static UniChar *buffer = NULL;
+	static size_t buflen = 0;
+	UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
+
+	(void) cd; /* UNUSED */
+
+	if (0 == *inbytesleft) {
+		return 0;
+	}
+
+	buffer = set_ucbuffer_with_le(
+		buffer, &buflen, *inbuf, *inbytesleft);
+
+	result = CFStringEncodingUnicodeToBytes(
+		script_code, kCFStringEncodingUseHFSPlusCanonical,
+		buffer, *inbytesleft/2, &srcCharsUsed,
+		*outbuf, *outbytesleft, &dstCharsUsed);
+
+	switch(result) {
+	case kCFStringEncodingConversionSuccess:
+		if (*inbytesleft/2 == srcCharsUsed)
+			break;
+		else
+			; /*fall through*/
+	case kCFStringEncodingInsufficientOutputBufferLength:
+		debug_out("String conversion: "
+			  "Output buffer too small\n");
+		hexdump("UTF16LE->UTF8 (new) input",
+			*inbuf, *inbytesleft);
+		errno = E2BIG;
+		return -1;
+	case kCFStringEncodingInvalidInputStream:
+		/*
+		 * HACK: smbd/open.c:check_for_pipe():is_legal_name()
+		 * cuts a pathname to 10 characters blindly.  Suppress
+		 * the debug output in those cases.
+		 */
+		if(10 != *inbytesleft) {
+			debug_out("String conversion: "
+				  "Invalid input sequence\n");
+			hexdump("UTF16LE->UTF8 (new) input",
+				*inbuf, *inbytesleft);
+		}
+		errno = EILSEQ;
+		return -1;
+	case kCFStringEncodingConverterUnavailable:
+		debug_out("String conversion: "
+			  "Unknown encoding\n");
+		hexdump("UTF16LE->UTF8 (new) input",
+			*inbuf, *inbytesleft);
+		errno = EINVAL;
+		return -1;
+	}
+
+	*inbuf += srcCharsUsed*2;
+	*inbytesleft -= srcCharsUsed*2;
+	*outbuf += dstCharsUsed;
+	*outbytesleft -= dstCharsUsed;
+
+	return 0;
+}
+
+#endif /* USE_INTERNAL_API */
+
+/*
+ * For initialization, actually install the encoding as "MACOSXFS".
+ */
+static struct charset_functions macosxfs_encoding_functions = {
+	"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push
+};
+
+NTSTATUS charset_macosxfs_init(void)
+{
+	if (!smb_register_charset(&macosxfs_encoding_functions)) {
+		return NT_STATUS_INTERNAL_ERROR;
+	}
+	return NT_STATUS_OK;
+}
+
+/* eof */
diff --git a/lib/util/charset/codepoints.c b/lib/util/charset/codepoints.c
index cd54420e8e..8cc33a9782 100644
--- a/lib/util/charset/codepoints.c
+++ b/lib/util/charset/codepoints.c
@@ -23,7 +23,7 @@
 #include "includes.h"
 #include "lib/util/charset/charset.h"
 #include "system/locale.h"
-#include "dynconfig.h"
+#include "dynconfig/dynconfig.h"
 
 #ifdef strcasecmp
 #undef strcasecmp
@@ -168,17 +168,16 @@ struct smb_iconv_handle *get_iconv_handle(void)
 {
 	if (global_iconv_handle == NULL)
 		global_iconv_handle = smb_iconv_handle_reinit(talloc_autofree_context(),
-									"ASCII", "UTF-8", "ASCII", true, NULL);
+							      "ASCII", "UTF-8", true, NULL);
 	return global_iconv_handle;
 }
 
 struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx, 
 						  const char *dos_charset, 
-						  const char *unix_charset, 
-						  const char *display_charset)
+						  const char *unix_charset)
 {
 	return smb_iconv_handle_reinit(mem_ctx,
-				       dos_charset, unix_charset, display_charset, true, NULL);
+				       dos_charset, unix_charset, true, NULL);
 }
 
 /**
@@ -190,7 +189,6 @@ const char *charset_name(struct smb_iconv_handle *ic, charset_t ch)
 	case CH_UTF16: return "UTF-16LE";
 	case CH_UNIX: return ic->unix_charset;
 	case CH_DOS: return ic->dos_charset;
-	case CH_DISPLAY: return ic->display_charset;
 	case CH_UTF8: return "UTF8";
 	case CH_UTF16BE: return "UTF-16BE";
 	case CH_UTF16MUNGED: return "UTF16_MUNGED";
@@ -219,37 +217,6 @@ static int close_iconv_handle(struct smb_iconv_handle *data)
 	return 0;
 }
 
-static const char *map_locale(const char *charset)
-{
-	if (strcmp(charset, "LOCALE") != 0) {
-		return charset;
-	}
-#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
-	{
-		const char *ln;
-		smb_iconv_t handle;
-
-		ln = nl_langinfo(CODESET);
-		if (ln == NULL) {
-			DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
-			return "ASCII";
-		}
-		/* Check whether the charset name is supported
-		   by iconv */
-		handle = smb_iconv_open(ln, "UCS-2LE");
-		if (handle == (smb_iconv_t) -1) {
-			DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
-			return "ASCII";
-		} else {
-			DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
-			smb_iconv_close(handle);
-		}
-		return ln;
-	}
-#endif
-	return "ASCII";
-}
-
 /*
   the old_ic is passed in here as the smb_iconv_handle structure
   is used as a global pointer in some places (eg. python modules). We
@@ -261,14 +228,11 @@ static const char *map_locale(const char *charset)
 _PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
 								    const char *dos_charset,
 								    const char *unix_charset,
-								    const char *display_charset,
 								    bool native_iconv,
 								    struct smb_iconv_handle *old_ic)
 {
 	struct smb_iconv_handle *ret;
 
-	display_charset = map_locale(display_charset);
-
 	if (old_ic != NULL) {
 		ret = old_ic;
 		close_iconv_handle(ret);
@@ -290,9 +254,13 @@ _PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
 
 	talloc_set_destructor(ret, close_iconv_handle);
 
+	if (strcasecmp(dos_charset, "UTF8") == 0 || strcasecmp(dos_charset, "UTF-8") == 0) {
+		DEBUG(0,("ERROR: invalid DOS charset: 'dos charset' must not be UTF8, using (default value) CP850 instead\n"));
+		dos_charset = "CP850";
+	}
+
 	ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
 	ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
-	ret->display_charset = talloc_strdup(ret->child_ctx, display_charset);
 	ret->native_iconv = native_iconv;
 
 	return ret;
diff --git a/lib/util/charset/convert_string.c b/lib/util/charset/convert_string.c
index e51add2aaf..51f9fec137 100644
--- a/lib/util/charset/convert_string.c
+++ b/lib/util/charset/convert_string.c
@@ -2,7 +2,8 @@
    Unix SMB/CIFS implementation.
    Character set conversion Extensions
    Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
-   Copyright (C) Andrew Tridgell 2001
+   Copyright (C) Andrew Tridgell 2001-2011
+   Copyright (C) Andrew Bartlett 2011
    Copyright (C) Simo Sorce 2001
    Copyright (C) Martin Pool 2003
 
@@ -21,6 +22,7 @@
 
 */
 #include "includes.h"
+#include "system/iconv.h"
 
 /**
  * @file
@@ -177,28 +179,29 @@ bool convert_string_error_handle(struct smb_iconv_handle *ic,
 		size_t slen = srclen;
 		size_t dlen = destlen;
 		unsigned char lastp = '\0';
+		bool ret;
 
-		/* If all characters are ascii, fast path here. */
-		while (((slen == (size_t)-1) || (slen >= 2)) && dlen) {
-			if (((lastp = *p) <= 0x7f) && (p[1] == 0)) {
+		if (slen == (size_t)-1) {
+			while (dlen &&
+			       ((lastp = *p) <= 0x7f) && (p[1] == 0)) {
 				*q++ = *p;
-				if (slen != (size_t)-1) {
-					slen -= 2;
-				}
 				p += 2;
 				dlen--;
 				retval++;
 				if (!lastp)
 					break;
-			} else {
-#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
-				goto general_case;
-#else
-				bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
-				*converted_size += retval;
-				return ret;
-#endif
 			}
+			if (lastp != 0) goto slow_path;
+		} else {
+			while (slen >= 2 && dlen &&
+			       (*p <= 0x7f) && (p[1] == 0)) {
+				*q++ = *p;
+				slen -= 2;
+				p += 2;
+				dlen--;
+				retval++;
+			}
+			if (slen != 0) goto slow_path;
 		}
 
 		*converted_size = retval;
@@ -212,6 +215,19 @@ bool convert_string_error_handle(struct smb_iconv_handle *ic,
 			}
 		}
 		return true;
+
+	slow_path:
+		/* come here when we hit a character we can't deal
+		 * with in the fast path
+		 */
+#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
+		goto general_case;
+#else
+		ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
+		*converted_size += retval;
+		return ret;
+#endif
+
 	} else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
 		const unsigned char *p = (const unsigned char *)src;
 		unsigned char *q = (unsigned char *)dest;
@@ -221,8 +237,8 @@ bool convert_string_error_handle(struct smb_iconv_handle *ic,
 		unsigned char lastp = '\0';
 
 		/* If all characters are ascii, fast path here. */
-		while (slen && (dlen >= 2)) {
-			if ((lastp = *p) <= 0x7F) {
+		while (slen && (dlen >= 1)) {
+			if (dlen >=2 && (lastp = *p) <= 0x7F) {
 				*q++ = *p++;
 				*q++ = '\0';
 				if (slen != (size_t)-1) {
@@ -387,7 +403,7 @@ bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic,
 	}
 
 	/* +2 is for ucs2 null termination. */
-	ob = (char *)TALLOC_REALLOC(ctx, ob, destlen + 2);
+	ob = talloc_realloc(ctx, ob, char, destlen + 2);
 
 	if (!ob) {
 		DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
@@ -428,7 +444,7 @@ bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic,
 	 */
 	if (o_len > 1024) {
 		/* We're shrinking here so we know the +2 is safe from wrap. */
-		ob = (char *)TALLOC_REALLOC(ctx,ob,destlen + 2);
+		ob = talloc_realloc(ctx,ob, char, destlen + 2);
 	}
 
 	if (destlen && !ob) {
diff --git a/lib/util/charset/pull_push.c b/lib/util/charset/pull_push.c
new file mode 100644
index 0000000000..b7a5bcdc65
--- /dev/null
+++ b/lib/util/charset/pull_push.c
@@ -0,0 +1,150 @@
+/*
+   Unix SMB/CIFS implementation.
+   Character set conversion Extensions
+   Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
+   Copyright (C) Andrew Tridgell 2001
+   Copyright (C) Simo Sorce 2001
+   Copyright (C) Martin Pool 2003
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#include "includes.h"
+#include "system/locale.h"
+
+/**
+ * Copy a string from a unix char* src to a UCS2 destination,
+ * allocating a buffer using talloc().
+ *
+ * @param dest always set at least to NULL
+ * @param converted_size set to the number of bytes occupied by the string in
+ * the destination on success.
+ *
+ * @return true if new buffer was correctly allocated, and string was
+ * converted.
+ **/
+bool push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src,
+		      size_t *converted_size)
+{
+	size_t src_len = strlen(src)+1;
+
+	*dest = NULL;
+	return convert_string_talloc(ctx, CH_UNIX, CH_UTF16LE, src, src_len,
+				     (void **)dest, converted_size);
+}
+
+/**
+ * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
+ *
+ * @param dest always set at least to NULL
+ * @param converted_size set to the number of bytes occupied by the string in
+ * the destination on success.
+ *
+ * @return true if new buffer was correctly allocated, and string was
+ * converted.
+ **/
+
+bool push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src,
+		      size_t *converted_size)
+{
+	size_t src_len = strlen(src)+1;
+
+	*dest = NULL;
+	return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len,
+				     (void**)dest, converted_size);
+}
+
+/**
+ * Copy a string from a unix char* src to an ASCII destination,
+ * allocating a buffer using talloc().
+ *
+ * @param dest always set at least to NULL
+ *
+ * @param converted_size The number of bytes occupied by the string in the destination
+ * @returns boolean indicating if the conversion was successful
+ **/
+bool push_ascii_talloc(TALLOC_CTX *mem_ctx, char **dest, const char *src, size_t *converted_size)
+{
+	size_t src_len = strlen(src)+1;
+
+	*dest = NULL;
+	return convert_string_talloc(mem_ctx, CH_UNIX, CH_DOS, src, src_len,
+				     (void **)dest, converted_size);
+}
+
+/**
+ * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
+ *
+ * @param dest always set at least to NULL
+ * @param converted_size set to the number of bytes occupied by the string in
+ * the destination on success.
+ *
+ * @return true if new buffer was correctly allocated, and string was
+ * converted.
+ **/
+
+bool pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const smb_ucs2_t *src,
+		      size_t *converted_size)
+{
+	size_t src_len = (strlen_w(src)+1) * sizeof(smb_ucs2_t);
+
+	*dest = NULL;
+	return convert_string_talloc(ctx, CH_UTF16LE, CH_UNIX, src, src_len,
+				     (void **)dest, converted_size);
+}
+
+
+/**
+ * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
+ *
+ * @param dest always set at least to NULL
+ * @param converted_size set to the number of bytes occupied by the string in
+ * the destination on success.
+ *
+ * @return true if new buffer was correctly allocated, and string was
+ * converted.
+ **/
+
+bool pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src,
+		      size_t *converted_size)
+{
+	size_t src_len = strlen(src)+1;
+
+	*dest = NULL;
+	return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len,
+				     (void **)dest, converted_size);
+}
+
+
+/**
+ * Copy a string from a DOS src to a unix char * destination, allocating a buffer using talloc
+ *
+ * @param dest always set at least to NULL
+ * @param converted_size set to the number of bytes occupied by the string in
+ * the destination on success.
+ *
+ * @return true if new buffer was correctly allocated, and string was
+ * converted.
+ **/
+
+bool pull_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src,
+		       size_t *converted_size)
+{
+	size_t src_len = strlen(src)+1;
+
+	*dest = NULL;
+	return convert_string_talloc(ctx, CH_DOS, CH_UNIX, src, src_len,
+				     (void **)dest, converted_size);
+}
diff --git a/lib/util/charset/tests/convert_string.c b/lib/util/charset/tests/convert_string.c
index 32fc11f527..9a5d974fe3 100644
--- a/lib/util/charset/tests/convert_string.c
+++ b/lib/util/charset/tests/convert_string.c
@@ -105,7 +105,7 @@ static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
 	talloc_steal(tctx, gd_iso8859_1.data);
 	talloc_steal(tctx, gd_utf16le.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "ISO8859-1", "CP850", "UTF8");
+	iconv_handle = get_iconv_testing_handle(tctx, "ISO8859-1", "CP850");
 	torture_assert(tctx, iconv_handle, "getting iconv handle");
 		
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
@@ -199,11 +199,11 @@ static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
 	torture_assert_data_blob_equal(tctx, gd_output, gd_cp850, "conversion from UTF8 to (unix charset) CP850 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_UTF8, CH_DISPLAY, 
+						    CH_UTF8, CH_UTF8, 
 						    gd_utf8.data, gd_utf8.length, 
 						    (void *)&gd_output.data, &gd_output.length), 
-		       "conversion from UTF8 to (display charset) UTF8");
-	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8, "conversion from UTF8 to (display charset) UTF8 incorrect");
+		       "conversion from UTF8 to UTF8");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8, "conversion from UTF8 to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_UTF16LE, CH_DOS, 
@@ -227,11 +227,11 @@ static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
 	torture_assert_data_blob_equal(tctx, gd_output, gd_cp850, "conversion from UTF16LE to (unix charset) CP850 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_UTF16LE, CH_DISPLAY, 
+						    CH_UTF16LE, CH_UTF8, 
 						    gd_utf16le.data, gd_utf16le.length, 
 						    (void *)&gd_output.data, &gd_output.length), 
-		       "conversion from UTF16LE to (display charset) UTF8");
-	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8, "conversion from UTF16LE to (display charset) UTF8 incorrect");
+		       "conversion from UTF16LE to UTF8");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8, "conversion from UTF16LE to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_DOS, CH_DOS, 
@@ -248,11 +248,11 @@ static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
 	torture_assert_data_blob_equal(tctx, gd_output, gd_cp850, "conversion from UTF16LE to (unix charset) CP850 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_DOS, CH_DISPLAY, 
+						    CH_DOS, CH_UTF8, 
 						    gd_iso8859_1.data, gd_iso8859_1.length, 
 						    (void *)&gd_output.data, &gd_output.length), 
-		       "conversion from (dos charset) ISO8859-1 to (display charset) UTF8");
-	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8, "conversion from UTF16LE to (display charset) UTF8 incorrect");
+		       "conversion from (dos charset) ISO8859-1 to UTF8");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8, "conversion from UTF16LE to UTF8 incorrect");
 
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_DOS, CH_UTF16LE, 
@@ -265,7 +265,7 @@ static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
 						     (const char *)gd_iso8859_1.data,
 						     CH_DOS, CH_UTF16LE),
 				 gd_output.length / 2,
-				 "checking strlen_m_ext of round trip conversion of UTF16 latin charset greek to display charset UTF8 and back again");
+				 "checking strlen_m_ext of round trip conversion of UTF16 latin charset greek to UTF8 and back again");
 
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle,
 						    CH_DOS, CH_UTF8,
@@ -282,6 +282,191 @@ static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
 	return true;
 }
 
+static bool test_gd_minus_1_handle(struct torture_context *tctx)
+{
+	struct smb_iconv_handle *iconv_handle;
+	DATA_BLOB gd_utf8 = base64_decode_data_blob(gd_utf8_base64);
+	DATA_BLOB gd_cp850 = base64_decode_data_blob(gd_cp850_base64);
+	DATA_BLOB gd_utf16le = base64_decode_data_blob(gd_utf16le_base64);
+	DATA_BLOB gd_output;
+	DATA_BLOB gd_utf8_terminated;
+	DATA_BLOB gd_cp850_terminated;
+	DATA_BLOB gd_utf16le_terminated;
+	
+	talloc_steal(tctx, gd_utf8.data);
+	talloc_steal(tctx, gd_cp850.data);
+	talloc_steal(tctx, gd_utf16le.data);
+
+	iconv_handle = get_iconv_testing_handle(tctx, "CP850", "CP850");
+	torture_assert(tctx, iconv_handle, "getting iconv handle");
+
+	gd_utf8_terminated = data_blob_talloc(tctx, NULL, gd_utf8.length + 1);
+	memcpy(gd_utf8_terminated.data, gd_utf8.data, gd_utf8.length);
+	gd_utf8_terminated.data[gd_utf8.length] = '\0';
+
+	gd_cp850_terminated = data_blob_talloc(tctx, NULL, gd_cp850.length + 1);
+	memcpy(gd_cp850_terminated.data, gd_cp850.data, gd_cp850.length);
+	gd_cp850_terminated.data[gd_cp850.length] = '\0';
+
+	gd_utf16le_terminated = data_blob_talloc(tctx, NULL, gd_utf16le.length + 2);
+	memcpy(gd_utf16le_terminated.data, gd_utf16le.data, gd_utf16le.length);
+	gd_utf16le_terminated.data[gd_utf16le.length] = '\0';
+	gd_utf16le_terminated.data[gd_utf16le.length + 1] = '\0';
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  gd_utf8_terminated.data, -1,
+							 (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf16le.length + 10);
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  gd_utf8_terminated.data, -1,
+							  (void *)gd_output.data, gd_utf16le.length, &gd_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf16le, "conversion from UTF8 to UTF16LE null terminated");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf16le.length + 10);
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  gd_utf8_terminated.data, -1,
+							  (void *)gd_output.data, gd_utf16le.length - 1, &gd_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf16le.length + 10);
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  gd_utf8_terminated.data, -1,
+							  (void *)gd_output.data, gd_utf16le.length - 2, &gd_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf8.length + 10);
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_utf8.length, &gd_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8, "conversion from UTF16LE to UTF8 null terminated");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_utf8.length - 1, &gd_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_utf8.length - 2, &gd_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_cp850.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_DOS,
+							 gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF16LE to CP850 (dos) null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_cp850_terminated, "conversion from UTF16LE to CP850 (dos) null terminated");
+
+	/* Now null terminate the string early, then confirm we don't skip the NULL and convert any further */
+	gd_utf8_terminated.data[3] = '\0';
+	gd_utf8_terminated.length = 4; /* used for the comparison only */
+
+	gd_cp850_terminated.data[2] = '\0';
+	gd_cp850_terminated.length = 3; /* used for the comparison only */
+
+	gd_utf16le_terminated.data[4] = '\0';
+	gd_utf16le_terminated.data[5] = '\0';
+	gd_utf16le_terminated.length = 6; /* used for the comparison only */
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  gd_utf8_terminated.data, -1,
+							  (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated early");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF16LE, CH_UTF8,
+							  gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated early");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_DOS, CH_UTF16LE,
+							  gd_cp850_terminated.data, -1,
+							  (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from CP850 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated early");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_cp850.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF16LE, CH_DOS,
+							  gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_cp850_terminated, "conversion from UTF16LE to UTF8 null terminated early");
+	
+	/* Now null terminate the string particularly early, then confirm we don't skip the NULL and convert any further */
+	gd_utf8_terminated.data[1] = '\0';
+	gd_utf8_terminated.length = 2; /* used for the comparison only */
+	
+	gd_utf16le_terminated.data[2] = '\0';
+	gd_utf16le_terminated.data[3] = '\0';
+	gd_utf16le_terminated.length = 4; /* used for the comparison only */
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle, CH_UTF8, CH_UTF16LE,
+							  gd_utf8_terminated.data, -1,
+							 (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated very early");
+
+	gd_output = data_blob_talloc(tctx, NULL, gd_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF16LE, CH_UTF8,
+							  gd_utf16le_terminated.data, -1,
+							 (void *)gd_output.data, gd_output.length, &gd_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, gd_output, gd_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated very early");
+
+	return true;
+}
+
 static bool test_gd_ascii_handle(struct torture_context *tctx)
 {
 	struct smb_iconv_handle *iconv_handle;
@@ -296,7 +481,7 @@ static bool test_gd_ascii_handle(struct torture_context *tctx)
 	talloc_steal(tctx, gd_iso8859_1.data);
 	talloc_steal(tctx, gd_utf16le.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "UTF8", "UTF8");
+	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "UTF8");
 	torture_assert(tctx, iconv_handle, "getting iconv handle");
 
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle,
@@ -365,7 +550,7 @@ static bool test_plato_english_iso8859_cp850_handle(struct torture_context *tctx
 	
 	talloc_steal(tctx, plato_english_utf16le.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "ISO8859-1", "CP850", "UTF8");
+	iconv_handle = get_iconv_testing_handle(tctx, "ISO8859-1", "CP850");
 	torture_assert(tctx, iconv_handle, "getting iconv handle");
 		
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
@@ -383,11 +568,11 @@ static bool test_plato_english_iso8859_cp850_handle(struct torture_context *tctx
 	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_cp850, "conversion from UTF8 to (unix charset) CP850 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_UTF8, CH_DISPLAY, 
+						    CH_UTF8, CH_UTF8, 
 						    plato_english_utf8.data, plato_english_utf8.length, 
 						    (void *)&plato_english_output.data, &plato_english_output.length), 
-		       "conversion from UTF8 to (display charset) UTF8");
-	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8, "conversion from UTF8 to (display charset) UTF8 incorrect");
+		       "conversion from UTF8 to UTF8");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8, "conversion from UTF8 to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_UTF16LE, CH_DOS, 
@@ -436,11 +621,11 @@ static bool test_plato_english_iso8859_cp850_handle(struct torture_context *tctx
 	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_cp850, "conversion from UTF16LE to (unix charset) CP850 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_UTF16LE, CH_DISPLAY, 
+						    CH_UTF16LE, CH_UTF8, 
 						    plato_english_utf16le.data, plato_english_utf16le.length, 
 						    (void *)&plato_english_output.data, &plato_english_output.length), 
-		       "conversion from UTF16LE to (display charset) UTF8");
-	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8, "conversion from UTF16LE to (display charset) UTF8 incorrect");
+		       "conversion from UTF16LE to UTF8");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8, "conversion from UTF16LE to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_DOS, CH_DOS, 
@@ -457,11 +642,11 @@ static bool test_plato_english_iso8859_cp850_handle(struct torture_context *tctx
 	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_cp850, "conversion from UTF16LE to (unix charset) CP850 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_DOS, CH_DISPLAY, 
+						    CH_DOS, CH_UTF8, 
 						    plato_english_iso8859_1.data, plato_english_iso8859_1.length, 
 						    (void *)&plato_english_output.data, &plato_english_output.length), 
-		       "conversion from (dos charset) ISO8859-1 to (display charset) UTF8");
-	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8, "conversion from UTF16LE to (display charset) UTF8 incorrect");
+		       "conversion from (dos charset) ISO8859-1 to UTF8");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8, "conversion from UTF16LE to UTF8 incorrect");
 
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_DOS, CH_UTF16LE, 
@@ -472,6 +657,261 @@ static bool test_plato_english_iso8859_cp850_handle(struct torture_context *tctx
 	return true;
 }
 
+static bool test_plato_english_minus_1_handle(struct torture_context *tctx)
+{
+	struct smb_iconv_handle *iconv_handle;
+	DATA_BLOB plato_english_utf8 = data_blob_string_const(plato_english_ascii);
+	DATA_BLOB plato_english_utf16le = base64_decode_data_blob(plato_english_utf16le_base64);
+	DATA_BLOB plato_english_output;
+	DATA_BLOB plato_english_utf8_terminated;
+	DATA_BLOB plato_english_utf16le_terminated;
+	/* Checks convert_string_error_handle() with a source length of -1 on terminated UTF8/UTF16LE copies of the plato_english sample. */
+	talloc_steal(tctx, plato_english_utf16le.data);
+
+	iconv_handle = get_iconv_testing_handle(tctx, "ISO8859-1", "CP850");
+	torture_assert(tctx, iconv_handle, "getting iconv handle");
+	/* Terminated copies of both inputs: one trailing zero byte for UTF8, two for UTF16LE. */
+	plato_english_utf8_terminated = data_blob_talloc(tctx, NULL, plato_english_utf8.length + 1);
+	memcpy(plato_english_utf8_terminated.data, plato_english_utf8.data, plato_english_utf8.length);
+	plato_english_utf8_terminated.data[plato_english_utf8.length] = '\0';
+
+	plato_english_utf16le_terminated = data_blob_talloc(tctx, NULL, plato_english_utf16le.length + 2);
+	memcpy(plato_english_utf16le_terminated.data, plato_english_utf16le.data, plato_english_utf16le.length);
+	plato_english_utf16le_terminated.data[plato_english_utf16le.length] = '\0';
+	plato_english_utf16le_terminated.data[plato_english_utf16le.length + 1] = '\0';
+	/* Destination with 10 spare bytes: the call must succeed and the output must include the terminator. */
+	plato_english_output = data_blob_talloc(tctx, NULL, plato_english_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_english_utf8_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_output.length, &plato_english_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated");
+	/* Destination exactly as long as the unterminated text: expect failure with E2BIG and the unterminated text as output. */
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_english_utf8_terminated.data, -1,
+							  (void *)plato_english_output.data, plato_english_utf16le.length, &plato_english_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf16le, "conversion from UTF8 to UTF16LE null terminated");
+	/* Destinations one and two bytes shorter still must also fail with E2BIG. */
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_english_utf8_terminated.data, -1,
+							  (void *)plato_english_output.data, plato_english_utf16le.length - 1, &plato_english_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_english_utf8_terminated.data, -1,
+							  (void *)plato_english_output.data, plato_english_utf16le.length - 2, &plato_english_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+	/* The same four checks in the other direction, UTF16LE to UTF8. */
+	plato_english_output = data_blob_talloc(tctx, NULL, plato_english_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_english_utf16le_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_output.length, &plato_english_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_english_utf16le_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_utf8.length, &plato_english_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8, "conversion from UTF16LE to UTF8 null terminated");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_english_utf16le_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_utf8.length - 1, &plato_english_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_english_utf16le_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_utf8.length - 2, &plato_english_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+
+	/* Now null terminate the string early, then confirm we don't skip the NUL and convert any further */
+	plato_english_utf8_terminated.data[3] = '\0';
+	plato_english_utf8_terminated.length = 4; /* used for the comparison only */
+
+	plato_english_utf16le_terminated.data[6] = '\0';
+	plato_english_utf16le_terminated.data[7] = '\0';
+	plato_english_utf16le_terminated.length = 8; /* used for the comparison only */
+
+	plato_english_output = data_blob_talloc(tctx, NULL, plato_english_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_english_utf8_terminated.data, -1,
+							  (void *)plato_english_output.data, plato_english_output.length, &plato_english_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated early");
+
+	plato_english_output = data_blob_talloc(tctx, NULL, plato_english_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF16LE, CH_UTF8,
+							  plato_english_utf16le_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_output.length, &plato_english_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated early");
+
+	
+	/* Now null terminate the string particularly early, then confirm we don't skip the NUL and convert any further */
+	plato_english_utf8_terminated.data[1] = '\0';
+	plato_english_utf8_terminated.length = 2; /* used for the comparison only */
+	
+	plato_english_utf16le_terminated.data[2] = '\0';
+	plato_english_utf16le_terminated.data[3] = '\0';
+	plato_english_utf16le_terminated.length = 4; /* used for the comparison only */
+
+	plato_english_output = data_blob_talloc(tctx, NULL, plato_english_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle, CH_UTF8, CH_UTF16LE,
+							  plato_english_utf8_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_output.length, &plato_english_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated very early");
+
+	plato_english_output = data_blob_talloc(tctx, NULL, plato_english_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF16LE, CH_UTF8,
+							  plato_english_utf16le_terminated.data, -1,
+							 (void *)plato_english_output.data, plato_english_output.length, &plato_english_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, plato_english_output, plato_english_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated very early");
+
+	return true;
+}
+
+static bool test_plato_minus_1_handle(struct torture_context *tctx)
+{
+	struct smb_iconv_handle *iconv_handle;
+	DATA_BLOB plato_utf8 = base64_decode_data_blob(plato_utf8_base64);
+	DATA_BLOB plato_utf16le = base64_decode_data_blob(plato_utf16le_base64);
+	DATA_BLOB plato_output;
+	DATA_BLOB plato_utf8_terminated;
+	DATA_BLOB plato_utf16le_terminated;
+	/* Checks convert_string_error_handle() with a source length of -1 on terminated UTF8/UTF16LE copies of the plato sample. */
+	talloc_steal(tctx, plato_utf8.data);
+	talloc_steal(tctx, plato_utf16le.data);
+
+	iconv_handle = get_iconv_testing_handle(tctx, "ISO8859-1", "CP850");
+	torture_assert(tctx, iconv_handle, "getting iconv handle");
+	/* Terminated copies of both inputs: one trailing zero byte for UTF8, two for UTF16LE. */
+	plato_utf8_terminated = data_blob_talloc(tctx, NULL, plato_utf8.length + 1);
+	memcpy(plato_utf8_terminated.data, plato_utf8.data, plato_utf8.length);
+	plato_utf8_terminated.data[plato_utf8.length] = '\0';
+
+	plato_utf16le_terminated = data_blob_talloc(tctx, NULL, plato_utf16le.length + 2);
+	memcpy(plato_utf16le_terminated.data, plato_utf16le.data, plato_utf16le.length);
+	plato_utf16le_terminated.data[plato_utf16le.length] = '\0';
+	plato_utf16le_terminated.data[plato_utf16le.length + 1] = '\0';
+	/* Destination with 10 spare bytes: the call must succeed and the output must include the terminator. */
+	plato_output = data_blob_talloc(tctx, NULL, plato_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_utf8_terminated.data, -1,
+							 (void *)plato_output.data, plato_output.length, &plato_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated");
+	/* Destination exactly as long as the unterminated text: expect failure with E2BIG and the unterminated text as output. */
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_utf8_terminated.data, -1,
+							  (void *)plato_output.data, plato_utf16le.length, &plato_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf16le, "conversion from UTF8 to UTF16LE null terminated");
+	/* Destinations one and two bytes shorter still must also fail with E2BIG. */
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_utf8_terminated.data, -1,
+							  (void *)plato_output.data, plato_utf16le.length - 1, &plato_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_utf8_terminated.data, -1,
+							  (void *)plato_output.data, plato_utf16le.length - 2, &plato_output.length) == false,
+		       "conversion from UTF8 to UTF16LE null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF8 to UTF16LE should fail E2BIG");
+	/* The same four checks in the other direction, UTF16LE to UTF8. */
+	plato_output = data_blob_talloc(tctx, NULL, plato_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_utf16le_terminated.data, -1,
+							 (void *)plato_output.data, plato_output.length, &plato_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_utf16le_terminated.data, -1,
+							 (void *)plato_output.data, plato_utf8.length, &plato_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8, "conversion from UTF16LE to UTF8 null terminated");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_utf16le_terminated.data, -1,
+							 (void *)plato_output.data, plato_utf8.length - 1, &plato_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							 CH_UTF16LE, CH_UTF8,
+							 plato_utf16le_terminated.data, -1,
+							 (void *)plato_output.data, plato_utf8.length - 2, &plato_output.length) == false,
+		       "conversion from UTF16LE to UTF8 null terminated should fail");
+	torture_assert_errno_equal(tctx, E2BIG, "conversion from UTF16LE to UTF8 should fail E2BIG");
+
+	/* Now null terminate the string early, then confirm we don't skip the NUL and convert any further */
+	plato_utf8_terminated.data[5] = '\0';
+	plato_utf8_terminated.length = 6; /* used for the comparison only */
+	/* NOTE(review): byte 5 (UTF8) and byte 4 (UTF16LE) presumably fall on the same character boundary of the sample - confirm */
+	plato_utf16le_terminated.data[4] = '\0';
+	plato_utf16le_terminated.data[5] = '\0';
+	plato_utf16le_terminated.length = 6; /* used for the comparison only */
+
+	plato_output = data_blob_talloc(tctx, NULL, plato_utf16le.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF8, CH_UTF16LE,
+							  plato_utf8_terminated.data, -1,
+							  (void *)plato_output.data, plato_output.length, &plato_output.length),
+		       "conversion from UTF8 to UTF16LE null terminated");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf16le_terminated, "conversion from UTF8 to UTF16LE null terminated early");
+
+	plato_output = data_blob_talloc(tctx, NULL, plato_utf8.length + 10);
+
+	torture_assert(tctx, convert_string_error_handle(iconv_handle,
+							  CH_UTF16LE, CH_UTF8,
+							  plato_utf16le_terminated.data, -1,
+							 (void *)plato_output.data, plato_output.length, &plato_output.length),
+		       "conversion from UTF16LE to UTF8 null terminated");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8_terminated, "conversion from UTF16LE to UTF8 null terminated early");
+	
+	return true;
+}
+
 static bool test_plato_cp850_utf8_handle(struct torture_context *tctx)
 {
 	struct smb_iconv_handle *iconv_handle;
@@ -483,7 +923,7 @@ static bool test_plato_cp850_utf8_handle(struct torture_context *tctx)
 	talloc_steal(tctx, plato_utf8.data);
 	talloc_steal(tctx, plato_utf16le.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "CP850", "UTF8", "UTF8");
+	iconv_handle = get_iconv_testing_handle(tctx, "CP850", "UTF8");
 	torture_assert(tctx, iconv_handle, "creating iconv handle");
 		
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
@@ -568,11 +1008,11 @@ static bool test_plato_cp850_utf8_handle(struct torture_context *tctx)
 	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8, "conversion from UTF8 to (unix charset) UTF8 incorrect");
 
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_UTF8, CH_DISPLAY, 
+						    CH_UTF8, CH_UTF8, 
 						    plato_utf8.data, plato_utf8.length, 
 						    (void *)&plato_output.data, &plato_output.length),
 		       "conversion of UTF16 ancient greek to unix charset UTF8 failed");
-	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8, "conversion from UTF8 to (display charset) UTF8 incorrect");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8, "conversion from UTF8 to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_UTF16LE, CH_DOS, 
@@ -627,39 +1067,39 @@ static bool test_plato_cp850_utf8_handle(struct torture_context *tctx)
 		       "conversion of UTF16 ancient greek to UTF8 failed");
 	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8, "conversion from UTF16LE to UTF8 incorrect");
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-							  CH_UTF16LE, CH_DISPLAY, 
+							  CH_UTF16LE, CH_UTF8, 
 							  plato_utf16le.data, plato_utf16le.length, 
 							  (void *)&plato_output.data, &plato_output.length),
-		       "conversion of UTF16 ancient greek to display charset UTF8 failed");
-	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8, "conversion from UTF16LE to (display charset) UTF8 incorrect");
+		       "conversion of UTF16 ancient greek to UTF8 failed");
+	torture_assert_data_blob_equal(tctx, plato_output, plato_utf8, "conversion from UTF16LE to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-							  CH_DISPLAY, CH_UTF16LE, 
+							  CH_UTF8, CH_UTF16LE, 
 							  plato_output.data, plato_output.length, 
 							  (void *)&plato_output2.data, &plato_output2.length),
-		       "round trip conversion of UTF16 ancient greek to display charset UTF8 and back again failed");
+		       "round trip conversion of UTF16 ancient greek to UTF8 and back again failed");
 	torture_assert_data_blob_equal(tctx, plato_output2, plato_utf16le,
-				       "round trip conversion of UTF16 ancient greek to display charset UTF8 and back again failed");
+				       "round trip conversion of UTF16 ancient greek to UTF8 and back again failed");
 	torture_assert_int_equal(tctx,
 				 strlen_m_ext_handle(iconv_handle,
 						     (const char *)plato_output.data,
-						     CH_DISPLAY, CH_UTF16LE),
+						     CH_UTF8, CH_UTF16LE),
 				 plato_output2.length / 2,
-				 "checking strlen_m_ext of round trip conversion of UTF16 latin charset greek to display charset UTF8 and back again");
+				 "checking strlen_m_ext of round trip conversion of UTF16 latin charset greek to UTF8 and back again");
 
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle,
-							  CH_DISPLAY, CH_UTF8,
+							  CH_UTF8, CH_UTF8,
 							  plato_output.data, plato_output.length,
 							  (void *)&plato_output2.data, &plato_output2.length),
-		       "conversion of display charset UTF8 to UTF8");
+		       "conversion of UTF8 to UTF8");
 	torture_assert_data_blob_equal(tctx, plato_output2, plato_utf8,
-				       "conversion of display charset UTF8 to UTF8");
+				       "conversion of UTF8 to UTF8");
 	torture_assert_int_equal(tctx,
 				 strlen_m_ext_handle(iconv_handle,
 						     (const char *)plato_output.data,
-						     CH_DISPLAY, CH_UTF8),
+						     CH_UTF8, CH_UTF8),
 				 plato_output2.length,
-				 "checking strlen_m_ext of conversion of display charset UTF8 to UTF8");
+				 "checking strlen_m_ext of conversion of UTF8 to UTF8");
 	return true;
 }
 
@@ -674,7 +1114,7 @@ static bool test_plato_latin_cp850_utf8_handle(struct torture_context *tctx)
 	talloc_steal(tctx, plato_latin_utf8.data);
 	talloc_steal(tctx, plato_latin_utf16le.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "CP850", "UTF8", "UTF8");
+	iconv_handle = get_iconv_testing_handle(tctx, "CP850", "UTF8");
 	torture_assert(tctx, iconv_handle, "creating iconv handle");
 		
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
@@ -691,11 +1131,11 @@ static bool test_plato_latin_cp850_utf8_handle(struct torture_context *tctx)
 	torture_assert_data_blob_equal(tctx, plato_latin_output, plato_latin_utf8, "conversion from UTF8 to (unix charset) UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-						    CH_UTF8, CH_DISPLAY, 
+						    CH_UTF8, CH_UTF8, 
 						    plato_latin_utf8.data, plato_latin_utf8.length, 
 						    (void *)&plato_latin_output.data, &plato_latin_output.length),
 		       "conversion of UTF16 latin charset greek to unix charset UTF8 failed");
-	torture_assert_data_blob_equal(tctx, plato_latin_output, plato_latin_utf8, "conversion from UTF8 to (display charset) UTF8 incorrect");
+	torture_assert_data_blob_equal(tctx, plato_latin_output, plato_latin_utf8, "conversion from UTF8 to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
 						    CH_UTF16LE, CH_DOS, 
@@ -711,25 +1151,25 @@ static bool test_plato_latin_cp850_utf8_handle(struct torture_context *tctx)
 	torture_assert_data_blob_equal(tctx, plato_latin_output, plato_latin_utf8, "conversion from UTF16LE to (unix charset) CP850 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-							  CH_UTF16LE, CH_DISPLAY, 
+							  CH_UTF16LE, CH_UTF8, 
 							  plato_latin_utf16le.data, plato_latin_utf16le.length, 
 							  (void *)&plato_latin_output.data, &plato_latin_output.length),
-		       "conversion of UTF16 latin charset greek to display charset UTF8 failed");
-	torture_assert_data_blob_equal(tctx, plato_latin_output, plato_latin_utf8, "conversion from UTF16LE to (display charset) UTF8 incorrect");
+		       "conversion of UTF16 latin charset greek to UTF8 failed");
+	torture_assert_data_blob_equal(tctx, plato_latin_output, plato_latin_utf8, "conversion from UTF16LE to UTF8 incorrect");
 	
 	torture_assert(tctx, convert_string_talloc_handle(tctx, iconv_handle, 
-							  CH_DISPLAY, CH_UTF16LE, 
+							  CH_UTF8, CH_UTF16LE, 
 							  plato_latin_output.data, plato_latin_output.length, 
 							  (void *)&plato_latin_output2.data, &plato_latin_output2.length),
-		       "round trip conversion of UTF16 latin charset greek to display charset UTF8 and back again failed");
+		       "round trip conversion of UTF16 latin charset greek to UTF8 and back again failed");
 	torture_assert_data_blob_equal(tctx, plato_latin_output2, plato_latin_utf16le,
-				       "round trip conversion of UTF16 latin charset greek to display charset UTF8 and back again failed");
+				       "round trip conversion of UTF16 latin charset greek to UTF8 and back again failed");
 	torture_assert_int_equal(tctx,
 				 strlen_m_ext_handle(iconv_handle,
 						     (const char *)plato_latin_output.data,
-						     CH_DISPLAY, CH_UTF16LE),
+						     CH_UTF8, CH_UTF16LE),
 				 plato_latin_output2.length / 2,
-				 "checking strlen_m_ext of round trip conversion of UTF16 latin charset greek to display charset UTF8 and back again");
+				 "checking strlen_m_ext of round trip conversion of UTF16 latin charset greek to UTF8 and back again");
 	return true;
 }
 
@@ -742,7 +1182,7 @@ static bool test_gd_case_utf8_handle(struct torture_context *tctx)
 	char *gd_lower, *gd_upper;
 	talloc_steal(tctx, gd_utf8.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "UTF8", "UTF8");
+	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "UTF8");
 	torture_assert(tctx, iconv_handle, "getting utf8 iconv handle");
 
 	torture_assert(tctx,
@@ -805,7 +1245,7 @@ static bool test_gd_case_cp850_handle(struct torture_context *tctx)
 	char *gd_lower, *gd_upper;
 	talloc_steal(tctx, gd_cp850.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "CP850", "CP850");
+	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "CP850");
 	torture_assert(tctx, iconv_handle, "getting cp850 iconv handle");
 
 	torture_assert(tctx,
@@ -866,7 +1306,7 @@ static bool test_plato_case_utf8_handle(struct torture_context *tctx)
 	char *plato_lower, *plato_upper;
 	talloc_steal(tctx, plato_utf8.data);
 
-	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "UTF8", "UTF8");
+	iconv_handle = get_iconv_testing_handle(tctx, "ASCII", "UTF8");
 	torture_assert(tctx, iconv_handle, "getting utf8 iconv handle");
 
 	torture_assert(tctx,
@@ -1248,9 +1688,12 @@ struct torture_suite *torture_local_convert_string_handle(TALLOC_CTX *mem_ctx)
 	struct torture_suite *suite = torture_suite_create(mem_ctx, "convert_string_handle");
 
 	torture_suite_add_simple_test(suite, "gd_ascii", test_gd_ascii_handle);
+	torture_suite_add_simple_test(suite, "gd_minus_1", test_gd_minus_1_handle);
 	torture_suite_add_simple_test(suite, "gd_iso8859_cp850", test_gd_iso8859_cp850_handle);
 	torture_suite_add_simple_test(suite, "plato_english_iso8859_cp850", test_plato_english_iso8859_cp850_handle);
+	torture_suite_add_simple_test(suite, "plato_english_minus_1", test_plato_english_minus_1_handle);
 	torture_suite_add_simple_test(suite, "plato_cp850_utf8", test_plato_cp850_utf8_handle);
+	torture_suite_add_simple_test(suite, "plato_minus_1", test_plato_minus_1_handle);
 	torture_suite_add_simple_test(suite, "plato_latin_cp850_utf8", test_plato_latin_cp850_utf8_handle);
 	return suite;
 }
diff --git a/lib/util/charset/util_str.c b/lib/util/charset/util_str.c
index e8f0b788b1..688ab5a0a1 100644
--- a/lib/util/charset/util_str.c
+++ b/lib/util/charset/util_str.c
@@ -5,6 +5,8 @@
    Copyright (C) Simo Sorce 2001
    Copyright (C) Andrew Bartlett 2011
    Copyright (C) Jeremy Allison  1992-2007
+   Copyright (C) Martin Pool     2003
+   Copyright (C) James Peach	 2006
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -167,7 +169,6 @@ _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
 	switch (dst_charset) {
 	case CH_DOS:
 	case CH_UNIX:
-	case CH_DISPLAY:
 		smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
 	default:
 		break;
@@ -327,7 +328,7 @@ _PUBLIC_ char *strchr_m(const char *src, char c)
 
 	for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 		if (*s == c)
-			return (char *)s;
+			return discard_const_p(char, s);
 	}
 
 	if (!*s)
@@ -395,7 +396,7 @@ _PUBLIC_ char *strrchr_m(const char *s, char c)
 					break;
 				}
 				/* No - we have a match ! */
-				return (char *)cp;
+				return discard_const_p(char , cp);
 			}
 		} while (cp-- != s);
 		if (!got_mb)
@@ -473,3 +474,84 @@ _PUBLIC_ bool strhasupper(const char *string)
 	struct smb_iconv_handle *ic = get_iconv_handle();
 	return strhasupper_handle(ic, string);
 }
+
+/***********************************************************************
+ strstr_m - Find findstr in src; returns a pointer into src, or NULL. We convert via ucs2 for now.
+***********************************************************************/
+
+char *strstr_m(const char *src, const char *findstr)
+{
+	smb_ucs2_t *p;
+	smb_ucs2_t *src_w, *find_w;
+	const char *s;
+	char *s2;
+	char *retp;
+	size_t converted_size, findstr_len = 0;
+
+	TALLOC_CTX *frame; /* Only set up in the iconv case */
+
+	/* for correctness: an empty findstr matches at the start of src */
+	if (!findstr[0]) {
+		return discard_const_p(char, src);
+	}
+
+	/* Samba does single character findstr calls a *lot*. */
+	if (findstr[1] == '\0')
+		return strchr_m(src, *findstr);
+
+	/* We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars) */
+
+	for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
+		if (*s == *findstr) {
+			if (!findstr_len)
+				findstr_len = strlen(findstr);
+
+			if (strncmp(s, findstr, findstr_len) == 0) {
+				return discard_const_p(char, s);
+			}
+		}
+	}
+	/* Hit the terminator before any byte with the high bit set: no match. */
+	if (!*s)
+		return NULL;
+
+#if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
+	/* 'make check' fails unless we do this */
+
+	/* With compose characters we must restart from the beginning. JRA. */
+	s = src;
+#endif
+	/* Slow path: convert both strings with push_ucs2_talloc() on a temporary frame. */
+	frame = talloc_stackframe();
+
+	if (!push_ucs2_talloc(frame, &src_w, src, &converted_size)) {
+		DEBUG(0,("strstr_m: src malloc fail\n"));
+		TALLOC_FREE(frame);
+		return NULL;
+	}
+
+	if (!push_ucs2_talloc(frame, &find_w, findstr, &converted_size)) {
+		DEBUG(0,("strstr_m: find malloc fail\n"));
+		TALLOC_FREE(frame);
+		return NULL;
+	}
+
+	p = strstr_w(src_w, find_w);
+
+	if (!p) {
+		TALLOC_FREE(frame);
+		return NULL;
+	}
+	/* Terminate src_w at the match; the length of that prefix converted back (s2) is the offset of the match from s. */
+	*p = 0;
+	if (!pull_ucs2_talloc(frame, &s2, src_w, &converted_size)) {
+		TALLOC_FREE(frame);
+		DEBUG(0,("strstr_m: dest malloc fail\n"));
+		return NULL;
+	}
+	retp = discard_const_p(char, (s+strlen(s2)));
+	TALLOC_FREE(frame);
+	return retp;
+}
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c
index a1be501c7c..e4ae65053c 100644
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@@ -161,85 +161,6 @@ _PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src)
 }
 
 /**
- Convert a string to lower case.
-**/
-_PUBLIC_ void strlower_m(char *s)
-{
-	char *d;
-	struct smb_iconv_handle *iconv_handle;
-
-	/* this is quite a common operation, so we want it to be
-	   fast. We optimise for the ascii case, knowing that all our
-	   supported multi-byte character sets are ascii-compatible
-	   (ie. they match for the first 128 chars) */
-	while (*s && !(((uint8_t)*s) & 0x80)) {
-		*s = tolower((uint8_t)*s);
-		s++;
-	}
-
-	if (!*s)
-		return;
-
-	iconv_handle = get_iconv_handle();
-
-	d = s;
-
-	while (*s) {
-		size_t c_size, c_size2;
-		codepoint_t c = next_codepoint_handle(iconv_handle, s, &c_size);
-		c_size2 = push_codepoint_handle(iconv_handle, d, tolower_m(c));
-		if (c_size2 > c_size) {
-			DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
-				 c, tolower_m(c), (int)c_size, (int)c_size2));
-			smb_panic("codepoint expansion in strlower_m\n");
-		}
-		s += c_size;
-		d += c_size2;
-	}
-	*d = 0;
-}
-
-/**
- Convert a string to UPPER case.
-**/
-_PUBLIC_ void strupper_m(char *s)
-{
-	char *d;
-	struct smb_iconv_handle *iconv_handle;
-
-	/* this is quite a common operation, so we want it to be
-	   fast. We optimise for the ascii case, knowing that all our
-	   supported multi-byte character sets are ascii-compatible
-	   (ie. they match for the first 128 chars) */
-	while (*s && !(((uint8_t)*s) & 0x80)) {
-		*s = toupper((uint8_t)*s);
-		s++;
-	}
-
-	if (!*s)
-		return;
-
-	iconv_handle = get_iconv_handle();
-
-	d = s;
-
-	while (*s) {
-		size_t c_size, c_size2;
-		codepoint_t c = next_codepoint_handle(iconv_handle, s, &c_size);
-		c_size2 = push_codepoint_handle(iconv_handle, d, toupper_m(c));
-		if (c_size2 > c_size) {
-			DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
-				 c, toupper_m(c), (int)c_size, (int)c_size2));
-			smb_panic("codepoint expansion in strupper_m\n");
-		}
-		s += c_size;
-		d += c_size2;
-	}
-	*d = 0;
-}
-
-
-/**
  Find the number of 'c' chars in a string
 **/
 _PUBLIC_ size_t count_chars_m(const char *s, char c)
@@ -273,7 +194,7 @@ _PUBLIC_ size_t count_chars_m(const char *s, char c)
  * @param dest_len the maximum length in bytes allowed in the
  * destination.  If @p dest_len is -1 then no maximum is used.
  **/
-static bool push_ascii(void *dest, const char *src, size_t dest_len, int flags, size_t *converted_size)
+static bool push_ascii_string(void *dest, const char *src, size_t dest_len, int flags, size_t *converted_size)
 {
 	size_t src_len;
 	bool ret;
@@ -283,7 +204,7 @@ static bool push_ascii(void *dest, const char *src, size_t dest_len, int flags,
 		if (tmpbuf == NULL) {
 			return false;
 		}
-		ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER, converted_size);
+		ret = push_ascii_string(dest, tmpbuf, dest_len, flags & ~STR_UPPER, converted_size);
 		talloc_free(tmpbuf);
 		return ret;
 	}
@@ -297,23 +218,6 @@ static bool push_ascii(void *dest, const char *src, size_t dest_len, int flags,
 }
 
 /**
- * Copy a string from a unix char* src to an ASCII destination,
- * allocating a buffer using talloc().
- *
- * @param dest always set at least to NULL 
- *
- * @returns The number of bytes occupied by the string in the destination
- *         or -1 in case of error.
- **/
-_PUBLIC_ bool push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size)
-{
-	size_t src_len = strlen(src)+1;
-	*dest = NULL;
-	return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest, converted_size);
-}
-
-
-/**
  * Copy a string from a dos codepage source to a unix char* destination.
  *
  * The resulting string in "dest" is always null terminated.
@@ -328,7 +232,7 @@ _PUBLIC_ bool push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, s
  * @param src_len is the length of the source area in bytes.
  * @returns the number of bytes occupied by the string in @p src.
  **/
-static ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
+static ssize_t pull_ascii_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
 {
 	size_t size = 0;
 
@@ -411,38 +315,6 @@ static ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags
 
 
 /**
- * Copy a string from a unix char* src to a UCS2 destination,
- * allocating a buffer using talloc().
- *
- * @param dest always set at least to NULL 
- *
- * @returns The number of bytes occupied by the string in the destination
- *         or -1 in case of error.
- **/
-_PUBLIC_ bool push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src, size_t *converted_size)
-{
-	size_t src_len = strlen(src)+1;
-	*dest = NULL;
-	return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, (void **)dest, converted_size);
-}
-
-
-/**
- * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
- *
- * @param dest always set at least to NULL 
- *
- * @returns The number of bytes occupied by the string in the destination
- **/
-
-_PUBLIC_ bool push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size)
-{
-	size_t src_len = strlen(src)+1;
-	*dest = NULL;
-	return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest, converted_size);
-}
-
-/**
  Copy a string from a ucs2 source to a unix char* destination.
  Flags can have:
   STR_TERMINATE means the string in src is null terminated.
@@ -484,51 +356,6 @@ static size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src
 }
 
 /**
- * Copy a string from a ASCII src to a unix char * destination, allocating a buffer using talloc
- *
- * @param dest always set at least to NULL 
- *
- * @returns The number of bytes occupied by the string in the destination
- **/
-
-_PUBLIC_ bool pull_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size)
-{
-	size_t src_len = strlen(src)+1;
-	*dest = NULL;
-	return convert_string_talloc(ctx, CH_DOS, CH_UNIX, src, src_len, (void **)dest, converted_size);
-}
-
-/**
- * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
- *
- * @param dest always set at least to NULL 
- *
- * @returns The number of bytes occupied by the string in the destination
- **/
-
-_PUBLIC_ bool pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const smb_ucs2_t *src, size_t *converted_size)
-{
-	size_t src_len = utf16_len(src);
-	*dest = NULL;
-	return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest, converted_size);
-}
-
-/**
- * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
- *
- * @param dest always set at least to NULL 
- *
- * @returns The number of bytes occupied by the string in the destination
- **/
-
-_PUBLIC_ bool pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size)
-{
-	size_t src_len = strlen(src)+1;
-	*dest = NULL;
-	return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest, converted_size);
-}
-
-/**
  Copy a string from a char* src to a unicode or ascii
  dos codepage destination choosing unicode or ascii based on the 
  flags in the SMB buffer starting at base_ptr.
@@ -546,7 +373,7 @@ _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int f
 {
 	if (flags & STR_ASCII) {
 		size_t size = 0;
-		if (push_ascii(dest, src, dest_len, flags, &size)) {
+		if (push_ascii_string(dest, src, dest_len, flags, &size)) {
 			return (ssize_t)size;
 		} else {
 			return (ssize_t)-1;
@@ -577,7 +404,7 @@ _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int f
 _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
 {
 	if (flags & STR_ASCII) {
-		return pull_ascii(dest, src, dest_len, src_len, flags);
+		return pull_ascii_string(dest, src, dest_len, src_len, flags);
 	} else if (flags & STR_UNICODE) {
 		return pull_ucs2(dest, src, dest_len, src_len, flags);
 	} else {
@@ -585,68 +412,3 @@ _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_
 		return -1;
 	}
 }
-
-
-/**
- * Convert string from one encoding to another, making error checking etc
- *
- * @param src pointer to source string (multibyte or singlebyte)
- * @param srclen length of the source string in bytes
- * @param dest pointer to destination string (multibyte or singlebyte)
- * @param destlen maximal length allowed for string
- * @param converted_size the number of bytes occupied in the destination
- *
- * @returns true on success, false on fail.
- **/
-_PUBLIC_ bool convert_string(charset_t from, charset_t to,
-			       void const *src, size_t srclen, 
-			       void *dest, size_t destlen,
-			       size_t *converted_size)
-{
-	return convert_string_handle(get_iconv_handle(), from, to,
-					src, srclen,
-					dest, destlen, converted_size);
-}
-
-/**
- * Convert string from one encoding to another, making error checking etc
- *
- * @param src pointer to source string (multibyte or singlebyte)
- * @param srclen length of the source string in bytes
- * @param dest pointer to destination string (multibyte or singlebyte)
- * @param destlen maximal length allowed for string
- * @param converted_size the number of bytes occupied in the destination
- *
- * @returns true on success, false on fail.
- **/
-_PUBLIC_ bool convert_string_error(charset_t from, charset_t to,
-				   void const *src, size_t srclen,
-				   void *dest, size_t destlen,
-				   size_t *converted_size)
-{
-	return convert_string_error_handle(get_iconv_handle(), from, to,
-					   src, srclen,
-					   dest, destlen, converted_size);
-}
-
-/**
- * Convert between character sets, allocating a new buffer using talloc for the result.
- *
- * @param srclen length of source buffer.
- * @param dest always set at least to NULL
- * @param converted_size Size in bytes of the converted string
- * @note -1 is not accepted for srclen.
- *
- * @returns boolean indication whether the conversion succeeded
- **/
-
-_PUBLIC_ bool convert_string_talloc(TALLOC_CTX *ctx, 
-				    charset_t from, charset_t to, 
-				    void const *src, size_t srclen, 
-				    void *dest, size_t *converted_size)
-{
-	return convert_string_talloc_handle(ctx, get_iconv_handle(),
-						 from, to, src, srclen, dest,
-						 converted_size);
-}
-
diff --git a/lib/util/charset/util_unistr_w.c b/lib/util/charset/util_unistr_w.c
index a550e52776..3fbed7f67c 100644
--- a/lib/util/charset/util_unistr_w.c
+++ b/lib/util/charset/util_unistr_w.c
@@ -22,8 +22,8 @@
 #include "includes.h"
 
 /* Copy into a smb_ucs2_t from a possibly unaligned buffer. Return the copied smb_ucs2_t */
-#define COPY_UCS2_CHAR(dest,src) (((unsigned char *)(dest))[0] = ((unsigned char *)(src))[0],\
-				((unsigned char *)(dest))[1] = ((unsigned char *)(src))[1], (dest))
+#define COPY_UCS2_CHAR(dest,src) (((unsigned char *)(dest))[0] = ((const unsigned char *)(src))[0],\
+				((unsigned char *)(dest))[1] = ((const unsigned char *)(src))[1], (dest))
 
 
 /* return an ascii version of a ucs2 character */
@@ -72,12 +72,12 @@ smb_ucs2_t *strchr_w(const smb_ucs2_t *s, smb_ucs2_t c)
 	smb_ucs2_t cp;
 	while (*(COPY_UCS2_CHAR(&cp,s))) {
 		if (c == cp) {
-			return (smb_ucs2_t *)s;
+			return discard_const_p(smb_ucs2_t, s);
 		}
 		s++;
 	}
 	if (c == cp) {
-		return (smb_ucs2_t *)s;
+		return discard_const_p(smb_ucs2_t, s);
 	}
 
 	return NULL;
@@ -104,7 +104,7 @@ smb_ucs2_t *strrchr_w(const smb_ucs2_t *s, smb_ucs2_t c)
 	p += (len - 1);
 	do {
 		if (c == *(COPY_UCS2_CHAR(&cp,p))) {
-			return (smb_ucs2_t *)p;
+			return discard_const_p(smb_ucs2_t, p);
 		}
 	} while (p-- != s);
 	return NULL;
@@ -234,38 +234,6 @@ static int strncmp_w(const smb_ucs2_t *a, const smb_ucs2_t *b, size_t len)
 	return (len - n)?(*(COPY_UCS2_CHAR(&cpa,a)) - *(COPY_UCS2_CHAR(&cpb,b))):0;
 }
 
-/*******************************************************************
- Case insensitive string comparison.
-********************************************************************/
-
-int strcasecmp_w(const smb_ucs2_t *a, const smb_ucs2_t *b)
-{
-	smb_ucs2_t cpa, cpb;
-
-	while ((*COPY_UCS2_CHAR(&cpb,b)) && toupper_m(*(COPY_UCS2_CHAR(&cpa,a))) == toupper_m(cpb)) {
-		a++;
-		b++;
-	}
-	return (tolower_m(*(COPY_UCS2_CHAR(&cpa,a))) - tolower_m(*(COPY_UCS2_CHAR(&cpb,b))));
-}
-
-/*******************************************************************
- Case insensitive string comparison, length limited.
-********************************************************************/
-
-int strncasecmp_w(const smb_ucs2_t *a, const smb_ucs2_t *b, size_t len)
-{
-	smb_ucs2_t cpa, cpb;
-	size_t n = 0;
-
-	while ((n < len) && *COPY_UCS2_CHAR(&cpb,b) && (toupper_m(*(COPY_UCS2_CHAR(&cpa,a))) == toupper_m(cpb))) {
-		a++;
-		b++;
-		n++;
-	}
-	return (len - n)?(tolower_m(*(COPY_UCS2_CHAR(&cpa,a))) - tolower_m(*(COPY_UCS2_CHAR(&cpb,b)))):0;
-}
-
 /*
   The *_wa() functions take a combination of 7 bit ascii
   and wide characters They are used so that you can use string
diff --git a/lib/util/charset/weird.c b/lib/util/charset/weird.c
new file mode 100644
index 0000000000..5db8cdcecd
--- /dev/null
+++ b/lib/util/charset/weird.c
@@ -0,0 +1,134 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba module with developer tools
+   Copyright (C) Andrew Tridgell 2001
+   Copyright (C) Jelmer Vernooij 2002
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+
+static struct {
+	char from;
+	const char *to;
+	int len;
+} weird_table[] = {
+	{'q', "^q^", 3},
+	{'Q', "^Q^", 3},
+	{0, NULL}
+};
+
+static size_t weird_pull(void *cd, const char **inbuf, size_t *inbytesleft,
+			 char **outbuf, size_t *outbytesleft)
+{
+	/* Decode: each "^q^"/"^Q^" escape yields q/Q, any other byte is
+	 * copied as is; every output unit is two bytes (value, then 0).
+	 * Returns 0, or (size_t)-1 with errno = E2BIG if input is left over. */
+	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
+		char out = (*inbuf)[0];
+		size_t used = 1;
+		int i;
+
+		for (i=0;weird_table[i].from;i++) {
+			size_t len = (size_t)weird_table[i].len;
+
+			/*
+			 * Check the remaining length before comparing so that
+			 * strncmp() never reads past the end of the input; a
+			 * truncated escape is passed through byte by byte.
+			 */
+			if (*inbytesleft >= len &&
+			    strncmp(*inbuf, weird_table[i].to, len) == 0) {
+				out = weird_table[i].from;
+				used = len;
+				break;
+			}
+		}
+
+		(*outbuf)[0] = out;
+		(*outbuf)[1] = 0;
+		(*inbytesleft)  -= used;
+		(*outbytesleft) -= 2;
+		(*inbuf)  += used;
+		(*outbuf) += 2;
+	}
+
+	if (*inbytesleft > 0) {
+		errno = E2BIG;
+		return -1;
+	}
+
+	return 0;
+}
+
+static size_t weird_push(void *cd, const char **inbuf, size_t *inbytesleft,
+			 char **outbuf, size_t *outbytesleft)
+{
+	int ir_count=0; /* input units whose second byte was non-zero and got dropped */
+	/* Encode two-byte input units to single bytes, writing q/Q as "^q^"/"^Q^". */
+	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
+		int i;
+		int done=0;
+		for (i=0;weird_table[i].from;i++) {
+			if ((*inbuf)[0] == weird_table[i].from &&
+			    (*inbuf)[1] == 0) {
+				if (*outbytesleft < weird_table[i].len) {
+					DEBUG(0,("No room for weird character\n"));
+					/* smb_panic("weird_push"); */
+				} else {
+					memcpy(*outbuf, weird_table[i].to, 
+					       weird_table[i].len);
+					(*inbytesleft)  -= 2;
+					(*outbytesleft) -= weird_table[i].len;
+					(*inbuf)  += 2;
+					(*outbuf) += weird_table[i].len;
+					done = 1;
+					break;
+				}
+			}
+		}
+		if (done) continue;
+		/* Ordinary unit, or no room for the escape: emit the first byte only. */
+		(*outbuf)[0] = (*inbuf)[0];
+		if ((*inbuf)[1]) ir_count++;
+		(*inbytesleft)  -= 2;
+		(*outbytesleft) -= 1;
+		(*inbuf)  += 2;
+		(*outbuf) += 1;
+	}
+
+	if (*inbytesleft == 1) { /* a lone trailing byte is an incomplete input unit */
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (*inbytesleft > 1) { /* whole units remain, so the loop stopped for lack of output room */
+		errno = E2BIG;
+		return -1;
+	}
+	
+	return ir_count;
+}
+
+struct charset_functions weird_functions = {"WEIRD", weird_pull, weird_push};
+
+NTSTATUS charset_weird_init(void);
+NTSTATUS charset_weird_init(void)
+{
+	if (!smb_register_charset(&weird_functions)) {
+		return NT_STATUS_INTERNAL_ERROR;
+	}
+	return NT_STATUS_OK;
+}
diff --git a/lib/util/charset/wscript_build b/lib/util/charset/wscript_build
index 29e168dce1..1f2c8dfa7a 100644
--- a/lib/util/charset/wscript_build
+++ b/lib/util/charset/wscript_build
@@ -1,18 +1,44 @@
 #!/usr/bin/env python
 
-
-if bld.env._SAMBA_BUILD_ == 4:
-    bld.SAMBA_SUBSYSTEM('CHARSET',
-                        source='charcnv.c util_unistr.c',
-                        public_deps='CODEPOINTS',
-                        public_headers='charset.h',
-                        )
-
 bld.SAMBA_SUBSYSTEM('ICONV_WRAPPER',
                     source='iconv.c',
                     public_deps='iconv replace talloc')
 
-bld.SAMBA_SUBSYSTEM('CODEPOINTS',
-	source='codepoints.c util_str.c util_unistr_w.c',
-	deps='DYNCONFIG ICONV_WRAPPER'
-	)
+bld.SAMBA_SUBSYSTEM('CHARSET',
+                    public_headers='charset.h',
+                    source='codepoints.c convert_string.c util_str.c util_unistr_w.c charcnv.c pull_push.c util_unistr.c',
+                    deps='DYNCONFIG ICONV_WRAPPER',
+                    public_deps='talloc')
+
+bld.SAMBA_MODULE('charset_weird',
+                 subsystem='CHARSET',
+                 source='weird.c',
+                 init_function='',
+                 deps='samba-util',
+                 internal_module=bld.SAMBA3_IS_STATIC_MODULE('charset_weird'),
+                 enabled=bld.SAMBA3_IS_ENABLED_MODULE('charset_weird'))
+
+bld.SAMBA_MODULE('charset_CP850',
+                 subsystem='CHARSET',
+                 source='CP850.c',
+                 init_function='',
+                 deps='samba-util',
+                 internal_module=bld.SAMBA3_IS_STATIC_MODULE('charset_CP850'),
+                 enabled=bld.SAMBA3_IS_ENABLED_MODULE('charset_CP850'))
+
+bld.SAMBA_MODULE('charset_CP437',
+                 subsystem='CHARSET',
+                 source='CP437.c',
+                 init_function='',
+                 deps='samba-util',
+                 internal_module=bld.SAMBA3_IS_STATIC_MODULE('charset_CP437'),
+                 enabled=bld.SAMBA3_IS_ENABLED_MODULE('charset_CP437'))
+
+bld.SAMBA_MODULE('charset_macosxfs',
+                 subsystem='CHARSET',
+                 source='charset_macosxfs.c',
+                 init_function='',
+                 internal_module=bld.SAMBA3_IS_STATIC_MODULE('charset_macosxfs'),
+                 enabled=bld.SAMBA3_IS_ENABLED_MODULE('charset_macosxfs'))
+
+
diff --git a/lib/util/data_blob.h b/lib/util/data_blob.h
index 83e6cd5f09..558ade9248 100644
--- a/lib/util/data_blob.h
+++ b/lib/util/data_blob.h
@@ -1,7 +1,10 @@
 /* 
    Unix SMB/CIFS implementation.
    DATA BLOB
-   
+
+   Copyright (C) Andrew Tridgell 2001
+   Copyright (C) Andrew Bartlett 2001
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
diff --git a/lib/util/debug.c b/lib/util/debug.c
index b0a78823fc..c1b33de6d1 100644
--- a/lib/util/debug.c
+++ b/lib/util/debug.c
@@ -203,7 +203,7 @@ void gfree_debugsyms(void)
 
 	TALLOC_FREE(format_bufr);
 
-	debug_num_classes = DBGC_MAX_FIXED;
+	debug_num_classes = 0;
 
 	state.initialized = false;
 }
diff --git a/lib/util/debug_s3.h b/lib/util/debug_s3.h
index 96b8ed74d9..9e5211b19b 100644
--- a/lib/util/debug_s3.h
+++ b/lib/util/debug_s3.h
@@ -17,6 +17,8 @@
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#include "librpc/gen_ndr/server_id.h"
+
 struct messaging_context;
 struct server_id;
 void debug_message(struct messaging_context *msg_ctx, void *private_data, uint32_t msg_type, struct server_id src, DATA_BLOB *data);
diff --git a/lib/util/dprintf.c b/lib/util/dprintf.c
index e9a15dcbe6..90ca36c1ae 100644
--- a/lib/util/dprintf.c
+++ b/lib/util/dprintf.c
@@ -33,58 +33,10 @@
 
 #include "includes.h"
 #include "system/locale.h"
-#include "param/param.h"
 
-static smb_iconv_t display_cd = (smb_iconv_t)-1;
-
-void d_set_iconv(smb_iconv_t cd)
+static int d_vfprintf(FILE *f, const char *format, va_list ap) 
 {
-	if (display_cd != (smb_iconv_t)-1)
-		talloc_free(display_cd);
-
-	display_cd = cd;
-}
-
-_PUBLIC_ int d_vfprintf(FILE *f, const char *format, va_list ap) 
-{
-	char *p, *p2;
-	int ret, clen;
-	va_list ap2;
-
-	/* If there's nothing to convert, take a shortcut */
-	if (display_cd == (smb_iconv_t)-1) {
-		return vfprintf(f, format, ap);
-	}
-
-	/* do any message translations */
-	va_copy(ap2, ap);
-	ret = vasprintf(&p, format, ap2);
-	va_end(ap2);
-
-	if (ret <= 0) return ret;
-
-	clen = iconv_talloc(NULL, display_cd, p, ret, (void **)&p2);
-        if (clen == -1) {
-		/* the string can't be converted - do the best we can,
-		   filling in non-printing chars with '?' */
-		int i;
-		for (i=0;i<ret;i++) {
-			if (isprint(p[i]) || isspace(p[i])) {
-				fwrite(p+i, 1, 1, f);
-			} else {
-				fwrite("?", 1, 1, f);
-			}
-		}
-		SAFE_FREE(p);
-		return ret;
-        }
-
-	/* good, its converted OK */
-	SAFE_FREE(p);
-	ret = fwrite(p2, 1, clen, f);
-	talloc_free(p2);
-
-	return ret;
+	return vfprintf(f, format, ap);
 }
 
 
@@ -100,15 +52,25 @@ _PUBLIC_ int d_fprintf(FILE *f, const char *format, ...)
 	return ret;
 }
 
-_PUBLIC_ int d_printf(const char *format, ...)
+static FILE *outfile;
+
+_PUBLIC_  int d_printf(const char *format, ...)
 {
 	int ret;
-	va_list ap;
-
-	va_start(ap, format);
-	ret = d_vfprintf(stdout, format, ap);
-	va_end(ap);
-
-	return ret;
+       va_list ap;
+       
+       if (!outfile) outfile = stdout;
+       
+       va_start(ap, format);
+       ret = d_vfprintf(outfile, format, ap);
+       va_end(ap);
+       
+       return ret;
 }
 
+/* interactive programs need a way to tell d_*() to write to stderr instead
+   of stdout */
+void display_set_stderr(void)
+{
+	outfile = stderr; /* d_printf() writes to outfile once it is set */
+}
diff --git a/lib/util/fault.c b/lib/util/fault.c
index 086dc33545..708dc670d1 100644
--- a/lib/util/fault.c
+++ b/lib/util/fault.c
@@ -119,7 +119,7 @@ static void smb_panic_default(const char *why)
 	if (panic_action && *panic_action) {
 		char pidstr[20];
 		char cmdstring[200];
-		safe_strcpy(cmdstring, panic_action, sizeof(cmdstring)-1);
+		strlcpy(cmdstring, panic_action, sizeof(cmdstring));
 		snprintf(pidstr, sizeof(pidstr), "%d", (int) getpid());
 		all_string_sub(cmdstring, "%PID%", pidstr, sizeof(cmdstring));
 		DEBUG(0, ("smb_panic(): calling panic action [%s]\n", cmdstring));
diff --git a/lib/util/ms_fnmatch.c b/lib/util/ms_fnmatch.c
index 73fb0e0966..1ba5888ca0 100644
--- a/lib/util/ms_fnmatch.c
+++ b/lib/util/ms_fnmatch.c
@@ -154,7 +154,7 @@ static int ms_fnmatch_core(const char *p, const char *n,
 	return -1;
 }
 
-int ms_fnmatch(const char *pattern, const char *string, enum protocol_types protocol)
+int ms_fnmatch_protocol(const char *pattern, const char *string, int protocol)
 {
 	int ret, count, i;
 	struct max_n *max_n = NULL;
@@ -192,7 +192,7 @@ int ms_fnmatch(const char *pattern, const char *string, enum protocol_types prot
 				p[i] = '<';
 			}
 		}
-		ret = ms_fnmatch(p, string, PROTOCOL_NT1);
+		ret = ms_fnmatch_protocol(p, string, PROTOCOL_NT1);
 		talloc_free(p);
 		return ret;
 	}
@@ -217,5 +217,5 @@ int ms_fnmatch(const char *pattern, const char *string, enum protocol_types prot
 /** a generic fnmatch function - uses for non-CIFS pattern matching */
 int gen_fnmatch(const char *pattern, const char *string)
 {
-	return ms_fnmatch(pattern, string, PROTOCOL_NT1);
+	return ms_fnmatch_protocol(pattern, string, PROTOCOL_NT1);
 }
diff --git a/lib/util/parmlist.c b/lib/util/parmlist.c
index 6658fa7e33..0f2f3af8ee 100644
--- a/lib/util/parmlist.c
+++ b/lib/util/parmlist.c
@@ -20,6 +20,8 @@
 #include "../lib/util/dlinklist.h"
 #include "../lib/util/parmlist.h"
 
+#undef strcasecmp
+
 struct parmlist_entry *parmlist_get(struct parmlist *ctx, const char *name)
 {
 	struct parmlist_entry *e;
diff --git a/lib/util/server_id.c b/lib/util/server_id.c
new file mode 100644
index 0000000000..195deeac7c
--- /dev/null
+++ b/lib/util/server_id.c
@@ -0,0 +1,41 @@
+/*
+   Unix SMB/CIFS implementation.
+   Samba utility functions
+   Copyright (C) Andrew Bartlett 2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "librpc/gen_ndr/server_id.h"
+
+char *server_id_str(TALLOC_CTX *mem_ctx, const struct server_id *id)
+{
+	/* Format as "vnn:pid.task_id", leaving out the parts that are unset. */
+	const unsigned long long pid = (unsigned long long)id->pid;
+	const unsigned task = (unsigned)id->task_id;
+
+	if (id->vnn != NONCLUSTER_VNN) {
+		/* A real vnn: always print the full triple. */
+		return talloc_asprintf(mem_ctx, "%u:%llu.%u",
+				       (unsigned)id->vnn, pid, task);
+	}
+
+	if (id->task_id != 0) {
+		/* No vnn but a task id: "pid.task_id". */
+		return talloc_asprintf(mem_ctx, "%llu.%u", pid, task);
+	}
+
+	return talloc_asprintf(mem_ctx, "%llu", pid);
+}
diff --git a/lib/util/string_wrappers.h b/lib/util/string_wrappers.h
index 75718e942b..37384fc5a3 100644
--- a/lib/util/string_wrappers.h
+++ b/lib/util/string_wrappers.h
@@ -41,28 +41,36 @@ size_t __unsafe_string_function_usage_here_size_t__(void);
 
 #endif /* HAVE_COMPILER_WILL_OPTIMIZE_OUT_FNS */
 
-#define safe_strcpy_base(dest, src, base, size) \
-    safe_strcpy(dest, src, size-PTR_DIFF(dest,base)-1)
+#define strlcpy_base(dest, src, base, size) /* copy into dest, limited to size minus the offset of dest from base; NULL src copies "" */ \
+do { \
+	const char *_strlcpy_base_src = (const char *)(src); \
+	strlcpy((dest), _strlcpy_base_src? _strlcpy_base_src : "", (size)-PTR_DIFF((dest),(base))); \
+} while (0)
 
 /* String copy functions - macro hell below adds 'type checking' (limited,
    but the best we can do in C) */
 
-#define fstrcpy(d,s) safe_strcpy((d),(s),sizeof(fstring)-1)
-#define fstrcat(d,s) safe_strcat((d),(s),sizeof(fstring)-1)
-#define nstrcpy(d,s) safe_strcpy((d), (s),sizeof(nstring)-1)
-#define unstrcpy(d,s) safe_strcpy((d), (s),sizeof(unstring)-1)
-
-/* the addition of the DEVELOPER checks in safe_strcpy means we must
- * update a lot of code. To make this a little easier here are some
- * functions that provide the lengths with less pain */
-
-/* overmalloc_safe_strcpy: DEPRECATED!  Used when you know the
- * destination buffer is longer than maxlength, but you don't know how
- * long.  This is not a good situation, because we can't do the normal
- * sanity checks. Don't use in new code! */
-
-#define overmalloc_safe_strcpy(dest,src,maxlength) \
-	safe_strcpy_fn(dest,src,maxlength)
+#define fstrcpy(d,s) /* copy s into d with a limit of sizeof(fstring); a NULL s copies "" */ \
+do { \
+	const char *_fstrcpy_src = (const char *)(s); \
+	strlcpy((d),_fstrcpy_src ? _fstrcpy_src : "",sizeof(fstring)); \
+} while (0)
+
+#define fstrcat(d,s) /* append s to d with a total limit of sizeof(fstring); a NULL s appends "" */ \
+do { \
+	const char *_fstrcat_src = (const char *)(s); \
+	strlcat((d),_fstrcat_src ? _fstrcat_src : "",sizeof(fstring)); \
+} while (0)
+#define nstrcpy(d,s) /* copy s into d with a limit of sizeof(nstring), not fstring; a NULL s copies "" */ \
+do { \
+	const char *_nstrcpy_src = (const char *)(s); \
+	strlcpy((d),_nstrcpy_src ? _nstrcpy_src : "",sizeof(nstring)); \
+} while (0)
+#define unstrcpy(d,s) /* copy s into d with a limit of sizeof(unstring), not fstring; a NULL s copies "" */ \
+do { \
+	const char *_unstrcpy_src = (const char *)(s); \
+	strlcpy((d),_unstrcpy_src ? _unstrcpy_src : "",sizeof(unstring)); \
+} while (0)
 
 #ifdef HAVE_COMPILER_WILL_OPTIMIZE_OUT_FNS
 
@@ -70,16 +78,6 @@ size_t __unsafe_string_function_usage_here_size_t__(void);
    have the correct types (this works only where sizeof() returns the size of the buffer, not
    the size of the pointer). */
 
-#define safe_strcpy(d, s, max_len) \
-    (CHECK_STRING_SIZE(d, max_len+1) \
-    ? __unsafe_string_function_usage_here__() \
-    : safe_strcpy_fn((d), (s), (max_len)))
-
-#define safe_strcat(d, s, max_len) \
-    (CHECK_STRING_SIZE(d, max_len+1) \
-    ? __unsafe_string_function_usage_here__() \
-    : safe_strcat_fn((d), (s), (max_len)))
-
 #define push_string_check(dest, src, dest_len, flags) \
     (CHECK_STRING_SIZE(dest, dest_len) \
     ? __unsafe_string_function_usage_here_size_t__() \
@@ -113,8 +111,6 @@ size_t __unsafe_string_function_usage_here_size_t__(void);
 
 #else
 
-#define safe_strcpy safe_strcpy_fn
-#define safe_strcat safe_strcat_fn
 #define push_string_check push_string_check_fn
 #define clistr_push clistr_push_fn
 #define clistr_pull clistr_pull_fn
diff --git a/lib/util/substitute.c b/lib/util/substitute.c
index 32945a7213..500d12777f 100644
--- a/lib/util/substitute.c
+++ b/lib/util/substitute.c
@@ -29,18 +29,20 @@
  **/
 
 /**
- Substitute a string for a pattern in another string. Make sure there is 
+ Substitute a string for a pattern in another string. Make sure there is
  enough room!
 
- This routine looks for pattern in s and replaces it with 
- insert. It may do multiple replacements.
+ This routine looks for pattern in s and replaces it with
+ insert. It may do multiple replacements or just one.
 
  Any of " ; ' $ or ` in the insert string are replaced with _
  if len==0 then the string cannot be extended. This is different from the old
  use of len==0 which was for no length checks to be done.
 **/
 
-_PUBLIC_ void string_sub(char *s, const char *pattern, const char *insert, size_t len)
+static void string_sub2(char *s,const char *pattern, const char *insert, size_t len,
+			bool remove_unsafe_characters, bool replace_once,
+			bool allow_trailing_dollar)
 {
 	char *p;
 	ssize_t ls, lp, li, i;
@@ -55,9 +57,10 @@ _PUBLIC_ void string_sub(char *s, const char *pattern, const char *insert, size_
 	if (len == 0)
 		len = ls + 1; /* len is number of *bytes* */
 
-	while (lp <= ls && (p = strstr(s, pattern))) {
+	while (lp <= ls && (p = strstr_m(s,pattern))) {
 		if (ls + (li-lp) >= len) {
-			DEBUG(0,("ERROR: string overflow by %d in string_sub(%.50s, %d)\n", 
+			DEBUG(0,("ERROR: string overflow by "
+				"%d in string_sub(%.50s, %d)\n",
 				 (int)(ls + (li-lp) - len),
 				 pattern, (int)len));
 			break;
@@ -67,25 +70,50 @@ _PUBLIC_ void string_sub(char *s, const char *pattern, const char *insert, size_
 		}
 		for (i=0;i<li;i++) {
 			switch (insert[i]) {
+			case '$':
+				/* allow a trailing $
+				 * (as in machine accounts) */
+				if (allow_trailing_dollar && (i == li - 1 )) {
+					p[i] = insert[i];
+					break;
+				}
 			case '`':
 			case '"':
 			case '\'':
 			case ';':
-			case '$':
 			case '%':
 			case '\r':
 			case '\n':
-				p[i] = '_';
-				break;
+				if ( remove_unsafe_characters ) {
+					p[i] = '_';
+					/* yes this break should be here
+					 * since we want to fall through if
+					 * not replacing unsafe chars */
+					break;
+				}
 			default:
 				p[i] = insert[i];
 			}
 		}
 		s = p + li;
 		ls += (li-lp);
+
+		if (replace_once)
+			break;
 	}
 }
 
+void string_sub_once(char *s, const char *pattern,
+		const char *insert, size_t len)
+{
+	string_sub2( s, pattern, insert, len, true, true, false ); /* remove_unsafe_characters, replace_once, no allow_trailing_dollar */
+}
+
+void string_sub(char *s,const char *pattern, const char *insert, size_t len)
+{
+	string_sub2( s, pattern, insert, len, true, false, false ); /* remove_unsafe_characters only: replace every match, no allow_trailing_dollar */
+}
+
 /**
  * Talloc'ed version of string_sub
  */
@@ -146,13 +174,14 @@ _PUBLIC_ void all_string_sub(char *s,const char *pattern,const char *insert, siz
 
 	if (!*pattern)
 		return;
-	
+
 	if (len == 0)
 		len = ls + 1; /* len is number of *bytes* */
-	
-	while (lp <= ls && (p = strstr(s,pattern))) {
+
+	while (lp <= ls && (p = strstr_m(s,pattern))) {
 		if (ls + (li-lp) >= len) {
-			DEBUG(0,("ERROR: string overflow by %d in all_string_sub(%.50s, %d)\n", 
+			DEBUG(0,("ERROR: string overflow by "
+				"%d in all_string_sub(%.50s, %d)\n",
 				 (int)(ls + (li-lp) - len),
 				 pattern, (int)len));
 			break;
diff --git a/lib/util/system.c b/lib/util/system.c
index 9bf5de1a83..1e80f1a88a 100644
--- a/lib/util/system.c
+++ b/lib/util/system.c
@@ -22,6 +22,8 @@
 #include "system/network.h"
 #include "system/filesys.h"
 
+#undef malloc
+
 /*
    The idea is that this file will eventually have wrappers around all
    important system calls in samba. The aims are:
@@ -37,6 +39,42 @@
      expansions/etc make sense to the OS should be acceptable to Samba.
 */
 
+/*******************************************************************
+ A wrapper for memalign: returns the new allocation, or NULL on failure
+********************************************************************/
+
+void *sys_memalign( size_t align, size_t size )
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+	void *p = NULL;
+	int ret = posix_memalign( &p, align, size );
+	if ( ret == 0 )
+		return p;
+
+	return NULL;
+#elif defined(HAVE_MEMALIGN)
+	return memalign( align, size );
+#else
+	/* On *BSD systems memalign doesn't exist, but memory will
+	 * be aligned on allocations of > pagesize. */
+#if defined(SYSCONF_SC_PAGESIZE)
+	size_t pagesize = (size_t)sysconf(_SC_PAGESIZE);
+#elif defined(HAVE_GETPAGESIZE)
+	size_t pagesize = (size_t)getpagesize();
+#else
+	size_t pagesize = (size_t)-1;
+#endif
+	if (pagesize == (size_t)-1) {
+		DEBUG(0,("memalign functionality not available on this platform!\n"));
+		return NULL;
+	}
+	if (size < pagesize) {
+		size = pagesize; /* grow small requests to a full page; align itself is not consulted on this path */
+	}
+	return malloc(size);
+#endif
+}
+
 /**************************************************************************
 A wrapper for gethostbyname() that tries avoids looking up hostnames 
 in the root domain, which can cause dial-on-demand links to come up for no
@@ -117,3 +155,76 @@ _PUBLIC_ pid_t sys_getpid(void)
 
 	return mypid;
 }
+
+
+_PUBLIC_ int sys_getpeereid( int s, uid_t *uid)
+{
+	/* Store the peer's uid for socket s in *uid; returns 0 on success, -1 with errno set otherwise. */
+#if defined(HAVE_PEERCRED)
+	struct ucred cred;
+	socklen_t cred_len = sizeof(struct ucred);
+	int ret;
+
+	ret = getsockopt(s, SOL_SOCKET, SO_PEERCRED, (void *)&cred, &cred_len);
+	if (ret != 0) {
+		return -1;
+	}
+
+	if (cred_len != sizeof(struct ucred)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	*uid = cred.uid;
+	return 0;
+#elif defined(HAVE_GETPEEREID)
+	gid_t gid;
+	return getpeereid(s, uid, &gid);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+_PUBLIC_ int sys_getnameinfo(const struct sockaddr *psa,
+			     int salen,
+			     char *host,
+			     size_t hostlen,
+			     char *service,
+			     size_t servlen,
+			     int flags)
+{
+	/*
+	 * Wrapper for getnameinfo(). For Solaris we must make sure salen is
+	 * the correct length for the incoming sa_family, not the storage size.
+	 */
+
+	if (salen == sizeof(struct sockaddr_storage)) {
+		salen = sizeof(struct sockaddr_in); /* used for every family except AF_INET6 */
+#if defined(HAVE_IPV6)
+		if (psa->sa_family == AF_INET6) {
+			salen = sizeof(struct sockaddr_in6);
+		}
+#endif
+	}
+	return getnameinfo(psa, salen, host, hostlen, service, servlen, flags); /* getnameinfo()'s result is returned unchanged */
+}
+
+_PUBLIC_ int sys_connect(int fd, const struct sockaddr * addr)
+{
+	socklen_t salen = (socklen_t)-1; /* stays at this value for any family not handled below */
+
+	if (addr->sa_family == AF_INET) { /* pick the address length from the address family */
+	    salen = sizeof(struct sockaddr_in);
+	} else if (addr->sa_family == AF_UNIX) {
+	    salen = sizeof(struct sockaddr_un);
+	}
+#if defined(HAVE_IPV6)
+	else if (addr->sa_family == AF_INET6) {
+	    salen = sizeof(struct sockaddr_in6);
+	}
+#endif
+
+	return connect(fd, addr, salen); /* connect()'s result is returned unchanged */
+}
+
diff --git a/lib/util/talloc_stack.c b/lib/util/talloc_stack.c
index 8e559cc20f..16e9d745d3 100644
--- a/lib/util/talloc_stack.c
+++ b/lib/util/talloc_stack.c
@@ -188,3 +188,20 @@ TALLOC_CTX *talloc_tos(void)
 
 	return ts->talloc_stack[ts->talloc_stacksize-1];
 }
+
+/*
+ * return true if a talloc stackframe exists
+ * this can be used to prevent memory leaks for code that can
+ * optionally use a talloc stackframe (eg. nt_errstr())
+ */
+
+bool talloc_stackframe_exists(void)
+{
+	struct talloc_stackframe *ts =
+		(struct talloc_stackframe *)SMB_THREAD_GET_TLS(global_ts); /* same lookup key as the other functions in this file use */
+
+	if (ts == NULL || ts->talloc_stacksize == 0) { /* nothing stored under global_ts, or no frames on the stack */
+		return false;
+	}
+	return true;
+}
diff --git a/lib/util/talloc_stack.h b/lib/util/talloc_stack.h
index 0e8fab3759..ec0c1c6f37 100644
--- a/lib/util/talloc_stack.h
+++ b/lib/util/talloc_stack.h
@@ -53,4 +53,12 @@ TALLOC_CTX *talloc_stackframe_pool(size_t poolsize);
 
 TALLOC_CTX *talloc_tos(void);
 
+/*
+ * return true if a talloc stackframe exists
+ * this can be used to prevent memory leaks for code that can
+ * optionally use a talloc stackframe (eg. nt_errstr())
+ */
+
+bool talloc_stackframe_exists(void);
+
 #endif
diff --git a/lib/util/tdb_wrap.c b/lib/util/tdb_wrap.c
new file mode 100644
index 0000000000..71aea5e36c
--- /dev/null
+++ b/lib/util/tdb_wrap.c
@@ -0,0 +1,215 @@
+/* 
+   Unix SMB/CIFS implementation.
+   TDB wrap functions
+
+   Copyright (C) Andrew Tridgell 2004
+   Copyright (C) Jelmer Vernooij <jelmer@samba.org> 2007
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/tdb_wrap.h"
+
+/* FIXME: TDB2 does this internally, so no need to wrap multiple opens! */
+#if BUILD_TDB2
+static void tdb_wrap_log(struct tdb_context *tdb,
+			 enum tdb_log_level level,
+			 const char *message,
+			 void *unused)
+{
+	int dl; /* DEBUG() level derived from the tdb log level below */
+	const char *name = tdb_name(tdb);
+
+	switch (level) {
+	case TDB_LOG_USE_ERROR:
+	case TDB_LOG_ERROR:
+		dl = 0;
+		break;
+	case TDB_LOG_WARNING:
+		dl = 2;
+		break;
+	default:
+		dl = 0; /* any other level is also logged at DEBUG level 0 */
+	}
+
+	DEBUG(dl, ("tdb(%s): %s", name ? name : "unnamed", message)); /* message is prefixed with the tdb's name, or "unnamed" */
+}
+#else
+/*
+ Log tdb messages via DEBUG(), mapping the tdb debug level to a DEBUG level.
+*/
+static void tdb_wrap_log(TDB_CONTEXT *tdb, enum tdb_debug_level level, 
+			 const char *format, ...) PRINTF_ATTRIBUTE(3,4);
+
+static void tdb_wrap_log(TDB_CONTEXT *tdb, enum tdb_debug_level level, 
+			 const char *format, ...)
+{
+	va_list ap;
+	char *ptr = NULL; /* formatted message allocated by vasprintf() */
+	int debuglevel = 0;
+	int ret;
+
+	switch (level) {
+	case TDB_DEBUG_FATAL:
+		debuglevel = 0;
+		break;
+	case TDB_DEBUG_ERROR:
+		debuglevel = 1;
+		break;
+	case TDB_DEBUG_WARNING:
+		debuglevel = 2;
+		break;
+	case TDB_DEBUG_TRACE:
+		debuglevel = 5;
+		break;
+	default:
+		debuglevel = 0; /* any other level is logged at DEBUG level 0 */
+	}		
+
+	va_start(ap, format);
+	ret = vasprintf(&ptr, format, ap);
+	va_end(ap);
+
+	if (ret != -1) { /* if formatting failed, nothing is logged */
+		const char *name = tdb_name(tdb);
+		DEBUG(debuglevel, ("tdb(%s): %s", name ? name : "unnamed", ptr));
+		free(ptr);
+	}
+}
+#endif
+
+struct tdb_wrap_private {
+	struct tdb_context *tdb; /* handle returned by tdb_open_compat(); closed in the destructor */
+	const char *name; /* talloc copy of the name passed at open time; compared in tdb_wrap_open() */
+	struct tdb_wrap_private *next, *prev; /* links used by DLIST_ADD/DLIST_REMOVE on tdb_list */
+};
+
+static struct tdb_wrap_private *tdb_list; /* head of the list of tdbs opened through tdb_wrap_private_open() */
+
+/* destroy the last connection to a tdb: close it and unlink it from tdb_list */
+static int tdb_wrap_private_destructor(struct tdb_wrap_private *w)
+{
+	tdb_close(w->tdb);
+	DLIST_REMOVE(tdb_list, w);
+	return 0; /* always 0; installed via talloc_set_destructor() */
+}				 
+
+static struct tdb_wrap_private *tdb_wrap_private_open(TALLOC_CTX *mem_ctx,
+						      const char *name,
+						      int hash_size,
+						      int tdb_flags,
+						      int open_flags,
+						      mode_t mode)
+{
+	struct tdb_wrap_private *result; /* returned on success; NULL is returned on any failure */
+
+	result = talloc(mem_ctx, struct tdb_wrap_private);
+	if (result == NULL) {
+		return NULL;
+	}
+	result->name = talloc_strdup(result, name); /* private copy, later compared in tdb_wrap_open() */
+	if (result->name == NULL) {
+		goto fail;
+	}
+
+#if _SAMBA_BUILD_ == 3	
+	/* This #if _SAMBA_BUILD == 3 is very unfortunate, as it means
+	 * that in the top level build, these options are not
+	 * available for these databases.  However, having two
+	 * different tdb_wrap lists is a worse fate, so this will do
+	 * for now */
+
+	if (!lp_use_mmap()) {
+		tdb_flags |= TDB_NOMMAP;
+	}
+
+	if ((hash_size == 0) && (name != NULL)) {
+		const char *base; /* part of name after the last '/', or name itself */
+		base = strrchr_m(name, '/');
+
+		if (base != NULL) {
+			base += 1;
+		} else {
+			base = name;
+		}
+		hash_size = lp_parm_int(-1, "tdb_hashsize", base, 0); /* caller passed 0: look the size up per file, with 0 as the default */
+	}
+#endif
+
+	result->tdb = tdb_open_compat(name, hash_size, tdb_flags,
+				      open_flags, mode, tdb_wrap_log, NULL);
+	if (result->tdb == NULL) {
+		goto fail;
+	}
+	talloc_set_destructor(result, tdb_wrap_private_destructor); /* installed only once the tdb is open, so the fail path never calls tdb_close() */
+	DLIST_ADD(tdb_list, result);
+	return result;
+
+fail:
+	TALLOC_FREE(result);
+	return NULL;
+}
+
+/*
+  wrapped connection to a tdb database
+  to close just talloc_free() the tdb_wrap pointer
+ */
+struct tdb_wrap *tdb_wrap_open(TALLOC_CTX *mem_ctx,
+			       const char *name, int hash_size, int tdb_flags,
+			       int open_flags, mode_t mode)
+{
+	struct tdb_wrap *result; /* handle given to the caller; NULL is returned on failure */
+	struct tdb_wrap_private *w; /* list entry owning the tdb_context; may be shared by several handles */
+
+	result = talloc(mem_ctx, struct tdb_wrap);
+	if (result == NULL) {
+		return NULL;
+	}
+
+	for (w=tdb_list;w;w=w->next) { /* look for an entry that was opened under the same name */
+		if (strcmp(name, w->name) == 0) {
+			break;
+		}
+	}
+
+	if (w == NULL) { /* not open yet: hash_size, tdb_flags, open_flags and mode are only used on this path */
+		w = tdb_wrap_private_open(result, name, hash_size, tdb_flags,
+					  open_flags, mode);
+	} else {
+		/*
+		 * Correctly use talloc_reference: The tdb will be
+		 * closed when "w" is being freed. The caller never
+		 * sees "w", so an incorrect use of talloc_free(w)
+		 * instead of calling talloc_unlink is not possible.
+		 * To avoid having to refcount ourselves, "w" will
+		 * have multiple parents that hang off all the
+		 * tdb_wrap's being returned from here. Those parents
+		 * can be freed without problem.
+		 */
+		if (talloc_reference(result, w) == NULL) {
+			goto fail;
+		}
+	}
+	if (w == NULL) { /* only reachable when tdb_wrap_private_open() failed */
+		goto fail;
+	}
+	result->tdb = w->tdb;
+	return result;
+fail:
+	TALLOC_FREE(result);
+	return NULL;
+}
+
diff --git a/lib/util/tdb_wrap.h b/lib/util/tdb_wrap.h
new file mode 100644
index 0000000000..6f9f3834d4
--- /dev/null
+++ b/lib/util/tdb_wrap.h
@@ -0,0 +1,42 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   database wrap headers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* IMPORTANT: end consumer functions should always prefer tdb_wrap over a raw tdb_context.
+   The reason is that when the code runs inside smbd we must use the linked list
+   of open tdb files to determine whether the tdb we want is already open.
+   Otherwise, when you close the tdb (even on a different file descriptor),
+   ALL LOCKS are lost (due to a real screwup in the POSIX specification that nobody has been able to get fixed)
+*/
+
+#ifndef _TDB_WRAP_H_
+#define _TDB_WRAP_H_
+
+#include "tdb_compat.h"
+
+struct tdb_wrap {
+	struct tdb_context *tdb; /* the wrapped tdb handle; set by tdb_wrap_open() */
+};
+
+struct tdb_wrap *tdb_wrap_open(TALLOC_CTX *mem_ctx,
+			       const char *name, int hash_size, int tdb_flags,
+			       int open_flags, mode_t mode);
+
+#endif /* _TDB_WRAP_H_ */
diff --git a/lib/util/tests/asn1_tests.c b/lib/util/tests/asn1_tests.c
index ac8ca538f8..3ee64c3f7a 100644
--- a/lib/util/tests/asn1_tests.c
+++ b/lib/util/tests/asn1_tests.c
@@ -4,6 +4,8 @@
    util_asn1 testing
 
    Copyright (C) Kamen Mazdrashki <kamen.mazdrashki@postpath.com> 2009
+   Copyright (C) Volker Lendecke 2004
+   Copyright (C) Andrew Bartlett 2011
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -103,6 +105,55 @@ static const struct oid_data partial_oid_data_ok[] = {
 	},
 };
 
+static const struct {
+	DATA_BLOB blob; /* expected encoding: in every entry below, byte 0 is 0x02 and byte 1 is the count of content bytes */
+	int value; /* integer handed to asn1_write_Integer() and expected back from asn1_read_Integer() */
+} integer_tests[] = {
+        {
+		.blob = {"\x02\x01\x00", 3},
+		.value = 0
+	},
+	{
+		.blob = {"\x02\x01\x7f", 3},
+		.value = 127
+	},
+	{
+		.blob = {"\x02\x02\x00\x80", 4}, /* two content bytes, the first one 0x00 */
+		.value = 128
+	},
+	{
+		.blob = {"\x02\x02\x01\x00", 4},
+		.value = 256
+	},
+	{
+		.blob = {"\x02\x01\x80", 3},
+		.value = -128
+	},
+	{
+		.blob = {"\x02\x02\xff\x7f", 4},
+		.value = -129
+	},
+	{
+		.blob = {"\x02\x01\xff", 3},
+		.value = -1
+	},
+	{
+		.blob = {"\x02\x02\xff\x01", 4},
+		.value = -255
+	},
+	{
+		.blob = {"\x02\x02\x00\xff", 4},
+		.value = 255
+	},
+	{
+		.blob = {"\x02\x04\x80\x00\x00\x00", 6},
+		.value = 0x80000000 /* NOTE(review): does not fit a 32-bit int; presumably relies on the conversion yielding INT_MIN - confirm */
+	},
+	{
+		.blob = {"\x02\x04\x7f\xff\xff\xff", 6},
+		.value = 0x7fffffff
+	}
+};
 
 /* Testing ber_write_OID_String() function */
 static bool test_ber_write_OID_String(struct torture_context *tctx)
@@ -260,6 +311,46 @@ static bool test_ber_read_partial_OID_String(struct torture_context *tctx)
 	return true;
 }
 
+/*
+ * Testing asn1_read_Integer and asn1_write_Integer functions,
+ * inspired by Love Hornquist Astrand
+ */
+
+static bool test_asn1_Integer(struct torture_context *tctx)
+{
+	int i;
+	TALLOC_CTX *mem_ctx; /* parent for every ASN1_DATA created in the loop; freed at the end */
+
+	mem_ctx = talloc_new(tctx);
+
+	for (i = 0; i < ARRAY_SIZE(integer_tests); i++) { /* one write/compare/read-back round per table entry */
+		ASN1_DATA *data;
+		DATA_BLOB blob;
+		int val;
+
+		data = asn1_init(mem_ctx);
+		if (!data) {
+			return false; /* must be false: any non-zero int converts to true and would report success */
+		}
+
+		asn1_write_Integer(data, integer_tests[i].value);
+
+		blob.data = data->data;
+		blob.length = data->length;
+		torture_assert_data_blob_equal(tctx, blob, integer_tests[i].blob, "asn1_write_Integer gave incorrect result");
+
+		asn1_load(data, blob);
+		torture_assert(tctx, asn1_read_Integer(data, &val), "asn1_write_Integer output could not be read by asn1_read_Integer()");
+
+		torture_assert_int_equal(tctx, val, integer_tests[i].value,
+			"readback of asn1_write_Integer output by asn1_read_Integer() failed");
+	}
+
+	talloc_free(mem_ctx);
+
+	return true;
+}
+
 
 /* LOCAL-ASN1 test suite creation */
 struct torture_suite *torture_local_util_asn1(TALLOC_CTX *mem_ctx)
@@ -278,5 +369,8 @@ struct torture_suite *torture_local_util_asn1(TALLOC_CTX *mem_ctx)
 	torture_suite_add_simple_test(suite, "ber_read_partial_OID_String",
 				      test_ber_read_partial_OID_String);
 
+	torture_suite_add_simple_test(suite, "asn1_Integer",
+				      test_asn1_Integer);
+
 	return suite;
 }
diff --git a/lib/util/tests/str.c b/lib/util/tests/str.c
index 6b38feaf43..f9f3abf731 100644
--- a/lib/util/tests/str.c
+++ b/lib/util/tests/str.c
@@ -25,7 +25,7 @@
 static bool test_string_sub_simple(struct torture_context *tctx)
 {
 	char tmp[100];
-	safe_strcpy(tmp, "foobar", sizeof(tmp));
+	strlcpy(tmp, "foobar", sizeof(tmp));
 	string_sub(tmp, "foo", "bar", sizeof(tmp));
 	torture_assert_str_equal(tctx, tmp, "barbar", "invalid sub");
 	return true;
@@ -34,7 +34,7 @@ static bool test_string_sub_simple(struct torture_context *tctx)
 static bool test_string_sub_multiple(struct torture_context *tctx)
 {
 	char tmp[100];
-	safe_strcpy(tmp, "fooblafoo", sizeof(tmp));
+	strlcpy(tmp, "fooblafoo", sizeof(tmp));
 	string_sub(tmp, "foo", "bar", sizeof(tmp));
 	torture_assert_str_equal(tctx, tmp, "barblabar", "invalid sub");
 	return true;
@@ -43,7 +43,7 @@ static bool test_string_sub_multiple(struct torture_context *tctx)
 static bool test_string_sub_longer(struct torture_context *tctx)
 {
 	char tmp[100];
-	safe_strcpy(tmp, "foobla", sizeof(tmp));
+	strlcpy(tmp, "foobla", sizeof(tmp));
 	string_sub(tmp, "foo", "blie", sizeof(tmp));
 	torture_assert_str_equal(tctx, tmp, "bliebla", "invalid sub");
 	return true;
@@ -52,7 +52,7 @@ static bool test_string_sub_longer(struct torture_context *tctx)
 static bool test_string_sub_shorter(struct torture_context *tctx)
 {
 	char tmp[100];
-	safe_strcpy(tmp, "foobla", sizeof(tmp));
+	strlcpy(tmp, "foobla", sizeof(tmp));
 	string_sub(tmp, "foo", "bl", sizeof(tmp));
 	torture_assert_str_equal(tctx, tmp, "blbla", "invalid sub");
 	return true;
@@ -61,7 +61,7 @@ static bool test_string_sub_shorter(struct torture_context *tctx)
 static bool test_string_sub_special_char(struct torture_context *tctx)
 {
 	char tmp[100];
-	safe_strcpy(tmp, "foobla", sizeof(tmp));
+	strlcpy(tmp, "foobla", sizeof(tmp));
 	string_sub(tmp, "foo", "%b;l", sizeof(tmp));
 	torture_assert_str_equal(tctx, tmp, "_b_lbla", "invalid sub");
 	return true;
diff --git a/lib/util/tests/time.c b/lib/util/tests/time.c
index 592f88f88b..a8b26762e3 100644
--- a/lib/util/tests/time.c
+++ b/lib/util/tests/time.c
@@ -81,29 +81,11 @@ static bool test_timestring(struct torture_context *tctx)
 	return true;
 }
 
-static bool test_get_time_zone(struct torture_context *tctx)
-{
-	time_t t = time(NULL);
-	int old_extra_time_offset = extra_time_offset;
-	int old_offset, new_offset;
-	/* test that extra_time_offset works */
-
-	old_offset = get_time_zone(t);
-	extra_time_offset = 42;
-	new_offset = get_time_zone(t);
-	extra_time_offset = old_extra_time_offset;
-	torture_assert_int_equal(tctx, old_offset+60*42, new_offset,
-				 "time offset not used");
-	return true;
-}
-
-
 struct torture_suite *torture_local_util_time(TALLOC_CTX *mem_ctx)
 {
 	struct torture_suite *suite = torture_suite_create(mem_ctx, "time");
 
 	torture_suite_add_simple_test(suite, "null_time", test_null_time);
-	torture_suite_add_simple_test(suite, "get_time_zone", test_get_time_zone);
 	torture_suite_add_simple_test(suite, "null_nttime", test_null_nttime);
 	torture_suite_add_simple_test(suite, "http_timestring", 
 								  test_http_timestring);
diff --git a/lib/util/time.c b/lib/util/time.c
index 4843fc9697..31aa05cd0f 100644
--- a/lib/util/time.c
+++ b/lib/util/time.c
@@ -580,6 +580,24 @@ _PUBLIC_ struct timeval timeval_current_ofs(uint32_t secs, uint32_t usecs)
 }
 
 /**
+  return a timeval milliseconds into the future
+*/
+_PUBLIC_ struct timeval timeval_current_ofs_msec(uint32_t msecs)
+{
+	struct timeval tv = timeval_current();
+	return timeval_add(&tv, msecs / 1000, (msecs % 1000) * 1000); /* whole seconds, then the leftover milliseconds scaled by 1000 (ms -> us) */
+}
+
+/**
+  return a timeval microseconds into the future
+*/
+_PUBLIC_ struct timeval timeval_current_ofs_usec(uint32_t usecs)
+{
+	struct timeval tv = timeval_current();
+	return timeval_add(&tv, usecs / 1000000, usecs % 1000000); /* whole seconds, then the leftover microseconds (always below 1000000) */
+}
+
+/**
   compare two timeval structures. 
   Return -1 if tv1 < tv2
   Return 0 if tv1 == tv2
@@ -720,8 +738,6 @@ static int tm_diff(struct tm *a, struct tm *b)
 }
 
 
-int extra_time_offset=0;
-
 /**
   return the UTC offset in seconds west of UTC, or 0 if it cannot be determined
  */
@@ -735,7 +751,7 @@ _PUBLIC_ int get_time_zone(time_t t)
 	tm = localtime(&t);
 	if (!tm)
 		return 0;
-	return tm_diff(&tm_utc,tm)+60*extra_time_offset;
+	return tm_diff(&tm_utc,tm);
 }
 
 struct timespec nt_time_to_unix_timespec(NTTIME *nt)
diff --git a/lib/util/time.h b/lib/util/time.h
index 3a406340f4..204c261c1d 100644
--- a/lib/util/time.h
+++ b/lib/util/time.h
@@ -1,7 +1,12 @@
 /* 
    Unix SMB/CIFS implementation.
    time utility functions
-   
+
+   Copyright (C) Andrew Tridgell 		1992-2004
+   Copyright (C) Stefan (metze) Metzmacher	2002
+   Copyright (C) Jeremy Allison			2007
+   Copyright (C) Andrew Bartlett                2011
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
@@ -213,6 +218,16 @@ struct timeval timeval_sum(const struct timeval *tv1,
 _PUBLIC_ struct timeval timeval_current_ofs(uint32_t secs, uint32_t usecs);
 
 /**
+  return a timeval milliseconds into the future
+*/
+_PUBLIC_ struct timeval timeval_current_ofs_msec(uint32_t msecs);
+
+/**
+  return a timeval microseconds into the future
+*/
+_PUBLIC_ struct timeval timeval_current_ofs_usec(uint32_t usecs);
+
+/**
   compare two timeval structures. 
   Return -1 if tv1 < tv2
   Return 0 if tv1 == tv2
@@ -285,7 +300,4 @@ struct timespec convert_time_t_to_timespec(time_t t);
 
 bool null_timespec(struct timespec ts);
 
-/** Extra minutes to add to the normal GMT to local time conversion. */
-extern int extra_time_offset;
-
 #endif /* _SAMBA_TIME_H_ */
diff --git a/lib/util/util.c b/lib/util/util.c
index d4a936fae9..7f30d436e8 100644
--- a/lib/util/util.c
+++ b/lib/util/util.c
@@ -152,7 +152,8 @@ _PUBLIC_ bool directory_create_or_exist(const char *dname, uid_t uid,
 		}
 		if ((st.st_mode & 0777) != dir_perms) {
 			DEBUG(0, ("invalid permissions on directory "
-				  "%s\n", dname));
+				  "'%s': has 0%o should be 0%o\n", dname,
+				  (st.st_mode & 0777), dir_perms));
 			umask(old_umask);
 			return false;
 		}
diff --git a/lib/util/util.h b/lib/util/util.h
index 45779912f3..c715440186 100644
--- a/lib/util/util.h
+++ b/lib/util/util.h
@@ -62,6 +62,8 @@ extern const char *panic_action;
 
 #include "lib/util/memory.h"
 
+#include "lib/util/string_wrappers.h"
+
 /**
  * Write backtrace to debug log
  */
@@ -113,6 +115,8 @@ void CatchChildLeaveStatus(void);
 
 /* The following definitions come from lib/util/system.c  */
 
+void *sys_memalign( size_t align, size_t size );
+
 /**************************************************************************
 A wrapper for gethostbyname() that tries avoids looking up hostnames 
 in the root domain, which can cause dial-on-demand links to come up for no
@@ -131,8 +135,20 @@ _PUBLIC_ pid_t sys_fork(void);
  **/
 _PUBLIC_ pid_t sys_getpid(void);
 
-/* The following definitions come from lib/util/genrand.c  */
+_PUBLIC_ int sys_getpeereid( int s, uid_t *uid);
+
+struct sockaddr;
+
+_PUBLIC_ int sys_getnameinfo(const struct sockaddr *psa,
+			     int salen,
+			     char *host,
+			     size_t hostlen,
+			     char *service,
+			     size_t servlen,
+			     int flags);
+_PUBLIC_ int sys_connect(int fd, const struct sockaddr * addr);
 
+/* The following definitions come from lib/util/genrand.c  */
 /**
  Copy any user given reseed data.
 **/
@@ -195,14 +211,10 @@ _PUBLIC_ char** generate_unique_strs(TALLOC_CTX *mem_ctx, size_t len,
                                          uint32_t num);
 
 /* The following definitions come from lib/util/dprintf.c  */
-#if _SAMBA_BUILD_ == 4
 
-_PUBLIC_ void d_set_iconv(smb_iconv_t);
-_PUBLIC_ int d_vfprintf(FILE *f, const char *format, va_list ap) PRINTF_ATTRIBUTE(2,0);
 _PUBLIC_ int d_fprintf(FILE *f, const char *format, ...) PRINTF_ATTRIBUTE(2,3);
 _PUBLIC_ int d_printf(const char *format, ...) PRINTF_ATTRIBUTE(1,2);
 _PUBLIC_ void display_set_stderr(void);
-#endif
 
 /* The following definitions come from lib/util/util_str.c  */
 
@@ -233,18 +245,6 @@ _PUBLIC_ bool trim_string(char *s, const char *front, const char *back);
 _PUBLIC_ _PURE_ size_t count_chars(const char *s, char c);
 
 /**
- Safe string copy into a known length string. maxlength does not
- include the terminating zero.
-**/
-_PUBLIC_ char *safe_strcpy(char *dest,const char *src, size_t maxlength);
-
-/**
- Safe string cat into a string. maxlength does not
- include the terminating zero.
-**/
-_PUBLIC_ char *safe_strcat(char *dest, const char *src, size_t maxlength);
-
-/**
  Routine to get hex characters and turn them into a 16 byte array.
  the array can be variable length, and any non-hex-numeric
  characters are skipped.  "0xnn" or "0Xnn" is specially catered
@@ -284,6 +284,8 @@ _PUBLIC_ char *hex_encode_talloc(TALLOC_CTX *mem_ctx, const unsigned char *buff_
 **/
 _PUBLIC_ void string_sub(char *s,const char *pattern, const char *insert, size_t len);
 
+_PUBLIC_ void string_sub_once(char *s, const char *pattern,
+			      const char *insert, size_t len);
 
 _PUBLIC_ char *string_sub_talloc(TALLOC_CTX *mem_ctx, const char *s, 
 				const char *pattern, const char *insert);
@@ -369,12 +371,10 @@ _PUBLIC_ bool set_boolean(const char *boolean_string, bool *boolean);
  */
 _PUBLIC_ bool conv_str_bool(const char * str, bool * val);
 
-#if _SAMBA_BUILD_ == 4
 /**
  * Convert a size specification like 16K into an integral number of bytes. 
  **/
-_PUBLIC_ bool conv_str_size(const char * str, uint64_t * val);
-#endif
+_PUBLIC_ bool conv_str_size_error(const char * str, uint64_t * val);
 
 /**
  * Parse a uint64_t value from a string
@@ -775,11 +775,12 @@ enum protocol_types {
 	PROTOCOL_SMB2
 };
 
-int ms_fnmatch(const char *pattern, const char *string, enum protocol_types protocol);
+#endif
+
+int ms_fnmatch_protocol(const char *pattern, const char *string, int protocol);
 
 /** a generic fnmatch function - uses for non-CIFS pattern matching */
 int gen_fnmatch(const char *pattern, const char *string);
-#endif
 
 /* The following definitions come from lib/util/idtree.c  */
 
@@ -886,4 +887,32 @@ int samba_runcmd_recv(struct tevent_req *req, int *perrno);
 void samba_start_debugger(void);
 #endif
 
+/**
+ * @brief Returns an absolute path to a file in the Samba modules directory.
+ *
+ * @param name File to find, relative to MODULESDIR.
+ *
+ * @retval Pointer to a string containing the full path.
+ **/
+char *modules_path(TALLOC_CTX *mem_ctx, const char *name);
+
+/**
+ * @brief Returns an absolute path to a file in the Samba data directory.
+ *
+ * @param name File to find, relative to CODEPAGEDIR.
+ *
+ * @retval Pointer to a talloc'ed string containing the full path.
+ **/
+char *data_path(TALLOC_CTX *mem_ctx, const char *name);
+
+/**
+ * @brief Returns the platform specific shared library extension.
+ *
+ * @retval Pointer to a const char * containing the extension.
+ **/
+const char *shlib_ext(void);
+
+struct server_id;
+char *server_id_str(TALLOC_CTX *mem_ctx, const struct server_id *id);
+
 #endif /* _SAMBA_UTIL_H_ */
diff --git a/lib/util/util_ldb.h b/lib/util/util_ldb.h
index d2bc3b0ff7..66916443c3 100644
--- a/lib/util/util_ldb.h
+++ b/lib/util/util_ldb.h
@@ -1,3 +1,26 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   common share info functions
+
+   Copyright (C) Andrew Tridgell 2004
+   Copyright (C) Tim Potter 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
 #ifndef __LIB_UTIL_UTIL_LDB_H__
 #define __LIB_UTIL_UTIL_LDB_H__
 
diff --git a/lib/util/util_net.c b/lib/util/util_net.c
index 9c8f5c6d47..64aa674d8b 100644
--- a/lib/util/util_net.c
+++ b/lib/util/util_net.c
@@ -54,6 +54,15 @@ bool interpret_string_addr_internal(struct addrinfo **ppres,
 
 	/* By default make sure it supports TCP. */
 	hints.ai_socktype = SOCK_STREAM;
+
+	/* always try as a numeric host first. This prevents unnecessary name
+	 * lookups, and also ensures we accept IPv6 addresses */
+	hints.ai_flags = AI_PASSIVE | AI_NUMERICHOST;
+	ret = getaddrinfo(str, NULL, &hints, ppres);
+	if (ret == 0) {
+		return true;
+	}
+
 	hints.ai_flags = flags;
 
 	/* Linux man page on getaddrinfo() says port will be
@@ -297,10 +306,10 @@ bool is_ipaddress_v4(const char *str)
 }
 
 /**
- * Return true if a string could be an IPv4 or IPv6 address.
+ * Return true if a string could be an IPv6 address.
  */
 
-bool is_ipaddress(const char *str)
+bool is_ipaddress_v6(const char *str)
 {
 #if defined(HAVE_IPV6)
 	int ret = -1;
@@ -328,7 +337,16 @@ bool is_ipaddress(const char *str)
 		}
 	}
 #endif
-	return is_ipaddress_v4(str);
+	return false;
+}
+
+/**
+ * Return true if a string could be an IPv4 or IPv6 address.
+ */
+
+bool is_ipaddress(const char *str)
+{
+	return is_ipaddress_v4(str) || is_ipaddress_v6(str); /* the v6 check only runs when the v4 check fails */
+}
 
 /**
@@ -405,7 +423,7 @@ bool is_zero_addr(const struct sockaddr_storage *pss)
  */
 void zero_ip_v4(struct in_addr *ip)
 {
-	memset(ip, '\0', sizeof(struct in_addr));
+	ZERO_STRUCTP(ip);
 }
 
 /**
@@ -415,7 +433,7 @@ void in_addr_to_sockaddr_storage(struct sockaddr_storage *ss,
 		struct in_addr ip)
 {
 	struct sockaddr_in *sa = (struct sockaddr_in *)ss;
-	memset(ss, '\0', sizeof(*ss));
+	ZERO_STRUCTP(ss);
 	sa->sin_family = AF_INET;
 	sa->sin_addr = ip;
 }
@@ -540,3 +558,319 @@ void set_sockaddr_port(struct sockaddr *psa, uint16_t port)
 }
 
 
+/****************************************************************************
+ Get a port number in host byte order from a sockaddr_storage.
+****************************************************************************/
+
+uint16_t get_sockaddr_port(const struct sockaddr_storage *pss)
+{
+	/* Only AF_INET and AF_INET6 carry a port; anything else yields 0. */
+	if (pss->ss_family == AF_INET) {
+		const struct sockaddr_in *sa =
+			(const struct sockaddr_in *)pss;
+		return ntohs(sa->sin_port);
+	}
+#if defined(HAVE_IPV6)
+	if (pss->ss_family == AF_INET6) {
+		const struct sockaddr_in6 *sa6 =
+			(const struct sockaddr_in6 *)pss;
+		return ntohs(sa6->sin6_port);
+	}
+#endif
+
+	return 0;
+}
+
+/****************************************************************************
+ Print the numeric host address of psa (psalen bytes) into dest; returns dest.
+****************************************************************************/
+
+char *print_sockaddr_len(char *dest,
+			 size_t destlen,
+			const struct sockaddr *psa,
+			socklen_t psalen)
+{
+	if (destlen > 0) {
+		dest[0] = '\0';	/* default to "" before attempting the lookup */
+	}
+	(void)sys_getnameinfo(psa,	/* return value deliberately ignored */
+			psalen,
+			dest, destlen,
+			NULL, 0,
+			NI_NUMERICHOST);
+	return dest;
+}
+
+/****************************************************************************
+ Print the numeric IPv4 or IPv6 address in a struct sockaddr_storage to dest.
+****************************************************************************/
+
+char *print_sockaddr(char *dest,
+			size_t destlen,
+			const struct sockaddr_storage *psa)
+{
+	return print_sockaddr_len(dest, destlen, (const struct sockaddr *)psa,
+			sizeof(struct sockaddr_storage));
+}
+
+/****************************************************************************
+ Return a talloc'ed canonical address string for pss, or NULL on failure.
+****************************************************************************/
+
+char *print_canonical_sockaddr(TALLOC_CTX *ctx,
+			const struct sockaddr_storage *pss)
+{
+	char addr[INET6_ADDRSTRLEN];
+	char *dest = NULL;
+	int ret;
+
+	/* Linux getnameinfo() man page says port is uninitialized if
+	   service name is NULL. */
+
+	ret = sys_getnameinfo((const struct sockaddr *)pss,
+			sizeof(struct sockaddr_storage),
+			addr, sizeof(addr),
+			NULL, 0,
+			NI_NUMERICHOST);
+	if (ret != 0) {
+		return NULL;
+	}
+
+	if (pss->ss_family != AF_INET) {
+#if defined(HAVE_IPV6)
+		dest = talloc_asprintf(ctx, "[%s]", addr);	/* any non-IPv4 family is bracketed */
+#else
+		return NULL;
+#endif
+	} else {
+		dest = talloc_asprintf(ctx, "%s", addr);
+	}
+
+	return dest;
+}
+
+/****************************************************************************
+ Return the port number we've bound to on a socket, or -1 on failure.
+****************************************************************************/
+
+int get_socket_port(int fd)
+{
+	struct sockaddr_storage sa;
+	socklen_t length = sizeof(sa);
+
+	if (fd == -1) {
+		return -1;
+	}
+
+	if (getsockname(fd, (struct sockaddr *)&sa, &length) < 0) {
+		int level = (errno == ENOTCONN) ? 2 : 0; /* debug level to log at */
+		DEBUG(level, ("getsockname failed. Error was %s\n",
+			       strerror(errno)));
+		return -1;
+	}
+
+#if defined(HAVE_IPV6)
+	if (sa.ss_family == AF_INET6) {
+		return ntohs(((struct sockaddr_in6 *)&sa)->sin6_port);
+	}
+#endif
+	if (sa.ss_family == AF_INET) {
+		return ntohs(((struct sockaddr_in *)&sa)->sin_port);
+	}
+	return -1;	/* family matched none of the checks above */
+}
+
+/****************************************************************************
+ Write the local address of socket fd (IPv4 or IPv6) into addr_buf.
+****************************************************************************/
+
+static const char *get_socket_addr(int fd, char *addr_buf, size_t addr_len)
+{
+	struct sockaddr_storage sa;
+	socklen_t length = sizeof(sa);
+
+	/* Ok, returning a hard coded IPv4 address
+	 * is bogus, but it's just as bogus as a
+	 * zero IPv6 address. No good choice here.
+	 */
+
+	strlcpy(addr_buf, "0.0.0.0", addr_len);
+
+	if (fd == -1) {
+		return addr_buf;
+	}
+
+	if (getsockname(fd, (struct sockaddr *)&sa, &length) < 0) {
+		DEBUG(0,("getsockname failed. Error was %s\n",
+			strerror(errno) ));
+		return addr_buf;
+	}
+
+	return print_sockaddr_len(addr_buf, addr_len, (struct sockaddr *)&sa, length);
+}
+
+/* Exported wrapper around the file-local get_socket_addr(). */
+const char *client_socket_addr(int fd, char *addr, size_t addr_len)
+{
+	return get_socket_addr(fd, addr, addr_len);
+}
+
+enum SOCK_OPT_TYPES {OPT_BOOL,OPT_INT,OPT_ON};
+
+typedef struct smb_socket_option {
+	const char *name;	/* option name matched by set_socket_options() */
+	int level;		/* level argument for get/setsockopt() */
+	int option;		/* option argument for get/setsockopt() */
+	int value;		/* value sent for OPT_ON entries */
+	int opttype;		/* one of enum SOCK_OPT_TYPES */
+} smb_socket_option;
+
+static const smb_socket_option socket_options[] = {
+  {"SO_KEEPALIVE", SOL_SOCKET, SO_KEEPALIVE, 0, OPT_BOOL},
+  {"SO_REUSEADDR", SOL_SOCKET, SO_REUSEADDR, 0, OPT_BOOL},
+  {"SO_BROADCAST", SOL_SOCKET, SO_BROADCAST, 0, OPT_BOOL},
+#ifdef TCP_NODELAY
+  {"TCP_NODELAY", IPPROTO_TCP, TCP_NODELAY, 0, OPT_BOOL},
+#endif
+#ifdef TCP_KEEPCNT
+  {"TCP_KEEPCNT", IPPROTO_TCP, TCP_KEEPCNT, 0, OPT_INT},
+#endif
+#ifdef TCP_KEEPIDLE
+  {"TCP_KEEPIDLE", IPPROTO_TCP, TCP_KEEPIDLE, 0, OPT_INT},
+#endif
+#ifdef TCP_KEEPINTVL
+  {"TCP_KEEPINTVL", IPPROTO_TCP, TCP_KEEPINTVL, 0, OPT_INT},
+#endif
+#ifdef IPTOS_LOWDELAY
+  {"IPTOS_LOWDELAY", IPPROTO_IP, IP_TOS, IPTOS_LOWDELAY, OPT_ON},
+#endif
+#ifdef IPTOS_THROUGHPUT
+  {"IPTOS_THROUGHPUT", IPPROTO_IP, IP_TOS, IPTOS_THROUGHPUT, OPT_ON},
+#endif
+#ifdef SO_REUSEPORT
+  {"SO_REUSEPORT", SOL_SOCKET, SO_REUSEPORT, 0, OPT_BOOL},
+#endif
+#ifdef SO_SNDBUF
+  {"SO_SNDBUF", SOL_SOCKET, SO_SNDBUF, 0, OPT_INT},
+#endif
+#ifdef SO_RCVBUF
+  {"SO_RCVBUF", SOL_SOCKET, SO_RCVBUF, 0, OPT_INT},
+#endif
+#ifdef SO_SNDLOWAT
+  {"SO_SNDLOWAT", SOL_SOCKET, SO_SNDLOWAT, 0, OPT_INT},
+#endif
+#ifdef SO_RCVLOWAT
+  {"SO_RCVLOWAT", SOL_SOCKET, SO_RCVLOWAT, 0, OPT_INT},
+#endif
+#ifdef SO_SNDTIMEO
+  {"SO_SNDTIMEO", SOL_SOCKET, SO_SNDTIMEO, 0, OPT_INT},
+#endif
+#ifdef SO_RCVTIMEO
+  {"SO_RCVTIMEO", SOL_SOCKET, SO_RCVTIMEO, 0, OPT_INT},
+#endif
+#ifdef TCP_FASTACK
+  {"TCP_FASTACK", IPPROTO_TCP, TCP_FASTACK, 0, OPT_INT},
+#endif
+#ifdef TCP_QUICKACK
+  {"TCP_QUICKACK", IPPROTO_TCP, TCP_QUICKACK, 0, OPT_BOOL},
+#endif
+#ifdef TCP_KEEPALIVE_THRESHOLD
+  {"TCP_KEEPALIVE_THRESHOLD", IPPROTO_TCP, TCP_KEEPALIVE_THRESHOLD, 0, OPT_INT},
+#endif
+#ifdef TCP_KEEPALIVE_ABORT_THRESHOLD
+  {"TCP_KEEPALIVE_ABORT_THRESHOLD", IPPROTO_TCP, TCP_KEEPALIVE_ABORT_THRESHOLD, 0, OPT_INT},
+#endif
+  {NULL,0,0,0,0}};
+
+/****************************************************************************
+ Print socket options.
+****************************************************************************/
+
+static void print_socket_options(int s)
+{
+	const smb_socket_option *p = &socket_options[0];
+
+	/* wrapped in if statement to prevent streams
+	 * leak in SCO Openserver 5.0 */
+	/* reported on samba-technical  --jerry */
+	if ( DEBUGLEVEL >= 5 ) {
+		DEBUG(5,("Socket options:\n"));
+		for (; p->name != NULL; p++) {
+			/* getsockopt() overwrites vlen: reset per option */
+			int value = 0;
+			socklen_t vlen = sizeof(value);
+			if (getsockopt(s, p->level, p->option,
+						(void *)&value, &vlen) == -1) {
+				DEBUGADD(5,("\tCould not test socket option %s.\n",
+							p->name));
+			} else {
+				DEBUGADD(5,("\t%s = %d\n", p->name, value));
+			}
+		}
+	}
+}
+
+/****************************************************************************
+ Apply a list of socket options (separated by space, tab or comma) to fd.
+****************************************************************************/
+
+void set_socket_options(int fd, const char *options)
+{
+	TALLOC_CTX *ctx = talloc_new(NULL);
+	char *tok;
+
+	while (next_token_talloc(ctx, &options, &tok," \t,")) {
+		int ret=0,i;
+		int value = 1;	/* used when the token has no "=value" part */
+		char *p;
+		bool got_value = false;
+
+		if ((p = strchr_m(tok,'='))) {
+			*p = 0;	/* cut tok at '=' so only the name remains */
+			value = atoi(p+1);
+			got_value = true;
+		}
+
+		for (i=0;socket_options[i].name;i++)
+			if (strequal(socket_options[i].name,tok))
+				break;
+
+		if (!socket_options[i].name) {
+			DEBUG(0,("Unknown socket option %s\n",tok));
+			continue;
+		}
+
+		switch (socket_options[i].opttype) {
+		case OPT_BOOL:
+		case OPT_INT:
+			ret = setsockopt(fd,socket_options[i].level,
+					socket_options[i].option,
+					(char *)&value,sizeof(int));
+			break;
+
+		case OPT_ON:
+			if (got_value)
+				DEBUG(0,("syntax error - %s "
+					"does not take a value\n",tok));
+
+			{
+				int on = socket_options[i].value; /* fixed value from the table */
+				ret = setsockopt(fd,socket_options[i].level,
+					socket_options[i].option,
+					(char *)&on,sizeof(int));
+			}
+			break;
+		}
+
+		if (ret != 0) {
+			/* be aware that some systems like Solaris return
+			 * EINVAL to a setsockopt() call when the client
+			 * sent a RST previously - no need to worry */
+			DEBUG(2,("Failed to set socket option %s (Error %s)\n",
+				tok, strerror(errno) ));
+		}
+	}
+
+	TALLOC_FREE(ctx);
+	print_socket_options(fd);
+}
diff --git a/lib/util/util_net.h b/lib/util/util_net.h
index 530311e5c8..fc2776a32b 100644
--- a/lib/util/util_net.h
+++ b/lib/util/util_net.h
@@ -50,6 +50,15 @@ void set_sockaddr_port(struct sockaddr *psa, uint16_t port);
 **/
 _PUBLIC_ bool is_zero_ip_v4(struct in_addr ip);
 
+void in_addr_to_sockaddr_storage(struct sockaddr_storage *ss,
+				 struct in_addr ip);
+#if defined(HAVE_IPV6)
+/**
+ * Convert an IPv6 struct in6_addr to a struct sockaddr_storage.
+ */
+void in6_addr_to_sockaddr_storage(struct sockaddr_storage *ss,
+				  struct in6_addr ip);
+#endif
 /**
  Are two IPs on the same subnet?
 **/
@@ -60,6 +69,11 @@ _PUBLIC_ bool same_net_v4(struct in_addr ip1,struct in_addr ip2,struct in_addr m
 **/
 _PUBLIC_ bool is_ipaddress(const char *str);
 
+bool is_broadcast_addr(const struct sockaddr *pss);
+bool is_loopback_ip_v4(struct in_addr ip);
+bool is_loopback_addr(const struct sockaddr *pss);
+bool is_zero_addr(const struct sockaddr_storage *pss);
+void zero_ip_v4(struct in_addr *ip);
 /**
  Interpret an internet address or name into an IP address in 4 byte form.
 **/
@@ -71,6 +85,30 @@ _PUBLIC_ uint32_t interpret_addr(const char *str);
 _PUBLIC_ struct in_addr interpret_addr2(const char *str);
 
 _PUBLIC_ bool is_ipaddress_v4(const char *str);
-
+_PUBLIC_ bool is_ipaddress_v6(const char *str);
+
+bool is_address_any(const struct sockaddr *psa);
+bool same_net(const struct sockaddr *ip1,
+	      const struct sockaddr *ip2,
+	      const struct sockaddr *mask);
+bool sockaddr_equal(const struct sockaddr *ip1,
+		    const struct sockaddr *ip2);
+
+/* Socket address/port query and formatting helpers. */
+uint16_t get_sockaddr_port(const struct sockaddr_storage *pss);
+char *print_sockaddr_len(char *dest,
+			 size_t destlen,
+			 const struct sockaddr *psa,
+			 socklen_t psalen);
+char *print_sockaddr(char *dest,
+			size_t destlen,
+			const struct sockaddr_storage *psa);
+char *print_canonical_sockaddr(TALLOC_CTX *ctx,
+			const struct sockaddr_storage *pss);
+const char *client_name(int fd);
+int get_socket_port(int fd);
+const char *client_socket_addr(int fd, char *addr, size_t addr_len);
+
+void set_socket_options(int fd, const char *options);
 
 #endif /* _SAMBA_UTIL_NET_H_ */
diff --git a/lib/util/util_paths.c b/lib/util/util_paths.c
new file mode 100644
index 0000000000..0baa6801c5
--- /dev/null
+++ b/lib/util/util_paths.c
@@ -0,0 +1,63 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba utility functions
+   Copyright (C) Andrew Tridgell 1992-1998
+   Copyright (C) Jeremy Allison 2001-2007
+   Copyright (C) Simo Sorce 2001
+   Copyright (C) Jim McDonough <jmcd@us.ibm.com> 2003
+   Copyright (C) James Peach 2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "dynconfig/dynconfig.h"
+
+/**
+ * @brief Returns an absolute path to a file in the Samba modules directory.
+ *
+ * @param mem_ctx talloc context the returned string is allocated on.
+ * @param name File to find, relative to MODULESDIR.
+ * @retval Pointer to a talloc'ed string containing the full path.
+ **/
+
+char *modules_path(TALLOC_CTX *mem_ctx, const char *name)
+{
+	return talloc_asprintf(mem_ctx, "%s/%s", get_dyn_MODULESDIR(), name);
+}
+
+/**
+ * @brief Returns an absolute path to a file in the Samba data directory.
+ *
+ * @param mem_ctx talloc context the returned string is allocated on.
+ * @param name File to find, relative to CODEPAGEDIR.
+ * @retval Pointer to a talloc'ed string containing the full path.
+ **/
+
+char *data_path(TALLOC_CTX *mem_ctx, const char *name)
+{
+	return talloc_asprintf(mem_ctx, "%s/%s", get_dyn_CODEPAGEDIR(), name);
+}
+
+/**
+ * @brief Returns the platform specific shared library extension.
+ *
+ * @retval Pointer to a constant string containing the extension.
+ **/
+
+const char *shlib_ext(void)
+{
+	return get_dyn_SHLIBEXT();
+}
+
diff --git a/lib/util/util_str.c b/lib/util/util_str.c
index cf1b07ff0f..388d7887ef 100644
--- a/lib/util/util_str.c
+++ b/lib/util/util_str.c
@@ -32,87 +32,6 @@
  **/
 
 /**
- Safe string copy into a known length string. maxlength does not
- include the terminating zero.
-**/
-_PUBLIC_ char *safe_strcpy(char *dest,const char *src, size_t maxlength)
-{
-	size_t len;
-
-	if (!dest) {
-		DEBUG(0,("ERROR: NULL dest in safe_strcpy\n"));
-		return NULL;
-	}
-
-#ifdef DEVELOPER
-	/* We intentionally write out at the extremity of the destination
-	 * string.  If the destination is too short (e.g. pstrcpy into mallocd
-	 * or fstring) then this should cause an error under a memory
-	 * checker. */
-	dest[maxlength] = '\0';
-	if (PTR_DIFF(&len, dest) > 0) {  /* check if destination is on the stack, ok if so */
-		log_suspicious_usage("safe_strcpy", src);
-	}
-#endif
-
-	if (!src) {
-		*dest = 0;
-		return dest;
-	}  
-
-	len = strlen(src);
-
-	if (len > maxlength) {
-		DEBUG(0,("ERROR: string overflow by %u (%u - %u) in safe_strcpy [%.50s]\n",
-			 (unsigned int)(len-maxlength), (unsigned)len, (unsigned)maxlength, src));
-		len = maxlength;
-	}
-      
-	memmove(dest, src, len);
-	dest[len] = 0;
-	return dest;
-}  
-
-/**
- Safe string cat into a string. maxlength does not
- include the terminating zero.
-**/
-_PUBLIC_ char *safe_strcat(char *dest, const char *src, size_t maxlength)
-{
-	size_t src_len, dest_len;
-
-	if (!dest) {
-		DEBUG(0,("ERROR: NULL dest in safe_strcat\n"));
-		return NULL;
-	}
-
-	if (!src)
-		return dest;
-	
-#ifdef DEVELOPER
-	if (PTR_DIFF(&src_len, dest) > 0) {  /* check if destination is on the stack, ok if so */
-		log_suspicious_usage("safe_strcat", src);
-	}
-#endif
-	src_len = strlen(src);
-	dest_len = strlen(dest);
-
-	if (src_len + dest_len > maxlength) {
-		DEBUG(0,("ERROR: string overflow by %d in safe_strcat [%.50s]\n",
-			 (int)(src_len + dest_len - maxlength), src));
-		if (maxlength > dest_len) {
-			memcpy(&dest[dest_len], src, maxlength - dest_len);
-		}
-		dest[maxlength] = 0;
-		return NULL;
-	}
-	
-	memcpy(&dest[dest_len], src, src_len);
-	dest[dest_len + src_len] = 0;
-	return dest;
-}
-
-/**
   format a string into length-prefixed dotted domain format, as used in NBT
   and in some ADS structures
 **/
@@ -175,7 +94,7 @@ _PUBLIC_ bool conv_str_bool(const char * str, bool * val)
 /**
  * Convert a size specification like 16K into an integral number of bytes. 
  **/
-_PUBLIC_ bool conv_str_size(const char * str, uint64_t * val)
+_PUBLIC_ bool conv_str_size_error(const char * str, uint64_t * val)
 {
 	char *		    end = NULL;
 	unsigned long long  lval;
@@ -246,6 +165,6 @@ _PUBLIC_ bool strequal(const char *s1, const char *s2)
 	if (!s1 || !s2)
 		return false;
   
-	return strcasecmp(s1,s2) == 0;
+	return strcasecmp_m(s1,s2) == 0;
 }
 
diff --git a/lib/util/util_tdb.c b/lib/util/util_tdb.c
index 4a81678808..02c7095f66 100644
--- a/lib/util/util_tdb.c
+++ b/lib/util/util_tdb.c
@@ -20,7 +20,7 @@
 */
 
 #include "includes.h"
-#include <tdb.h>
+#include "../lib/tdb_compat/tdb_compat.h"
 #include "../lib/util/util_tdb.h"
 
 /* these are little tdb utility functions that are meant to make
@@ -57,7 +57,7 @@ TDB_DATA string_term_tdb_data(const char *string)
 }
 
 /****************************************************************************
- Lock a chain by string. Return -1 if lock failed.
+ Lock a chain by string. Return non-zero if lock failed.
 ****************************************************************************/
 
 int tdb_lock_bystring(struct tdb_context *tdb, const char *keyval)
@@ -79,7 +79,7 @@ void tdb_unlock_bystring(struct tdb_context *tdb, const char *keyval)
 }
 
 /****************************************************************************
- Read lock a chain by string. Return -1 if lock failed.
+ Read lock a chain by string. Return non-zero if lock failed.
 ****************************************************************************/
 
 int tdb_read_lock_bystring(struct tdb_context *tdb, const char *keyval)
@@ -111,7 +111,7 @@ int32_t tdb_fetch_int32_byblob(struct tdb_context *tdb, TDB_DATA key)
 	TDB_DATA data;
 	int32_t ret;
 
-	data = tdb_fetch(tdb, key);
+	data = tdb_fetch_compat(tdb, key);
 	if (!data.dptr || data.dsize != sizeof(int32_t)) {
 		SAFE_FREE(data.dptr);
 		return -1;
@@ -133,7 +133,7 @@ int32_t tdb_fetch_int32(struct tdb_context *tdb, const char *keystr)
 }
 
 /****************************************************************************
- Store a int32_t value by an arbitrary blob key, return 0 on success, -1 on failure.
+ Store a int32_t value by an arbitrary blob key, return 0 on success, -ve on failure.
  Input is int32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 
@@ -150,7 +150,7 @@ int tdb_store_int32_byblob(struct tdb_context *tdb, TDB_DATA key, int32_t v)
 }
 
 /****************************************************************************
- Store a int32_t value by string key, return 0 on success, -1 on failure.
+ Store a int32_t value by string key, return 0 on success, -ve on failure.
  Input is int32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 
@@ -168,7 +168,7 @@ bool tdb_fetch_uint32_byblob(struct tdb_context *tdb, TDB_DATA key, uint32_t *va
 {
 	TDB_DATA data;
 
-	data = tdb_fetch(tdb, key);
+	data = tdb_fetch_compat(tdb, key);
 	if (!data.dptr || data.dsize != sizeof(uint32_t)) {
 		SAFE_FREE(data.dptr);
 		return false;
@@ -190,7 +190,7 @@ bool tdb_fetch_uint32(struct tdb_context *tdb, const char *keystr, uint32_t *val
 }
 
 /****************************************************************************
- Store a uint32_t value by an arbitrary blob key, return 0 on success, -1 on failure.
+ Store a uint32_t value by an arbitrary blob key, return true on success, false on failure.
  Input is uint32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 
@@ -204,14 +204,14 @@ bool tdb_store_uint32_byblob(struct tdb_context *tdb, TDB_DATA key, uint32_t val
 	data.dptr = (unsigned char *)&v_store;
 	data.dsize = sizeof(uint32_t);
 
-	if (tdb_store(tdb, key, data, TDB_REPLACE) == -1)
+	if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
 		ret = false;
 
 	return ret;
 }
 
 /****************************************************************************
- Store a uint32_t value by string key, return 0 on success, -1 on failure.
+ Store a uint32_t value by string key, return true on success, false on failure.
  Input is uint32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 
@@ -220,7 +220,7 @@ bool tdb_store_uint32(struct tdb_context *tdb, const char *keystr, uint32_t valu
 	return tdb_store_uint32_byblob(tdb, string_term_tdb_data(keystr), value);
 }
 /****************************************************************************
- Store a buffer by a null terminated string key.  Return 0 on success, -1
+ Store a buffer by a null terminated string key.  Return 0 on success, -ve
  on failure.
 ****************************************************************************/
 
@@ -240,7 +240,7 @@ TDB_DATA tdb_fetch_bystring(struct tdb_context *tdb, const char *keystr)
 {
 	TDB_DATA key = string_term_tdb_data(keystr);
 
-	return tdb_fetch(tdb, key);
+	return tdb_fetch_compat(tdb, key);
 }
 
 /****************************************************************************
@@ -263,7 +263,7 @@ int32_t tdb_change_int32_atomic(struct tdb_context *tdb, const char *keystr, int
 	int32_t val;
 	int32_t ret = -1;
 
-	if (tdb_lock_bystring(tdb, keystr) == -1)
+	if (tdb_lock_bystring(tdb, keystr) != 0)
 		return -1;
 
 	if ((val = tdb_fetch_int32(tdb, keystr)) == -1) {
@@ -284,7 +284,7 @@ int32_t tdb_change_int32_atomic(struct tdb_context *tdb, const char *keystr, int
 	/* Increment value for storage and return next time */
 	val += change_val;
 		
-	if (tdb_store_int32(tdb, keystr, val) == -1)
+	if (tdb_store_int32(tdb, keystr, val) != 0)
 		goto err_out;
 
 	ret = 0;
@@ -304,7 +304,7 @@ bool tdb_change_uint32_atomic(struct tdb_context *tdb, const char *keystr, uint3
 	uint32_t val;
 	bool ret = false;
 
-	if (tdb_lock_bystring(tdb, keystr) == -1)
+	if (tdb_lock_bystring(tdb, keystr) != 0)
 		return false;
 
 	if (!tdb_fetch_uint32(tdb, keystr, &val)) {
diff --git a/lib/util/util_tdb.h b/lib/util/util_tdb.h
index d2f6648462..2d805d7d20 100644
--- a/lib/util/util_tdb.h
+++ b/lib/util/util_tdb.h
@@ -1,7 +1,27 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   tdb utility functions
+
+   Copyright (C) Andrew Tridgell 1992-2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 #ifndef _____LIB_UTIL_UTIL_TDB_H__
 #define _____LIB_UTIL_UTIL_TDB_H__
 
-
 /***************************************************************
  Make a TDB_DATA and keep the const warning in one place
 ****************************************************************/
@@ -11,7 +31,7 @@ TDB_DATA string_tdb_data(const char *string);
 TDB_DATA string_term_tdb_data(const char *string);
 
 /****************************************************************************
- Lock a chain by string. Return -1 if lock failed.
+ Lock a chain by string. Return non-zero if lock failed.
 ****************************************************************************/
 int tdb_lock_bystring(struct tdb_context *tdb, const char *keyval);
 
@@ -21,7 +41,7 @@ int tdb_lock_bystring(struct tdb_context *tdb, const char *keyval);
 void tdb_unlock_bystring(struct tdb_context *tdb, const char *keyval);
 
 /****************************************************************************
- Read lock a chain by string. Return -1 if lock failed.
+ Read lock a chain by string. Return non-zero if lock failed.
 ****************************************************************************/
 int tdb_read_lock_bystring(struct tdb_context *tdb, const char *keyval);
 
@@ -43,13 +63,13 @@ int32_t tdb_fetch_int32_byblob(struct tdb_context *tdb, TDB_DATA key);
 int32_t tdb_fetch_int32(struct tdb_context *tdb, const char *keystr);
 
 /****************************************************************************
- Store a int32_t value by an arbitrary blob key, return 0 on success, -1 on failure.
+ Store a int32_t value by an arbitrary blob key, return 0 on success, -ve on failure.
  Input is int32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 int tdb_store_int32_byblob(struct tdb_context *tdb, TDB_DATA key, int32_t v);
 
 /****************************************************************************
- Store a int32_t value by string key, return 0 on success, -1 on failure.
+ Store a int32_t value by string key, return 0 on success, -ve on failure.
  Input is int32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 int tdb_store_int32(struct tdb_context *tdb, const char *keystr, int32_t v);
@@ -67,19 +87,19 @@ bool tdb_fetch_uint32_byblob(struct tdb_context *tdb, TDB_DATA key, uint32_t *va
 bool tdb_fetch_uint32(struct tdb_context *tdb, const char *keystr, uint32_t *value);
 
 /****************************************************************************
- Store a uint32_t value by an arbitrary blob key, return 0 on success, -1 on failure.
+ Store a uint32_t value by an arbitrary blob key, return true on success, false on failure.
  Input is uint32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 bool tdb_store_uint32_byblob(struct tdb_context *tdb, TDB_DATA key, uint32_t value);
 
 /****************************************************************************
- Store a uint32_t value by string key, return 0 on success, -1 on failure.
+ Store a uint32_t value by string key, return true on success, false on failure.
  Input is uint32_t in native byte order. Output in tdb is in little-endian.
 ****************************************************************************/
 bool tdb_store_uint32(struct tdb_context *tdb, const char *keystr, uint32_t value);
 
 /****************************************************************************
- Store a buffer by a null terminated string key.  Return 0 on success, -1
+ Store a buffer by a null terminated string key.  Return 0 on success, -ve
  on failure.
 ****************************************************************************/
 int tdb_store_bystring(struct tdb_context *tdb, const char *keystr, TDB_DATA data, int flags);
@@ -91,7 +111,7 @@ int tdb_store_bystring(struct tdb_context *tdb, const char *keystr, TDB_DATA dat
 TDB_DATA tdb_fetch_bystring(struct tdb_context *tdb, const char *keystr);
 
 /****************************************************************************
- Delete an entry using a null terminated string key. 
+ Delete an entry using a null terminated string key.  0 on success, -ve on err.
 ****************************************************************************/
 int tdb_delete_bystring(struct tdb_context *tdb, const char *keystr);
 
diff --git a/lib/util/wrap_xattr.h b/lib/util/wrap_xattr.h
index 64b28d250c..745b93d764 100644
--- a/lib/util/wrap_xattr.h
+++ b/lib/util/wrap_xattr.h
@@ -1,3 +1,24 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   POSIX NTVFS backend - xattr support using filesystem xattrs
+
+   Copyright (C) Andrew Tridgell 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 #ifndef __LIB_UTIL_WRAP_XATTR_H__
 #define __LIB_UTIL_WRAP_XATTR_H__
 
diff --git a/lib/util/wscript_build b/lib/util/wscript_build
index aad386ef2a..bdc9d10150 100755
--- a/lib/util/wscript_build
+++ b/lib/util/wscript_build
@@ -1,69 +1,25 @@
 #!/usr/bin/env python
 
-common_util_sources = '''talloc_stack.c smb_threads.c xfile.c data_blob.c
+bld.SAMBA_LIBRARY('samba-util',
+                  source='''talloc_stack.c smb_threads.c xfile.c data_blob.c
                     util_file.c time.c rbtree.c rfc1738.c select.c
                     genrand.c fsusage.c blocking.c become_daemon.c
                     signal.c system.c params.c util.c util_id.c util_net.c
-                    util_strlist.c idtree.c debug.c fault.c base64.c
-                    util_str_common.c'''
-
-common_util_headers = 'debug.h'
-common_util_public_deps = 'talloc pthread LIBCRYPTO'
-s4_util_sources = '''dprintf.c ms_fnmatch.c parmlist.c substitute.c util_str.c'''
-s4_util_deps = 'DYNCONFIG'
-s4_util_public_deps = 'talloc CHARSET execinfo uid_wrapper'
-s4_util_public_headers = 'attr.h byteorder.h data_blob.h memory.h safe_string.h time.h talloc_stack.h xfile.h dlinklist.h util.h'
-s4_util_header_path = [ ('dlinklist.h util.h', '.'), ('*', 'util') ]
-
-if bld.env.enable_s3build or bld.env._SAMBA_BUILD_ == 3:
-    # as we move files into common between samba-util and samba-util3, move them here.
-    # Both samba-util and samba-util3 depend on this private library
-    bld.SAMBA_LIBRARY('samba-util-common',
-                    source=common_util_sources,
-                    public_deps=common_util_public_deps,
-                    # until we get all the dependencies in this library in common
-                    # we need to allow this library to be built with unresolved symbols
-                    allow_undefined_symbols=True,
-                    local_include=False,
-                    public_headers=common_util_headers,
-                    header_path= [('*', 'util') ],
-                    private_library=True
-                    )
-
-    if bld.env._SAMBA_BUILD_ == 4:
-        bld.SAMBA_LIBRARY('samba-util',
-                        source=s4_util_sources,
-                        deps=s4_util_deps + ' samba-util-common',
-                        public_deps=s4_util_public_deps,
-                        public_headers=s4_util_public_headers,
-                        header_path= s4_util_header_path,
-                        local_include=False,
-                        vnum='0.0.1',
-                        pc_files='samba-util.pc'
-                        )
-
-else:
-    if bld.env._SAMBA_BUILD_ == 4:
-        bld.SAMBA_LIBRARY('samba-util',
-                        source=s4_util_sources + " " + common_util_sources,
-                        deps=s4_util_deps,
-                        public_deps=s4_util_public_deps + ' ' + common_util_public_deps,
-                        public_headers=s4_util_public_headers + ' ' + common_util_headers,
-                        header_path= s4_util_header_path,
-                        local_include=False,
-                        vnum='0.0.1',
-                        pc_files='samba-util.pc'
-                        )
-
-    # dummy subsystem for avoid wider deps changes.
-    bld.SAMBA_SUBSYSTEM('samba-util-common',
-                        source=[],
-                        deps='samba-util',
-                        local_include=False,)
+                    util_strlist.c util_paths.c idtree.c debug.c fault.c base64.c
+                    util_str.c util_str_common.c substitute.c ms_fnmatch.c
+                    server_id.c dprintf.c parmlist.c''',
+                  deps='DYNCONFIG',
+                  public_deps='talloc execinfo uid_wrapper pthread LIBCRYPTO CHARSET',
+                  public_headers='debug.h attr.h byteorder.h data_blob.h memory.h safe_string.h time.h talloc_stack.h xfile.h dlinklist.h util.h string_wrappers.h',
+                  header_path= [ ('dlinklist.h util.h', '.'), ('*', 'util') ],
+                  local_include=False,
+                  vnum='0.0.1',
+                  pc_files='samba-util.pc'
+                  )
 
 bld.SAMBA_LIBRARY('asn1util',
                   source='asn1.c',
-                  deps='talloc samba-util-common',
+                  deps='talloc samba-util',
                   private_library=True,
                   local_include=False)
 
@@ -88,7 +44,7 @@ bld.SAMBA_LIBRARY('wrap_xattr',
 bld.SAMBA_LIBRARY('UTIL_TDB',
 	source='util_tdb.c',
 	local_include=False,
-	public_deps='tdb talloc',
+	public_deps='tdb_compat talloc',
                   private_library=True
 	)
 
@@ -121,3 +77,12 @@ bld.SAMBA_SUBSYSTEM('UTIL_PW',
 	local_include=False,
 	public_deps='talloc'
 	)
+
+
+bld.SAMBA_LIBRARY('tdb-wrap',
+                  source='tdb_wrap.c',
+                  deps='tdb_compat talloc samba-util',
+                  private_library=True,
+                  local_include=False
+                  )
+